def test_guess_language_name_untrained_model():
    """An untrained custom model must refuse to guess a language."""
    with tempfile.TemporaryDirectory() as empty_model_dir:
        untrained = Guess(empty_model_dir)
        assert not untrained.is_trained

        # Guessing without a trained model is reported as a GuesslangError.
        with pytest.raises(GuesslangError):
            untrained.language_name(C_CODE)
def test_guess_train_with_default_model():
    """Training on top of the default model must raise GuesslangError."""
    default_guess = Guess()
    with tempfile.TemporaryDirectory() as training_dir:
        _create_training_files(training_dir)

        with pytest.raises(GuesslangError):
            default_guess.train(training_dir, max_steps=10)
def test_guess_train_without_subdirectories():
    """Training from an empty source directory must raise GuesslangError."""
    with tempfile.TemporaryDirectory() as custom_model_dir:
        custom_guess = Guess(custom_model_dir)

        # The source directory is left empty on purpose.
        with tempfile.TemporaryDirectory() as empty_sources_dir, \
                pytest.raises(GuesslangError):
            custom_guess.train(empty_sources_dir, max_steps=10)
def test_guess_probabilities():
    """Every supported language gets a score in [0, 1]; Python ranks first."""
    guesser = Guess()
    ranked = guesser.probabilities(PYTHON_CODE)
    known_languages = guesser.supported_languages
    assert len(ranked) == len(known_languages)

    for name, score in ranked:
        assert name in guesser.supported_languages
        assert 0 <= score <= 1

    # The first entry is expected to be the best match.
    best_match = ranked[0][0]
    assert best_match == 'Python'
def is_source_code(data):
    """Tell whether ``data`` is detected as one of the C-similar languages.

    :param data: text to inspect.
    :return: ``True`` when the guessed language equals an entry of
        ``C_SIMILAR_LANG``; ``False`` when the text is shorter than
        ``SOURCE_CODE_MIN_LEN``, when no entry matches, or when guessing
        fails with ``GuesslangError`` (the error is printed in red).
    """
    if len(data) < SOURCE_CODE_MIN_LEN:
        return False

    # Only the guess itself can raise GuesslangError; keep the try minimal.
    try:
        lang = Guess().language_name(data)
    except GuesslangError as e:
        print(color.red(e))
        # Previously fell off the end and returned None; always return a bool.
        return False

    return any(lang == candidate for candidate in C_SIMILAR_LANG)
def scan_plain(crawler, url, text):
    '''CPUTASK
    Scan text/plain content for a known language.

    Guesses the probable languages of ``text``. If "Javascript" is among
    them the text is tried as JSON; if "Python" is among them the text is
    tried as a Python literal, and when that literal is a dict every value
    yielded by ``nested_find('url', ...)`` is passed to
    ``crawler.check_link(url, link)``.

    :param crawler: object exposing ``check_link(url, link)``.
    :param url: url the text was fetched from.
    :param text: the plain text content.
    '''
    # Logging arguments are passed separately (not pre-formatted with %):
    # a value that is itself a tuple would otherwise be unpacked by the %
    # operator and raise TypeError.
    LOGGER.info('Checking plain text content from url: %s', url)
    langnames = Guess().probable_languages(text)
    LOGGER.info('Guessed langnames from url: %s %s', url, langnames)

    if 'Javascript' in langnames:
        try:
            json_from_plain = json.loads(text)
        except Exception:
            # Best effort: a failed parse is only logged.
            json_from_plain = None
            LOGGER.error('JSON evaluation failed on : %s', text)
        if json_from_plain:
            LOGGER.info('Got JSON from plain: %s', json_from_plain)

    if 'Python' in langnames:
        try:
            # literal_eval only accepts literals, unlike eval().
            python_from_plain = builtin_ast.literal_eval(text)
        except Exception:
            # Best effort: a failed parse is only logged.
            LOGGER.error('Python evaluation failed on : %s', text)
            python_from_plain = None
        if python_from_plain:
            LOGGER.info('Got Python from plain: %s', python_from_plain)
            if isinstance(python_from_plain, dict):
                urls = list(nested_find('url', python_from_plain))
                LOGGER.info('Got urls from python dict: %s', urls)
                for link in urls:
                    crawler.check_link(url, link)
def detect_filetype():
    """Guess the language of the current Vim buffer and set its filetype.

    The ``Guess`` instance is created on first use and kept in the
    module-level ``guess`` variable. A guessed name found in ``lang_map``
    is replaced by its mapped value; the result is lower-cased and applied
    with ``set filetype=``. Nothing is changed when no language is guessed.
    """
    global guess
    if guess is None:
        from guesslang import Guess
        guess = Guess()

    # Join the buffer lines with newlines so the guesser sees the real line
    # structure instead of one run-together line.
    text = "\n".join(vim.current.buffer)

    language = guess.language_name(text)
    if language is None:
        # No language detected: leave the current filetype alone rather
        # than setting it to the literal string "none".
        return

    ft = str(language)
    if ft in lang_map:
        ft = lang_map[ft]
    ft = ft.lower()

    vim.command("set filetype=" + ft)
    vim.command("echon ', set filetype=" + ft + "'")
def guess_language(code_value):
    """Guess the language of ``code_value`` and send it to a channel group.

    The guessed name is sent to the "snippet_snippet" group as a
    "code_language" event.
    """
    print("code here")
    detected = Guess().language_name(code_value)
    event = {"type": "code_language", "message": detected}

    # group_send is asynchronous; wrap it so it can be called synchronously.
    send_to_group = async_to_sync(get_channel_layer().group_send)
    send_to_group("snippet_snippet", event)
def __init__(self, bot):
    """Store the bot and prepare the objects this instance works with.

    :param bot: bot object; used here for ``get_guild`` and ``get_channel``.
    """
    self.bot = bot
    # Guild, role and channel are looked up once from ids in ``constants``.
    self.guild = bot.get_guild(constants.tortoise_guild_id)
    self.session = aiohttp.ClientSession()
    self.banned_words = ConfigHandler("banned_words.json")
    self.trusted = self.guild.get_role(constants.trusted_role_id)
    self.log_channel = bot.get_channel(constants.bot_log_channel_id)
    # One language guesser instance is kept on the object.
    self.guess_language = Guess()
def detectionLang(listOfFiles):
    """Map each file to the output of the ``guesslang`` command line tool.

    Takes the list of files and returns a dict ``file: programming language``.
    Empty files and files whose content consists only of the characters
    '0' and/or '1' are skipped.

    :param listOfFiles: iterable of file paths.
    :return: dict mapping each retained path to the decoded (UTF-8) output
        of ``guesslang <path>``.
    :raises subprocess.CalledProcessError: if the ``guesslang`` command
        exits with a non-zero status.
    """
    binary_chars = {'0', '1'}
    files_languages = {}
    for path in listOfFiles:
        if os.stat(path).st_size == 0:
            continue

        # Read the content once: a second read() on the same handle returns
        # '' and would make the all-'0' / all-'1' checks always pass.
        with open(path, 'r', encoding='latin1', errors='ignore') as f:
            content = f.read()

        # Covers {'0'}, {'1'} and {'0', '1'} (and an empty decode result).
        if set(content) <= binary_chars:
            continue

        files_languages[path] = subprocess.check_output(
            ['guesslang', path]).decode("utf-8")
    return files_languages
def login_page():
    """Guess the language of the posted code and return it as JSON.

    Expects a POST request whose JSON body has a ``code`` entry. The
    guessed language name is translated through ``lang_dict``.

    :return: ``{'language': ...}`` on success; an error JSON with status
        405 for non-POST requests; an error JSON with status 500 when
        anything fails while handling the request.
    """
    if request.method != "POST":
        # Previously fell through and returned None, which is not a valid
        # view response.
        return jsonify({'status': "Error",
                        'message': "Method not allowed"}), 405

    try:
        # Named ``payload`` so the local does not shadow the json module name.
        payload = request.get_json(silent=True)
        code = payload['code']
        name = Guess().language_name(code)
        language = lang_dict[name]
        return jsonify({'language': language})
    except Exception:
        # Top-level boundary: missing body, missing key or an unknown
        # language all end up here.
        return jsonify({'status': "Error",
                        'message': "Some error occurred!"}), 500
def test_guess_train():
    """A freshly trained custom model recognises the sample snippets."""
    with tempfile.TemporaryDirectory() as custom_model_dir:
        trained = Guess(custom_model_dir)

        with tempfile.TemporaryDirectory() as training_dir:
            _create_training_files(training_dir)
            trained.train(training_dir, max_steps=10)

        # Check each sample against the language it is written in.
        python_guess = trained.language_name(PYTHON_CODE)
        assert python_guess == 'Python'
        c_guess = trained.language_name(C_CODE)
        assert c_guess == 'C'
def __init__(self,
             api_key: str,
             lang: Mapping[str, str],
             theme_image_ids: tuple[str],
             keyboards: Mapping[str, InlineKeyboardMarkup],
             guesslang_syntaxes: Mapping[str, str],
             *args: Any,
             admin_chat_id: Optional[str] = None,
             db_path: str = str(
                 local.path(__file__).up() / 'user_themes.sqlite'),
             **kwargs: Any):
    """Store the configuration and create the bot, theme store and guesser.

    :param api_key: passed to ``TeleBot`` as its first argument.
    :param lang: mapping of message keys to text.
    :param theme_image_ids: image ids stored as-is on the instance.
        NOTE(review): ``tuple[str]`` denotes a 1-tuple; ``tuple[str, ...]``
        may be what is meant — confirm.
    :param keyboards: mapping of keyboard names to markup objects.
    :param guesslang_syntaxes: mapping stored as-is on the instance.
    :param args: extra positional arguments forwarded to ``TeleBot``.
    :param admin_chat_id: optional chat id stored on the instance.
    :param db_path: database path; defaults to ``user_themes.sqlite``
        next to this file.
    :param kwargs: extra keyword arguments forwarded to ``TeleBot``.
    """
    self.lang = lang
    self.theme_image_ids = theme_image_ids
    self.kb = keyboards
    self.guesslang_syntaxes = guesslang_syntaxes
    self.admin_chat_id = admin_chat_id
    self.db_path = db_path
    # Key/value store with an integer primary key and a text value.
    self.user_themes = KeyValue(key_field=IntegerField(primary_key=True),
                                value_field=CharField(),
                                database=APSWDatabase(db_path))
    self.log = mk_logger()
    self.bot = TeleBot(api_key, *args, **kwargs)
    # Handlers are registered right after the bot object exists.
    self.register_handlers()
    self.guesser = Guess()
def fxn():
    """Emit a ``DeprecationWarning`` with the message "deprecated"."""
    import warnings
    warnings.warn("deprecated", DeprecationWarning)


import warnings
warnings.filterwarnings('ignore')
from guesslang import Guess

# Read the whole Java source; the ``with`` block closes the file, so no
# explicit close() is needed.
with open('Test1.java') as e:
    z = e.read()

# Guess and print the language of the file content.
name = Guess().language_name(z)
print(name)
def test_guess_supported_languages():
    """The default model supports at least 30 languages, incl. Python and C."""
    languages = Guess().supported_languages
    assert len(languages) >= 30
    for expected in ('Python', 'C'):
        assert expected in languages
from guesslang import Guess guess = Guess() name = guess.language_name(""" RuleSet{ name={} readOnly={0} origin={} global={0} comment={} objrenamed={0} baseid={0} incid={1} featureLevel={16} useAppRules={0} id={} transobj={7.1.1.1} creator={} localCascade={0} allowRID={0} allowAppRules={0} prefixmatch={ } rulesettype={} loadsets={} name={} readOnly={0} origin={} global={0} comment={} netprefixobj={
import os
import uuid
from guesslang import Guess
import requests
from app.event_handlers.stylecheck.linters import *

# Folder name used for temporary files.
FILE_FOLDER = "./temp_files"
# The two lists below are index-aligned: "c" <-> "C", "cpp" <-> "C++",
# "py" <-> "Python". The mappings are built by zipping them together.
SUPPORTED_LANGUAGES = ["C", "C++", "Python"]
SUPPORTED_FILE_TYPES = ["c", "cpp", "py"]
TYPE_TO_LANGUAGE_MAPPING = dict(zip(SUPPORTED_FILE_TYPES, SUPPORTED_LANGUAGES))
LANGUAGE_TO_TYPE_MAPPING = dict(zip(SUPPORTED_LANGUAGES, SUPPORTED_FILE_TYPES))
# Guesser instance created once at import time.
language_guess = Guess()


class CodeEntry:
    """A piece of code together with its language and file path."""

    def __init__(self, language, code, file_path):
        """Store ``language``, ``code`` and ``file_path`` on the instance."""
        self.language = language
        self.code = code
        self.file_path = file_path

    def run_linting(self):
        """
        Run the linter matching this entry's language.

        The first two supported languages ("C", "C++") go to ``run_c_lint``
        and the third ("Python") to ``run_py_lint``. Any other language
        falls through and returns ``None``.

        :return: Linting Result
        """
        if self.language in SUPPORTED_LANGUAGES[:2]:
            return run_c_lint(self)
        elif self.language == SUPPORTED_LANGUAGES[2]:
            return run_py_lint(self)
def Analytics(request):
    """Render the analytics page with keyword text and language counts.

    Builds three things for the ``blog/analytics.html`` template:

    * ``text``: the ranked key phrases of every post title, concatenated;
      a phrase with score <= 3 is repeated twice, any other phrase
      ``int(score)`` times.
    * ``answered`` / ``total``: number of distinct posts that have at
      least one comment, and total number of posts.
    * ``l1..l5`` / ``v1..v5``: the five most frequent guessed languages of
      the post contents and their counts. When fewer than five languages
      are found the missing slots are filled with ``''`` and ``0``
      (previously this raised ``IndexError``).

    :param request: the incoming HTTP request.
    :return: the rendered template response.
    """
    posts = Post.objects.all()
    total = Post.objects.all().count()

    # Count each post once, however many comments it has.
    answered = len({comment.post.id for comment in Comments.objects.all()})

    # Build the keyword text from the post titles.
    r = Rake()
    parts = []
    for post in posts:
        r.extract_keywords_from_text(post.title)
        for ranked in r.get_ranked_phrases_with_scores():
            score, phrase = ranked[0], ranked[-1]
            repeat = 2 if score <= 3 else int(score)
            parts.append(phrase * repeat)
    text = ''.join(parts)

    # Count the guessed language of every post body.
    guess = Guess()
    lang = {}
    for post in posts:
        language = guess.language_name(post.content)
        lang[language] = lang.get(language, 0) + 1

    sort_langs = sorted(lang.items(), key=lambda x: x[1], reverse=True)
    langlabels = [str(label) for label, _ in sort_langs][:5]
    # The full list of counts is handed to the template, as before.
    langvals = [int(count) for _, count in sort_langs]

    # Pad to exactly five entries so the fixed template slots always exist.
    top_labels = langlabels + [''] * (5 - len(langlabels))
    top_vals = (langvals + [0] * 5)[:5]
    l1, l2, l3, l4, l5 = top_labels
    v1, v2, v3, v4, v5 = top_vals

    return render(
        request, 'blog/analytics.html', {
            'text': text,
            'langvals': langvals,
            'l1': l1,
            'l2': l2,
            'l3': l3,
            'l4': l4,
            'l5': l5,
            'v1': v1,
            'v2': v2,
            'v3': v3,
            'v4': v4,
            'v5': v5,
            'total': total,
            'answered': answered
        })


# def some_view(request):
#     # return render_to_response('../../../VidyoConnector/js/VidyoConnector.html')
#     # return render(request, 'blog/test/VidyoConnector.html')
from collections import Counter


def word_count(fname):
    """Return a ``Counter`` of the whitespace-separated words in ``fname``."""
    with open(fname) as f:
        return Counter(f.read().split())


print("Frequency of words in the file: ", word_count("testfile.txt"))

from guesslang import Guess

# language_name() analyses the text it is given, so pass the file's content
# rather than its name.
with open("testo.py") as source_file:
    name = Guess().language_name(source_file.read())
print(name)
class ColorCodeBot:
    """Bot built on ``TeleBot`` that replies to text snippets with rendered
    HTML and PNG versions, with a per-user theme and a guessed syntax.
    """

    def __init__(self,
                 api_key: str,
                 lang: Mapping[str, str],
                 theme_image_ids: tuple[str],
                 keyboards: Mapping[str, InlineKeyboardMarkup],
                 guesslang_syntaxes: Mapping[str, str],
                 *args: Any,
                 admin_chat_id: Optional[str] = None,
                 db_path: str = str(
                     local.path(__file__).up() / 'user_themes.sqlite'),
                 **kwargs: Any):
        """Store the configuration and create the bot, theme store and guesser.

        ``*args`` and ``**kwargs`` are forwarded to ``TeleBot``.
        NOTE(review): ``tuple[str]`` denotes a 1-tuple; ``tuple[str, ...]``
        may be what is meant — confirm.
        """
        self.lang = lang
        self.theme_image_ids = theme_image_ids
        self.kb = keyboards
        self.guesslang_syntaxes = guesslang_syntaxes
        self.admin_chat_id = admin_chat_id
        self.db_path = db_path
        # Key/value store with an integer primary key and a text value;
        # the other methods index it by user id.
        self.user_themes = KeyValue(key_field=IntegerField(primary_key=True),
                                    value_field=CharField(),
                                    database=APSWDatabase(db_path))
        self.log = mk_logger()
        self.bot = TeleBot(api_key, *args, **kwargs)
        self.register_handlers()
        self.guesser = Guess()

    def register_handlers(self):
        """Wrap the handler methods with the bot's registration decorators.

        Each attribute is rebound to whatever the decorator returns.
        """
        self.welcome = self.bot.message_handler(commands=['start', 'help'])(
            self.welcome)
        self.browse_themes = self.bot.message_handler(
            commands=['theme', 'themes'])(self.browse_themes)
        self.mk_theme_previews = self.bot.message_handler(
            commands=['previews'])(self.mk_theme_previews)
        self.intake_snippet = self.bot.message_handler(
            func=lambda m: m.content_type == 'text')(self.intake_snippet)
        self.recv_photo = self.bot.message_handler(content_types=['photo'])(
            self.recv_photo)
        # Callback queries are routed on the 'action' entry of their data.
        self.restore_kb = self.bot.callback_query_handler(
            lambda q: yload(q.data)['action'] == 'restore')(self.restore_kb)
        self.set_snippet_filetype = self.bot.callback_query_handler(
            lambda q: yload(q.data)['action'] == 'set ext')(
                self.set_snippet_filetype)
        self.set_theme = self.bot.callback_query_handler(
            lambda q: yload(q.data)['action'] == 'set theme')(self.set_theme)
        # The "img " inline handler is registered before the catch-all one.
        self.send_photo_elsewhere = self.bot.inline_handler(
            lambda q: q.query.startswith("img "))(self.send_photo_elsewhere)
        self.switch_from_inline = self.bot.inline_handler(lambda q: True)(
            self.switch_from_inline)

    @retry
    def switch_from_inline(self, inline_query: InlineQuery):
        """Answer an inline query with no results and a switch-to-PM prompt."""
        self.log.msg("receiving inline query",
                     user_id=inline_query.from_user.id,
                     user_first_name=inline_query.from_user.first_name,
                     query=inline_query.query)
        self.bot.answer_inline_query(
            inline_query.id, [],
            switch_pm_text=self.lang['switch to direct'],
            switch_pm_parameter='x')

    @retry
    def welcome(self, message: Message):
        """Reply to ``message`` with the welcome text and a forced reply."""
        self.log.msg("introducing myself",
                     user_id=message.from_user.id,
                     user_first_name=message.from_user.first_name,
                     chat_id=message.chat.id)
        self.bot.reply_to(
            message,
            self.lang['welcome'],
            reply_markup=ForceReply(
                input_field_placeholder=self.lang['input field placeholder']))

    @retry
    def mk_theme_previews(self, message: Message):
        """Send one rendered sample image per theme button.

        Only runs when the message comes from ``admin_chat_id``; any other
        chat is logged and ignored.
        """
        if not self.admin_chat_id or str(
                message.chat.id) != self.admin_chat_id:
            self.log.msg("naughty preview attempt",
                         user_id=message.from_user.id,
                         user_first_name=message.from_user.first_name,
                         chat_id=message.chat.id,
                         admin_chat_id=self.admin_chat_id)
            return
        sample_code = dedent("""
            # palinDay :: Int -> [ISO Date]
            def palinDay(y):
                '''A possibly empty list containing the palindromic
                   date for the given year, if such a date exists.
                '''
                s = str(y)
                r = s[::-1]
                iso = '-'.join([s, r[0:2], r[2:]])
                try:
                    datetime.strptime(iso, '%Y-%m-%d')
                    return [iso]
                except ValueError:
                    return []
        """)
        for button in chain.from_iterable(self.kb['theme'].keyboard):
            theme = button.text
            # The theme name is prepended to the sample as a comment line.
            html = mk_html(f"# {theme}{sample_code}", 'py', theme)
            with local.tempdir() as folder:
                png_path = mk_png(html, folder=folder)
                send_image(bot=self.bot,
                           chat_id=message.chat.id,
                           png_path=png_path,
                           reply_msg_id=message.message_id)

    @retry
    def browse_themes(self, message: Message):
        """Send the theme images in groups of ten, then the theme keyboard."""
        self.log.msg("browsing themes",
                     user_id=message.from_user.id,
                     user_first_name=message.from_user.first_name,
                     chat_id=message.chat.id)
        # Slice the image ids into chunks of at most 10.
        albums = [
            self.theme_image_ids[i:i + 10]
            for i in range(0, len(self.theme_image_ids), 10)
        ]
        for album in albums:
            self.bot.send_media_group(message.chat.id,
                                      map(InputMediaPhoto, album),
                                      reply_to_message_id=message.message_id)
        self.bot.reply_to(message,
                          self.lang['select theme'],
                          reply_markup=self.kb['theme'])

    @retry
    def set_theme(self, cb_query: CallbackQuery):
        """Store the theme chosen in ``cb_query`` for the requesting user.

        The user is taken from the message that the keyboard message
        replies to. When ``admin_chat_id`` is set, the database file is
        also sent to that chat.
        """
        data = yload(cb_query.data)
        user = cb_query.message.reply_to_message.from_user
        self.log.msg("setting theme",
                     user_id=user.id,
                     user_first_name=user.first_name,
                     theme=data['theme'],
                     chat_id=cb_query.message.chat.id)
        self.bot.edit_message_reply_markup(cb_query.message.chat.id,
                                           cb_query.message.message_id,
                                           reply_markup=minikb('theme'))
        self.user_themes[user.id] = data['theme']
        self.bot.answer_callback_query(
            cb_query.id,
            text=self.lang['acknowledge theme'].format(data['theme']))
        if self.admin_chat_id:
            with open(self.db_path, 'rb') as doc:
                self.bot.send_document(self.admin_chat_id, doc)

    def guess_ext(self,
                  code: str,
                  probability_min: float = .12) -> Optional[str]:
        """Guess a file extension for ``code``.

        Takes the first ``(syntax, probability)`` pair from the guesser and
        returns its mapped extension when the probability reaches
        ``probability_min`` (the mapped value may be ``None`` if the syntax
        is not in ``guesslang_syntaxes``). Otherwise falls back to prefix
        checks and returns ``None`` when no prefix matches.
        """
        syntax, probability = self.guesser.probabilities(code)[0]
        ext = self.guesslang_syntaxes.get(syntax)
        self.log.msg("guessed syntax",
                     probability_min=probability_min,
                     probability=probability,
                     syntax=syntax,
                     ext=ext)
        if probability >= probability_min:
            return ext
        # Prefixes are tried in insertion order, so the longer '[[' and
        # '<?php' are tested before the shorter '[' and '<'.
        for start, ext in {
                '{': 'json',
                '---\n': 'yaml',
                '[[': 'toml',
                '[': 'ini',
                '<?php': 'php',
                '<': 'xml',
                '-- ': 'lua'
        }.items():
            if code.startswith(start):
                return ext

    @retry
    def intake_snippet(self, message: Message):
        """Handle a text message as a code snippet.

        When an extension can be guessed, it is announced and the snippet
        is colorized right away; otherwise the syntax keyboard is shown.
        """
        self.log.msg("receiving code",
                     user_id=message.from_user.id,
                     user_first_name=message.from_user.first_name,
                     chat_id=message.chat.id)
        ext = self.guess_ext(message.text)
        if ext:
            kb_msg = self.bot.reply_to(
                message,
                f"{self.lang['query ext']}\n\n{self.lang['guessed syntax'].format(ext)}",
                reply_markup=minikb('syntax', self.lang['syntax picker']),
                parse_mode='Markdown',
                disable_web_page_preview=True)
            # No callback query here: pass the keyboard message and the
            # guessed extension explicitly.
            self.set_snippet_filetype(cb_query=None,
                                      query_message=kb_msg,
                                      ext=ext)
        else:
            self.bot.reply_to(message,
                              self.lang['query ext'],
                              reply_markup=self.kb['syntax'],
                              parse_mode='Markdown',
                              disable_web_page_preview=True)

    @retry
    def send_photo_elsewhere(self, inline_query: InlineQuery):
        """Answer an "img <file_id>" inline query with that cached photo."""
        # Everything after the first "img " is the file id.
        file_id = inline_query.query.split('img ', 1)[-1]
        self.log.msg("creating inline query result",
                     file_id=file_id,
                     file_info=self.bot.get_file(file_id))
        self.bot.answer_inline_query(inline_query.id, [
            InlineQueryResultCachedPhoto(
                id=str(uuid4()), photo_file_id=file_id, title="Send Image")
        ],
                                     is_personal=True)

    @retry
    def restore_kb(self, cb_query: CallbackQuery):
        """Put back the keyboard named by ``kb_name`` in the callback data."""
        data = yload(cb_query.data)
        self.bot.edit_message_reply_markup(
            cb_query.message.chat.id,
            cb_query.message.message_id,
            reply_markup=self.kb[data['kb_name']])
        self.bot.answer_callback_query(cb_query.id)

    @retry
    def set_snippet_filetype(self,
                             cb_query: Optional[CallbackQuery] = None,
                             query_message: Optional[Message] = None,
                             ext: Optional[str] = None):
        """Colorize the snippet and send it back as HTML and as an image.

        Called either with ``cb_query`` (message and extension are taken
        from it) or with both ``query_message`` and ``ext``.

        :raises Exception: when neither ``cb_query`` nor both of
            ``query_message`` and ``ext`` are given.
        """
        if cb_query:
            query_message = cb_query.message
            ext = yload(cb_query.data)['ext']
        elif not (query_message and ext):
            raise Exception(
                "Either cb_query or both query_message and ext are required")
        self.log.msg("colorizing code",
                     user_id=query_message.reply_to_message.from_user.id,
                     user_first_name=query_message.reply_to_message.from_user.
                     first_name,
                     syntax=ext,
                     chat_id=query_message.chat.id)
        if cb_query:
            self.bot.edit_message_reply_markup(query_message.chat.id,
                                               query_message.message_id,
                                               reply_markup=minikb(
                                                   'syntax',
                                                   self.lang['syntax picker']))
        # The snippet is the message that the keyboard message replies to.
        snippet = query_message.reply_to_message
        theme = self.user_themes.get(snippet.from_user.id,
                                     'base16/gruvbox-dark-hard')

        html = mk_html(snippet.text, ext, theme)
        send_html(bot=self.bot,
                  chat_id=snippet.chat.id,
                  html=html,
                  reply_msg_id=snippet.message_id)

        with local.tempdir() as folder:
            png_path = mk_png(html, folder=folder)
            did_send = False
            # The default image send is only tried for snippets of at most
            # 30 lines.
            if len(snippet.text.splitlines()) <= 30:
                try:
                    photo_msg = send_image(bot=self.bot,
                                           chat_id=snippet.chat.id,
                                           png_path=png_path,
                                           reply_msg_id=snippet.message_id)
                except ApiException as e:
                    self.log.error("failed to send compressed image",
                                   exc_info=e,
                                   chat_id=snippet.chat.id)
                else:
                    did_send = True
                    # Attach a button that starts an "img <file_id>" inline
                    # query for the photo just sent.
                    kb_to_chat = InlineKeyboardMarkup()
                    kb_to_chat.add(
                        InlineKeyboardButton(
                            self.lang['send to chat'],
                            switch_inline_query=
                            f"img {photo_msg.photo[-1].file_id}"))
                    self.bot.edit_message_reply_markup(photo_msg.chat.id,
                                                       photo_msg.message_id,
                                                       reply_markup=kb_to_chat)
            # Fallback for long snippets or a failed send above.
            if not did_send:
                send_image(bot=self.bot,
                           chat_id=snippet.chat.id,
                           png_path=png_path,
                           reply_msg_id=snippet.message_id,
                           compress=False)

        if cb_query:
            self.bot.answer_callback_query(cb_query.id)

    def recv_photo(self, message: Message):
        """Log the file id of the first entry of a received photo message."""
        self.log.msg('received photo',
                     file_id=message.photo[0].file_id,
                     user_id=message.from_user.id,
                     user_first_name=message.from_user.first_name,
                     chat_id=message.chat.id)
def test_guess_language_name_plain_text():
    """Plain text is not attributed to any programming language."""
    detected = Guess().language_name(PLAIN_TEXT)
    assert detected is None
for folder in folders: files = [f for f in listdir(folder) if isfile(join(folder, f))] for f in files: file = join(folder, f) description = "" not_recognized = False with open(file, 'r') as file_reader: data = file_reader.read() data = '\n'.join([x for x in data.split("\n") if x.strip() != '']) # Stage 1 - recognize code vs human readable file. coding_langs = Guess().scores(data) k, v = sorted(coding_langs.items(), key=lambda p: p[1])[-1] if k == "Markdown" or v < 0.9: not_recognized = True else: description += "Code in " + k + "." if not_recognized: # Stage 2 - Check weird interpunction. punctuation_coding = count( data, string.punctuation.replace("!", "").replace(".", "").replace( "?", "").replace(",", "")) if punctuation_coding / len(data) > 0.06:
def _real_main():
    """Parse the command line, then learn, test or guess accordingly.

    With ``--learn`` the model is trained from a directory, with ``--test``
    its accuracy is measured and a report is saved; with neither, the
    language of the input file (stdin by default) is guessed and logged.
    """
    # Command line definition
    parser = argparse.ArgumentParser(description=__doc__)
    add_option = parser.add_argument
    add_option(
        '-i', '--input-file',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help="source code file. Default is standard input (stdin)")
    add_option(
        '-a', '--all',
        default=False,
        action='store_true',
        help="print all matching languages when guessing")
    add_option(
        '--learn',
        metavar='LEARN_DIR',
        help="learn from a directory containing source code files")
    add_option(
        '--test',
        metavar='TEST_DIR',
        help="test Guesslang model accuracy using source code files directory")
    add_option(
        '--model',
        metavar='MODEL_DIR',
        help="custom Guesslang learning model directory")
    add_option(
        '-d', '--debug',
        default=False,
        action='store_true',
        help="show debug messages")
    args = parser.parse_args()

    # Option consistency checks (parser.error() exits the program)
    if args.learn and not args.model:
        parser.error("Argument --model is required when using --learn")
    if args.learn and args.all:
        parser.error("Argument --all cannot be used with --learn")

    config_logging(debug=args.debug)
    LOGGER.debug("Run with args: %s", vars(args))

    guess = Guess(args.model)

    if args.learn:
        # Train the model from the given directory
        accuracy = guess.learn(args.learn)
        LOGGER.info("Guessing learning accuracy is %.2f%%", 100 * accuracy)

    if args.test:
        # Measure the model accuracy and keep a report
        results = guess.test(args.test)
        percent = 100 * results['overall-accuracy']
        LOGGER.info("The overall accuracy of the test is %.2f%%", percent)
        LOGGER.info("Test report saved into '%s'", _save_report(results))

    if not (args.learn or args.test):
        # Plain guessing mode
        content = _read_file(args.input_file)
        language_info = (
            " or ".join(guess.probable_languages(content))
            if args.all else guess.language_name(content))
        LOGGER.info("The source code is written in %s", language_info)

    LOGGER.debug("Exit OK")
from flask import Flask, render_template, request, redirect, abort
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from guesslang import Guess
from os import environ
from distutils.util import strtobool
from threading import Thread

pastey_version = "0.4.2"
loaded_config = {}
loaded_themes = []

app = Flask(__name__)
limiter = Limiter(app, key_func=get_remote_address)
guess = Guess()

# Imported only after ``app``, ``limiter`` and ``guess`` exist.
from pastey import config, common, routes, functions

# Environment variables, when present, take precedence over the values
# already held by ``config``.
config.data_directory = environ.get("PASTEY_DATA_DIRECTORY",
                                    config.data_directory)
config.listen_address = environ.get("PASTEY_LISTEN_ADDRESS",
                                    config.listen_address)
config.listen_port = environ.get("PASTEY_LISTEN_PORT", config.listen_port)

# Boolean settings are parsed with strtobool before being stored.
if "PASTEY_USE_WHITELIST" in environ:
    config.use_whitelist = bool(strtobool(environ["PASTEY_USE_WHITELIST"]))
if "PASTEY_RESTRICT_PASTING" in environ:
    config.restrict_pasting = bool(
        strtobool(environ["PASTEY_RESTRICT_PASTING"]))
def test_guess_init_with_model_dir():
    """A guesser created on an empty model directory is not trained."""
    with tempfile.TemporaryDirectory() as empty_model_dir:
        custom_guess = Guess(empty_model_dir)
        trained = custom_guess.is_trained
        assert not trained
def test_guess_init():
    """The default guesser comes with a trained model."""
    assert Guess().is_trained
def detect_lang(request):
    """Return the guessed language of the posted ``code`` field.

    The guess is wrapped in an ``HttpResponse`` as-is.
    """
    submitted_code = request.POST['code']
    detected = Guess().language_name(submitted_code)
    return HttpResponse(detected)
def test_guess_language_name():
    """The default model recognises the Python and C sample snippets."""
    guesser = Guess()
    python_guess = guesser.language_name(PYTHON_CODE)
    assert python_guess == 'Python'
    c_guess = guesser.language_name(C_CODE)
    assert c_guess == 'C'
#!/usr/bin/env python3 import os from guesslang import Guess import warnings warnings.filterwarnings("ignore") os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' g = Guess() def guess_lang(text): """ text: input text return: predicted progamming language """ lang = g.language_name(text) return lang if __name__ == "__main__": sample_lang = """ % Quick sort -module (recursion). -export ([qsort/1]). qsort([]) -> []; qsort([Pivot|T]) -> qsort([X || X <- T, X < Pivot])
def test_guess_language_name_empty_code():
    """Empty or whitespace-only input yields no language."""
    guesser = Guess()
    blank = guesser.language_name('')
    assert blank is None
    whitespace_only = guesser.language_name(' \t \n ')
    assert whitespace_only is None