def newenvironment(self, name, nargs=0, definition=None, opt=None):
    """
    Create a \\newenvironment

    Required Arguments:
    name -- name of the macro to create
    nargs -- integer number of arguments that the macro has
    definition -- two-element tuple containing the LaTeX definition.
        Each element should be a string.  The first element corresponds to
        the beginning of the environment, and the second element is the
        end of the environment.
    opt -- string containing the LaTeX code to use in the optional argument

    Examples::
        c.newenvironment('mylist', 0, (r'\\begin{itemize}', r'\\end{itemize}'))

    """
    name = str(name)

    # Macro already exists
    if self.has_key(name):
        if not issubclass(self[name], (plasTeX.NewCommand,
                                       plasTeX.Definition)):
            return
        macrolog.debug('redefining environment "%s"', name)

    if nargs is None:
        nargs = 0
    assert isinstance(nargs, int), 'nargs must be an integer'

    if definition is not None:
        assert isinstance(definition, (tuple, list)), \
            'definition must be a list or tuple'
        assert len(definition) == 2, 'definition must have 2 elements'

        if isinstance(definition[0], basestring):
            definition[0] = [x for x in Tokenizer(definition[0], self)]
        if isinstance(definition[1], basestring):
            definition[1] = [x for x in Tokenizer(definition[1], self)]

    if isinstance(opt, basestring):
        opt = [x for x in Tokenizer(opt, self)]

    macrolog.debug('creating newenvironment %s', name)

    # Begin portion
    newclass = new.classobj(name, (plasTeX.NewCommand,),
        {'nargs': nargs, 'opt': opt, 'definition': definition[0]})
    self.addGlobal(name, newclass)

    # End portion
    newclass = new.classobj('end' + name, (plasTeX.NewCommand,),
        {'nargs': 0, 'opt': None, 'definition': definition[1]})
    self.addGlobal('end' + name, newclass)

def test_issues__semicolon_missing_from_outfile():
    fake_file = BytesIO(b"""
    var char key;  // the key currently pressed by the user
    var boolean exit;
    let exit = false;
    """)
    t = Tokenizer(fake_file)
    assert (t.advance().token == 'var')
    assert (t.advance().token == 'char')
    assert (t.advance().token == 'key')
    assert (t.advance().token == ';')
    assert (t.advance().token == 'var')
    assert (t.advance().token == 'boolean')
    assert (t.advance().token == 'exit')
    assert (t.advance().token == ';')
    assert (t.advance().token == 'let')
    assert (t.advance().token == 'exit')
    assert (t.advance().token == '=')
    assert (t.advance().token == 'false')
    assert (t.advance().token == ';')

    fake_file = BytesIO(b" return();")
    t = Tokenizer(fake_file)
    assert (t.advance().token == 'return')
    assert (t.advance().token == '(')
    assert (t.advance().token == ')')
    assert (t.advance().token == ';')

def runTokenizer(self):
    """Prepare the files for the tokenizer"""
    inFileSource = re.sub(".txt$", "", self.fInput) + "-" + \
        self.pDict["sourceLanguage"] + ".txt"
    tokFileSource = re.sub(".txt$", "-tok.txt", inFileSource)
    inFileTarget = re.sub(".txt$", "", self.fInput) + "-" + \
        self.pDict["targetLanguage"] + ".txt"
    tokFileTarget = re.sub(".txt$", "-tok.txt", inFileTarget)

    self.populate(inFileSource, inFileTarget)

    tokenizer = Tokenizer(inFileSource, tokFileSource,
                          self.pDict["sourceLanguage"])
    tokenizer.execute()
    tokenizer = Tokenizer(inFileTarget, tokFileTarget,
                          self.pDict["targetLanguage"])
    tokenizer.execute()

    self.pDict["tokFileSource"] = tokFileSource
    self.pDict["tokFileTarget"] = tokFileTarget

    # ----- Clean the intermediate files -----
    os.remove(inFileSource)
    os.remove(inFileTarget)

def build_index(self):
    '''
    Build the inverted index: insert each URL into the doc table with a
    doc_id, insert each token into the tokenT table, and insert the token,
    doc_id, term frequency, and weight into the web_index table.
    '''
    c = Corpus()
    t = Tokenizer()
    for url, name in c.get_file_name():
        if len(url) > 1000:
            continue
        result = t.tokenize(name)
        if len(result) == 0:
            continue
        print(url)
        doc_id = 1

        # Insert URL into table doc
        sql = "INSERT INTO web.doc(url) values (%s)"
        val = (url,)
        self.mycursor.execute(sql, val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in URL.")
        print(url)

        # Look up the doc_id that was just assigned
        s_sql = "select id from doc where url=%s"
        self.mycursor.execute(s_sql, val)
        myresult = self.mycursor.fetchone()
        doc_id = myresult[0]
        print("DOC_ID IS " + str(doc_id))

        # Insert token, doc_id, tf, and weight into web_index
        t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
        t_val = []
        for token in result.keys():
            t_val.append(
                (token, doc_id, result[token][0], result[token][1]))
        # print(t_val)
        self.mycursor.executemany(t_sql, t_val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

        # Insert each token into the tokenT table
        count = 0
        for token in result.keys():
            tq = "Insert ignore into tokenT values (%s)"
            tv = (token,)
            self.mycursor.execute(tq, tv)
            self.mydb.commit()
            count += 1
        print("inserted " + str(count) + " Tokens")

def calculate(self, target_paragraph, predicted_paragraph):
    tokenizer = Tokenizer()
    target_words_list = tokenizer.tokenize_to_words_nltk(
        target_paragraph.encode('utf-8'))
    predicted_words_list = tokenizer.tokenize_to_words_nltk(
        predicted_paragraph.encode('utf-8'))
    dictionary = Utils.construct_dictionary(
        [target_words_list, predicted_words_list])
    target_bow, predicted_bow = Utils.retrieve_bag_of_words(
        dictionary, target_words_list, predicted_words_list)
    document_precision, document_recall, document_f1score = Utils.calculate_bow_precision_recall(
        target_bow, predicted_bow, len(target_words_list),
        len(predicted_words_list))

    self.precision_list.append(document_precision)
    key = int(np.floor_divide(document_precision, 10))
    self.precision_count_map[key] = self.precision_count_map.get(key, 0) + 1

    self.recall_list.append(document_recall)
    key = int(np.floor_divide(document_recall, 10))
    self.recall_count_map[key] = self.recall_count_map.get(key, 0) + 1

    self.f1score_list.append(document_f1score)
    key = int(np.floor_divide(document_f1score, 10))
    self.f1score_count_map[key] = self.f1score_count_map.get(key, 0) + 1

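A minimal sketch of the bucketing used above, assuming the scores are on a 0-100 scale (the snippet itself does not say which scale Utils.calculate_bow_precision_recall returns):

import numpy as np

# np.floor_divide groups a score into buckets of ten.
assert int(np.floor_divide(73.4, 10)) == 7    # a score of 73.4 lands in bucket 7
assert int(np.floor_divide(100.0, 10)) == 10  # a perfect score lands in bucket 10
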
def get_tokens(jack_file):
    tokens = []  # collect every non-empty token emitted by the tokenizer
    token_generator = Tokenizer(jack_file)
    for token in token_generator:
        if not token:
            continue
        tokens.append(token)
    return tokens

def run(code, file, path):
    Parser.tokens = Tokenizer(code)
    Parser.file = file
    Parser.writer = MKDocsWriter()
    Parser.path = path
    ret = Parser.parseDocstring()
    return ret

def parse_state_machine(cls, inputString):
    tokens = Tokenizer(inputString)
    number_of_states = 0
    state_machine_id = 0
    state_machine = StateMachine()
    while (tokens.has_more_tokens()):
        current_token = tokens.peek()
        if (current_token == '--fsm-config'):
            tokens.next_token()
            state_machine_id = tokens.next_token()
        elif (current_token == '--nstates'):
            tokens.next_token()
            number_of_states = tokens.next_token()
        else:
            new_state = State(tokens)
            state_machine.add_state(new_state)
            # Create transitions
            token = tokens.peek()
            while (re.match(r'--n(\d+)x(\d+)', token) != None
                   and re.match(r'--s(\d+)', token) == None):
                new_transition = Transition(tokens)
                state_machine.add_transition(new_transition)
                token = tokens.peek()
    return state_machine

def start(self, queue):
    tokenizer = Tokenizer()
    tokenized_text = tokenizer.tokenize_paragraph(self._text)
    sentences = [sent for sent, _ in tokenized_text]
    tokenized_text = [' '.join(sent) for _, sent in tokenized_text]
    vectorizer = self._get_vectorizer()
    dtm = vectorizer.fit_transform(tokenized_text)
    u, sigma, vT = randomized_svd(dtm.T,
                                  n_components=self._n_components,
                                  n_iter=5,
                                  random_state=None)
    sigma = np.diag(sigma)
    scores = self._sentence_selection(vT, sigma)
    scores.sort()
    summary = ' '.join([sentences[i] for i in scores])
    similarity = tokenizer.calculate_similarity(self._text, summary)
    res = {
        "id": self._id,
        "summary": summary,
        "similarity": similarity,
        'status': "done"
    }
    if self._title is not None:
        res['title'] = self._title
    queue.put(res)

def index(self, keep_stopwords=True, stem=False):
    print "Indexing %s documents in bulk!" % (len(self.docs))
    inverted_list = {}
    for dID, text in self.docs.iteritems():
        tokenizer = Tokenizer(text)
        tokens = tokenizer.tokenize()
        position = 1
        filtered_tokens = []
        for t in tokens:
            if not keep_stopwords:
                if t in stop_words:
                    continue
            if stem:
                t = porter2_stem(t)
            filtered_tokens.append(t)
            mDocID = self.mapper.mapDocID(dID)
            mToken = self.mapper.mapToken(t)
            if mToken not in inverted_list:
                inverted_list[mToken] = {}
            inverted_list[mToken].setdefault(mDocID, []).append(position)
            position += 1
        self.docLengths[dID] = len(filtered_tokens)
    return self.__writeInvertedIndex(inverted_list)

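For reference, the nested dict that index() accumulates maps mapped token id to mapped doc id to the list of positions at which the token occurs; a small sketch with made-up ids:

# Shape of inverted_list built above (ids are illustrative, not from real data):
example_inverted_list = {
    42: {7: [1, 5], 9: [3]},  # token 42 occurs at positions 1 and 5 in doc 7, and position 3 in doc 9
}
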
def getDocuments2(path):
    # Should find a single file now
    file = glob.glob(path)
    file = open(file[0], "r", encoding="utf8")
    file = file.read()
    file = file.splitlines()
    tokenizer = Tokenizer()
    index = 0
    output = []
    normalizedDoc = []
    for line in file:
        if line == "":
            index += 1
        line = tokenizer.stemQuery(line)
        normalizedDoc += line
        if index % 2 == 0 and index > 1:
            # The error is here index thing
            # Instead of returning, we want to normalize all this and send it somewhere
            # Gonna do lots of computing though
            # Remove the URL from the normalized document
            # If we are at the end of the file normalizedDoc will be empty, so we can just stop
            if normalizedDoc == []:
                return output
            normalizedDoc.pop(0)
            normalizedDoc.pop(0)
            # Add the normalized doc to the output list
            output.append(normalizedDoc)
            # Reset the normalizedDoc list to prepare for the next document
            normalizedDoc = []

def get_comm_token(self, count_num):
    tf = pd.DataFrame()
    for name in self.names_of_knn:
        tf = tf.append(
            self.knn_csv.loc[self.knn_csv['image_name'] == name])
    tf.index = range(len(tf))
    token_list = Tokenizer(tf)
    set_list = []
    total_list = []
    comm_token = []
    for t_list in token_list:
        set_list.append(list(set(t_list)))
    for s_list in set_list:
        total_list += s_list
    for token in total_list:
        count = 0
        for s_list in set_list:
            count += s_list.count(token)
        if count >= count_num:
            comm_token.append(token)
    return list(set(comm_token))

def __init__(self, text, product_name):
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')
    t = Tokenizer()
    sents = t.sent_tokenize(text.lower())
    p = POSTagger()
    wnl = WordNetLemmatizer()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.word_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['tags'] = tagged_sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            # Don't include proper nouns
            if tag.startswith('N') and tag != 'NNP':
                """
                Consecutive nouns might form a feature phrase.
                Eg. Picture quality is a phrase.
                Meaningless phrases like 'quality digital' are removed
                later as their frequency of occurrence is low.
                """
                if i > 0 and len(feature_sent['nouns']) > 0 and \
                        tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and \
                        feature_sent['sentence'].find(
                            feature_sent['nouns'][-1] + ' ' + word) > -1:
                    feature_sent['noun_phrases'].append(
                        wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
                else:
                    feature_sent['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sent)

def __init__(self, args, logger=None):
    self.args = args
    self.logger = logger
    self.tokenizer_method = args.tokenizer_method
    self.remove_stopwords = True
    self.vocab_path = os.path.join(args.experiment_folder, "preprocessed/")
    self.tokenizer = None
    self.seed = args.seed
    self.tokenizer_obj = Tokenizer(language='english',
                                   tokenizer_method=self.tokenizer_method,
                                   remove_stopwords=self.remove_stopwords,
                                   ngram_range=(1, 1),
                                   min_freq=1,
                                   max_freq_perc=1.0)
    self.tokenizer = self.tokenizer_obj.tokenizer
    self.vectorizer = TfidfVectorizer(sublinear_tf=True,
                                      min_df=5,
                                      max_df=0.9,
                                      norm='l2',
                                      ngram_range=(1, 2),
                                      analyzer='word',
                                      tokenizer=identity_fn,
                                      preprocessor=identity_fn,
                                      token_pattern=None)
    self.model = LogisticRegression(random_state=self.seed, max_iter=int(1e6))
    self.finetune = self.train_pseudo

def test_base():
    fake_file = BytesIO(b"method void dispose();")
    t = Tokenizer(fake_file)
    token = t.advance()
    assert (token.token == 'method')
    assert (token.type == 'keyword')
    token = t.advance()
    assert (token.token == 'void')
    assert (token.type == 'keyword')
    token = t.advance()
    assert (token.token == 'dispose')
    assert (token.type == 'identifier')
    token = t.advance()
    assert (token.token == '(')
    assert (token.type == 'symbol')
    token = t.advance()
    assert (token.token == ')')
    assert (token.type == 'symbol')
    token = t.advance()
    assert (token.token == ';')
    assert (token.type == 'symbol')

def check_corpus(self, scrape_from_when, min_word_freq):
    """
    Checks whether pre-existing files are available or whether they have to be
    regenerated. If data needs to be scraped, the bot goes ahead and does that
    and immediately generates a corpus for the collected data.

    :param min_word_freq: the minimum number of times a word must appear in
        the corpus to be in the user's vocab
    :param scrape_from_when: when the bot will start grabbing tweets from
    """
    if min_word_freq < 1:
        raise ValueError(colors.red("Word frequency threshold must be greater than 0"))
    if self.handle in "test":
        return None  # nothing to do here
    scraped = False
    if not os.path.exists(self.corpus):  # check for corpus file
        print colors.red("no corpus.json file found - generating...")
        if not os.path.exists(self.folder):  # check if they even have a folder yet
            os.mkdir(self.folder)
        scrape(self.handle, self.keys,
               start=scrape_from_when if scrape_from_when else self.get_join_date())
        scraped = True
    if scrape_from_when and not scraped:  # they already had a corpus and need a special scrape
        scrape(self.handle, self.keys, start=scrape_from_when)
    tokenizer = Tokenizer(min_word_freq)
    tokenizer.generate(self.handle)
    return tokenizer  # always return the Tokenizer object

def compile_jack(jack_file_name):
    token_file_name = jack_file_name.replace('.jack', 'T.xml')
    token_file = open(token_file_name, 'w')
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer(jack_file, token_file)
    vm_file = open(jack_file_name.replace('.jack', '') + '.vm', 'w')
    code_writer = CodeWriter(tokenizer, vm_file)
    code_writer.compile_class()

def main():
    tk = Tokenizer('Mytestfor10.jack')
    while (tk.has_more_tokens()):
        tk.advance()
        print(tk.token_type(), tk.identifier())
    ce = CompilationEngine('Mytestfor10.jack')
    root = Element('do')
    ce.compile_if(root)

def __init__(self, model_dir):
    '''
    @param model_dir: The directory containing all trained model files
    '''
    self.models = {}
    self.tokenizer = Tokenizer()
    os.path.walk(model_dir, install_all_model, self.models)
    print "All models loaded"

def __init__(self, input_content, file=True, debug=False):
    self.tokenizer = Tokenizer(input_content, file)
    self.tokenizer.tokenize()
    self.token = None
    self.is_debug = debug
    self.pos = 0
    self.info = []
    self.debug = []
    self.eps = False

def read(self):
    # print('!', sys.argv[0])
    # print(os.path.dirname(__file__) + '/data/ptb.valid.txt')
    file = open(os.path.dirname(__file__) + '/data/ptb.valid.txt')
    lines = file.readlines()
    tokenizer = Tokenizer(9999, oov_token=1)
    tokenizer.fit_on_texts(lines)
    self.seqs = tokenizer.texts_to_sequences(lines)
    return self.seqs

def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

    print(FLAGS.vocab_size)
    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
        FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode != 'decode':
        raise Exception("The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
                   'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
                   'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage',
                   'cov_loss_wt', 'pointer_gen']
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps.mode == 'train':
        print("creating model...")
        model = SummarizationModel(hps, vocab)
        setup_training(model, batcher)
    elif hps.mode == 'eval':
        model = SummarizationModel(hps, vocab)
        run_eval(model, batcher, vocab)
    elif hps.mode == 'decode':
        tokenizer = Tokenizer(FLAGS.article_path, FLAGS.data_path)
        decode_model_hps = hps  # This will be the hyperparameters for the decoder model
        decode_model_hps = hps._replace(max_dec_steps=1)  # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
        model = SummarizationModel(decode_model_hps, vocab)
        decoder = BeamSearchDecoder(model, batcher, vocab)
        decoder.decode()  # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")

def test_compile_class_with_subroutine_description():
    fake_file = BytesIO(b"""
    class X{
        constructor X new(){
            return this;
        }
        method void new(){
            return this;
        }
        function void main() {
            return;
        }
    }""")
    t = Tokenizer(fake_file)
    c = CompilationEngine(t)
    node = c.compile()
    sd_1 = node.value[3].value
    sd_2 = node.value[4].value
    sd_3 = node.value[5].value

    # Test constructor
    assert (sd_1.value[0].name == 'keyword')
    assert (sd_1.value[0].value == 'constructor')
    assert (sd_1.value[1].name == 'identifier')
    assert (sd_1.value[1].value == 'X')
    assert (sd_1.value[2].name == 'identifier')
    assert (sd_1.value[2].value == 'new')
    assert (sd_1.value[3].name == 'symbol')
    assert (sd_1.value[3].value == '(')
    assert (sd_1.value[4].name == 'parameterList')
    assert (sd_1.value[4].value == [])
    assert (sd_1.value[5].name == 'symbol')
    assert (sd_1.value[5].value == ')')
    assert (sd_1.value[6].name == 'subroutineBody')
    assert (sd_1.value[6].value[0].value == '{')

    # Test method
    assert (sd_2.value[0].name == 'keyword')
    assert (sd_2.value[0].value == 'method')

    # Test function
    assert (sd_3.value[0].name == 'keyword')
    assert (sd_3.value[0].value == 'function')

    # Test that class closes properly
    assert (node.value[6].name == 'symbol')
    assert (node.value[6].value == '}')

def __init__(self, vocab_dict, estimated_sent_len, estimated_doc_len):
    self.vocab_dict = vocab_dict
    self.estimated_sent_len = estimated_sent_len
    self.estimated_doc_len = estimated_doc_len
    self.tokenizer = Tokenizer()
    self.sentence_processing = SentenceProcessing()
    self.document_processing = DocumentProcessing()
    self.unknown_words_processing = UnknownWordsProcessing(
        vocab_list=vocab_dict.keys(), replace=False)

def newcommand(self, name, nargs=0, definition=None, opt=None):
    """
    Create a \\newcommand

    Required Arguments:
    name -- name of the macro to create
    nargs -- integer number of arguments that the macro has
    definition -- string containing the LaTeX definition
    opt -- string containing the LaTeX code to use in the optional argument

    Examples::
        c.newcommand('bold', 1, r'\\textbf{#1}')
        c.newcommand('foo', 2, r'{\\bf #1#2}', opt='myprefix')

    """
    name = str(name)

    # Macro already exists
    if self.has_key(name):
        if not issubclass(self[name], (plasTeX.NewCommand,
                                       plasTeX.Definition)):
            if not issubclass(self[name], plasTeX.TheCounter):
                return
        macrolog.debug('redefining command "%s"', name)

    if nargs is None:
        nargs = 0
    assert isinstance(nargs, int), 'nargs must be an integer'

    if isinstance(definition, basestring):
        definition = [x for x in Tokenizer(definition, self)]

    if isinstance(opt, basestring):
        opt = [x for x in Tokenizer(opt, self)]

    macrolog.debug('creating newcommand %s', name)

    newclass = new.classobj(name, (plasTeX.NewCommand,),
        {'nargs': nargs, 'opt': opt, 'definition': definition})
    self.addGlobal(name, newclass)

def run(code):
    code = Pre_process.filter(code)
    Parser.tokens = Tokenizer(code)
    resultado = Parser.parseProgram()
    if Parser.tokens.actual.value != 'eof':
        raise Exception("EOF not reached")
    return resultado

def run(self):
    # process and compile each class
    for file in self.fileLists:
        filename = file.stem
        fileHandle = open(str(file))
        self.tokenizer = Tokenizer()
        self.parser = Parser(self.parent, filename)
        self.compileCode(fileHandle)
    print('Code successfully compiled')

class Miner():
    filePath = './data/'
    tokenizer = Tokenizer('portuguese')
    writter = TokenWritter()

    def __init__(self, filename):
        if (filename == ''):
            raise ValueError('filename cannot be empty.')
        self.filePath += filename
        self.load()

    def load(self):
        self.writeDataframe(self.generateDataframe())

    def search(self, tags=[], words=[]):
        if (tags == [] and words == []):
            raise ValueError('You must search for a tag or a word, or both.')
        df = next(self.readDataframe())
        if (tags == []):
            return df[df['PALAVRA'].isin(words)]
        if (words == []):
            return df[df['TAG'].isin(tags)]
        wdf = df[df['PALAVRA'].isin(words)]
        return wdf[wdf['TAG'].isin(tags)]

    def generateDataframe(self):
        self.generateMidFile()
        df = pd.read_csv('./minerMid.csv', sep=',', header=0)
        df['count'] = df.groupby(['PALAVRA', 'TAG'])['PALAVRA'].transform('count')
        df.drop_duplicates(inplace=True)
        # QUICKSORT ALGORITHM
        df.sort_values(by=['count'], ascending=False, inplace=True, kind='quicksort')
        return df

    def writeDataframe(self, df):
        fileDf = open('dataframe.pickle', 'wb')
        pickle.dump(df, fileDf)
        fileDf.close()

    def readDataframe(self):
        file = open('dataframe.pickle', 'rb')
        yield pickle.load(file)
        file.close()

    def generateMidFile(self):
        self.writter.outputTokens(
            self.tokenizer.tokenizeFileWords(self.filePath), 'minerMid.csv')

def main():
    expression = input("Enter the expression: ")
    expression += " "
    calc = Calculator()
    toke = Tokenizer()
    print("Expression: " + expression)
    print("-----------")
    calc.tokens = toke.Tokenize(expression)
    calc.PrettyPrint(calc.tokens)
    print("-----------")
    print("Expression Result: ", calc.ArithmeticExpression())

def repl():
    tokenizer = Tokenizer()
    parser = Parser()
    exp = ''
    while exp != 'end':
        exp = input('exp > ')
        tokenizer.clear()
        tokens = tokenizer.tokenize(exp)
        codes = parser.parse(tokens)
        show_codes(codes)