def predict_answers(data, word2vec, N):
    """Predict the answer (A-D) whose word-vector sum best matches the question.

    Each text is reduced to the L2-normalised sum of the vectors of its
    tokens that are present in ``word2vec`` and are not English stopwords.
    The predicted answer is the one with the largest dot product against
    the question vector.

    Args:
        data: table with ``question`` and ``answerA`` .. ``answerD`` columns,
            addressed as ``data[column][i]`` for ``i`` in
            ``range(data.shape[0])``.
        word2vec: container of known lower-cased words; tested with ``in``
            and passed on to ``getword2vecval``.
        N: dimensionality of the word vectors.

    Returns:
        A pair ``(pred_answs, pred_probs)``: the predicted letter per row, and
        the score rows preceded by the header ``["A", "B", "C", "D"]``.
    """
    # A set makes the per-token stopword test O(1).
    stop = set(stopwords.words('english'))

    def text_vector(text):
        # Normalised sum of the vectors of the usable tokens of ``text``.
        vec = np.zeros(N, dtype=float)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec = np.add(vec, getword2vecval(N, word, word2vec))
        norm = linalg.norm(vec)
        # A text without usable tokens keeps its zero vector: dividing by a
        # zero norm would give NaN scores, and argmax would then select the
        # NaN entry as if it were the best answer.
        return vec / norm if norm > 0 else vec

    pred_answs = []
    pred_probs = [["A", "B", "C", "D"]]
    for i in range(data.shape[0]):
        q_vec = text_vector(data['question'][i])
        answer_vecs = np.array([
            text_vector(data['answerA'][i]),
            text_vector(data['answerB'][i]),
            text_vector(data['answerC'][i]),
            text_vector(data['answerD'][i]),
        ])
        # Both vectors are unit length, so the dot product is the cosine
        # similarity; it is computed once and reused for the argmax.
        probs = answer_vecs.dot(q_vec)
        pred_answs.append(["A", "B", "C", "D"][probs.argmax()])
        pred_probs.append(probs)
    return pred_answs, pred_probs
def get_glove_features(data, word2vec, N):
    """Compute question/answer similarity scores from summed word vectors.

    Each text is reduced to the L2-normalised sum of ``word2vec[token]`` over
    its lower-cased tokens that are in ``word2vec`` and are not English
    stopwords.

    Args:
        data: table with ``question`` and ``answerA`` .. ``answerD`` columns,
            addressed as ``data[column][i]`` for ``i`` in
            ``range(data.shape[0])``.
        word2vec: mapping from lower-cased word to its vector.
        N: dimensionality of the word vectors.

    Returns:
        A list with one array of four scores (A, B, C, D) per row.
    """
    # A set makes the per-token stopword test O(1).
    stop = set(stopwords.words('english'))

    def text_vector(text):
        # Normalised sum of the vectors of the usable tokens of ``text``.
        vec = np.zeros(N)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec += word2vec[word]
        norm = linalg.norm(vec)
        # Keep the zero vector for texts without usable tokens instead of
        # producing NaN scores through a division by zero.
        return vec / norm if norm > 0 else vec

    scores = []
    for i in range(data.shape[0]):
        q_vec = text_vector(data['question'][i])
        answer_vecs = np.array([
            text_vector(data['answerA'][i]),
            text_vector(data['answerB'][i]),
            text_vector(data['answerC'][i]),
            text_vector(data['answerD'][i]),
        ])
        scores.append(answer_vecs.dot(q_vec))
    return scores
def __build_dictionary(synset, hyperhypo):
    """Collect the lower-cased, stopword-free words that describe a synset.

    The words come from the synset's definition and lemma names and, when
    ``hyperhypo`` is true, from the definitions and lemma names of its
    hypernyms and hyponyms.

    Args:
        synset: WordNet synset to describe.
        hyperhypo: whether to include hypernyms and hyponyms.

    Returns:
        A list of lower-cased words with the entries of
        ``english_stopwords`` removed.
    """
    def value_of(member):
        # Older NLTK releases expose definition / lemma_names / lemmas / name
        # as plain attributes, NLTK 3 exposes them as methods; accept both.
        return member() if callable(member) else member

    lesk_dictionary = []
    # Includes definition.
    lesk_dictionary += tokenize(value_of(synset.definition))
    # Includes lemma_names.
    lesk_dictionary += value_of(synset.lemma_names)
    # Optional: includes lemma_names of hypernyms and hyponyms.
    if hyperhypo:
        for related_sense in synset.hypernyms() + synset.hyponyms():
            lesk_dictionary += tokenize(value_of(related_sense.definition))
            lesk_dictionary += [value_of(lemma.name)
                                for lemma in value_of(related_sense.lemmas)]

    # Lower-case BEFORE the stopword test so that capitalised stopwords
    # (e.g. "The") are removed as well.
    lowered = (word.lower() for word in lesk_dictionary)
    return [word for word in lowered if word not in english_stopwords]
def generate_citations(lines, vocab, index):
    """Print the longer sentences of the first 100 lines with a citation.

    Each line is tokenised twice (plain and capitalised form).  Tokens are
    collected until a ``'.'`` token; a sentence with more than 10 tokens is
    printed in its capitalised form followed by the citation produced by
    ``generate_citation`` for its in-vocabulary word ids.  Tokens after the
    last ``'.'`` of a line are discarded.

    Args:
        lines: sequence of text lines; only ``lines[:100]`` is used.
        vocab: sequence of words; a word's position is its id.
        index: passed through to ``generate_citation``.
    """
    word2idx = {term: position for position, term in enumerate(vocab)}
    for line in lines[:100]:
        plain_tokens = utils.tokenize(line, periods=True)
        cased_tokens = utils.tokenize(line, periods=True, capitalized=True)
        sentence, display = [], []
        for word, cap in zip(plain_tokens, cased_tokens):
            if word != '.':
                sentence.append(word)
                display.append(cap)
                continue
            # End of sentence: emit it if it is long enough, then start over.
            if len(sentence) > 10:
                word_ids = [word2idx[w] for w in sentence if w in word2idx]
                citation = generate_citation(word_ids, index)
                print(' '.join(display) + ' (%s).' % citation)
            sentence, display = [], []
def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):
    """Answer each question with the tf-idf index of the question's category.

    For every row the tf-idf index of its category is loaded (once per
    category), the ``docs_per_q`` most important documents for the question
    are retrieved, and each answer is scored by summing ``tf * idf`` of its
    words over those documents.

    Args:
        data: DataFrame with ``id``, ``question`` and ``answerA`` ..
            ``answerD`` columns.
        docs_per_q: passed to ``utils.get_docs_importance_for_question``.
        ids_and_categories: mapping from ``str(id)`` to category name.

    Returns:
        A list with the predicted letter ('A'-'D') per row.
    """
    res = []
    category_tf_idfs = {}  # category -> (docs_tf, words_idf), filled lazily
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print(current_id)
        current_category = ids_and_categories[current_id]
        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(wiki_docs_dir + '/%s' % current_category)
        docs_tf, words_idf = category_tf_idfs[current_category]

        # get answers words
        answer_words = [set(utils.tokenize(row[column]))
                        for column in ('answerA', 'answerB', 'answerC', 'answerD')]

        ranked = list(zip(*utils.get_docs_importance_for_question(
            row['question'], docs_tf, words_idf, docs_per_q)))
        # list(zip(...)) works on Python 2 and 3; an empty ranking simply
        # leaves every score at 0 instead of raising IndexError.
        top_docs = ranked[0] if ranked else ()

        # Score = sum over documents of (count of the word in the document)
        # times (idf of the word), for every answer word in the document.
        scores = [
            sum(1. * docs_tf[d][w] * words_idf[w]
                for d in top_docs for w in words if w in docs_tf[d])
            for words in answer_words
        ]
        res.append(['A', 'B', 'C', 'D'][np.argmax(scores)])
    return res
def matchUp(self, token, ingredientRow):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.

    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name

    The result is a list with the upper-cased key of every field of
    ``ingredientRow`` that matches ``token``; a key is repeated once per
    matching word of a string field.
    """
    ret = []

    # strip parens from the token, since they often appear in the
    # display_name, but are removed from the comment.
    token = utils.normalizeToken(token)
    decimalToken = self.parseNumbers(token)

    for key, val in ingredientRow.iteritems():
        if isinstance(val, basestring):
            # String field: compare the token with every word of the value.
            for vt in utils.tokenize(val):
                if utils.normalizeToken(vt) == token:
                    ret.append(key.upper())
        elif decimalToken is not None:
            # Non-string field: compare against the numeric form of the token.
            try:
                if val == decimalToken:
                    ret.append(key.upper())
            except Exception:
                # Best effort: values that cannot be compared are skipped,
                # but KeyboardInterrupt / SystemExit are no longer swallowed.
                pass

    return ret
def testTokens(self):
    """Check text and type of every token produced for ``self.str3``."""
    tokens = utils.tokenize(self.str3)
    self.assertEqual(11, len(tokens))

    expected_text = [
        '\n two empty spaces and some escaped chars \\\"\\\' in normal textfollowed by a ',
        '"dbl quote"',
        ' and then a ',
        "'single quote'",
        '\nwait there is more!! ',
        '"\'signle quotes\' inside a double quote"',
        ' and ',
        '\'"double quotes" inside a single quote\'',
        '\nwait! there\\\'s more!! ',
        '"escaped double quotes \\" and escaped single quotes\\\' "',
        ' ',
    ]
    for position, text in enumerate(expected_text):
        self.assertEqual(text, tokens[position]['token'])

    # Normal text alternates with quoted tokens: "..", '..', "..", '..', "..".
    expected_type = [
        utils.TOKEN_NORMAL,
        utils.TOKEN_DBL_Q,
        utils.TOKEN_NORMAL,
        utils.TOKEN_SNG_Q,
        utils.TOKEN_NORMAL,
        utils.TOKEN_DBL_Q,
        utils.TOKEN_NORMAL,
        utils.TOKEN_SNG_Q,
        utils.TOKEN_NORMAL,
        utils.TOKEN_DBL_Q,
        utils.TOKEN_NORMAL,
    ]
    for position, kind in enumerate(expected_type):
        self.assertEqual(kind, tokens[position]['type'])
def FrequentWords(data_dirs, suffixes, max_key_words):
    """
    Counts the tokens of all files matching data_dirs / suffixes.

    Returns a list of at most max_key_words (token, count) pairs, most
    frequent token first.
    """
    matches = matchingFiles(data_dirs, suffixes)
    token_count = Counter()
    for files_done, file_name in enumerate(matches, start=1):
        # Counter yields 0 for unseen keys, so plain counting needs no
        # try/except; empty tokens are ignored.
        token_count.update(token for token in tokenize(file_name)
                           if len(token) != 0)
        if files_done % 5000 == 0:
            print("Completed parsing %d files ..." % files_done)
    return token_count.most_common(max_key_words)
def tag(self, text=None):
    """
    Tags the given text.

    :param text: a string or unicode object. Strings assumed to be utf-8.
        When empty or None, tokens are read from stdin instead: one token
        per line (first whitespace-separated field), a blank line ends a
        sentence.
    :returns: a list of lists (sentences with tokens).
        Each sentence has (token, tag) tuples.
    """
    result = []
    if text:
        for sent in utils.tokenize(text, clean=False):
            tags = self.tag_tokens(sent)
            # list() keeps the documented list-of-lists shape on Python 3.
            result.append(list(zip(sent, tags)))
        return result

    # read tsv from stdin
    sent = []
    for line in sys.stdin:
        # Python 2 yields byte strings here, Python 3 yields text already.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.strip()
        if line:
            sent.append(line.split()[0])
        else:
            tags = self.tag_tokens(sent)
            result.append(list(zip(sent, tags)))
            sent = []
    # Flush the last sentence when the input does not end with a blank line.
    if sent:
        tags = self.tag_tokens(sent)
        result.append(list(zip(sent, tags)))
    return result
def bird_info(self):
    """Query the BIRD daemon on the machine for its version and OSPF neighbours.

    Returns:
        A dict with ``daemon`` and ``version`` (first two space-separated
        words of the ``birdc`` banner) and ``ospf.neighbours``, a list of
        dicts with ``routerid``, ``ifname`` and ``v4addr``.
    """
    banner = self.machine.run("echo | birdc | head -1").strip().replace(" ready.", "")
    birdv = banner.split(" ")
    info = {
        "daemon": birdv[0],
        "version": birdv[1],
        "ospf": {}
    }
    log.info("[%s] getting OSPF neighbours" % self.hostname())
    output = self.machine.run("echo show ospf neighbors | birdc | sed '/^bird[^ ] .*/d'")
    neighbours = []
    # The first two output lines are skipped (presumably headers - confirm).
    for line in splitlines(output)[2:]:
        toks = tokenize(line)
        # Ignore blank lines, as quagga_info does, instead of failing on them.
        if len(toks) == 0:
            continue
        neighbour = {"routerid": toks[0]}
        # Columns 4 and 5 hold interface name and address in either order;
        # the one starting with a letter is taken as the interface name.
        if toks[4][0] in ascii_letters:
            neighbour["ifname"] = toks[4]
            neighbour["v4addr"] = toks[5]
        else:
            neighbour["v4addr"] = toks[4]
            neighbour["ifname"] = toks[5]
        neighbours.append(neighbour)
    info["ospf"]["neighbours"] = neighbours
    return info
def generate_data(self, count, offset):
    """
    Generates training data in the CRF++ format for the ingredient
    tagging task.

    Reads the CSV at ``self.opts.data_path`` and prints, for rows
    ``offset`` .. ``offset + count - 1``, one line per token of the
    ``input`` column (token, features, best tag), followed by an empty
    line after every row.
    """
    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")

    start = int(offset)
    end = int(offset) + int(count)
    df_slice = df.iloc[start:end]

    for index, row in df_slice.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            # The remaining fields of the row are the tags to match against.
            del row["input"]

            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(utils.joinLine([token] + features + [self.bestTag(tags)]))

        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

        # Empty line after each row (emitted even when the row was skipped).
        print("")
def classify_proba(self, text):
    """Return the first row of the classifier's ``predict_proba`` for ``text``.

    The text is tokenised, stripped of the words in ``self.stopset``, turned
    into word frequencies, weighted with ``self.idf_dict`` and vectorised
    with ``self.vectorizer`` before being passed to ``self.classifier``.
    """
    kept_tokens = del_stopwords(tokenize(text), self.stopset)
    weighted = tfidf(stat_wordfreq(kept_tokens), self.idf_dict)
    matrix = self.vectorizer.transform(weighted).toarray()
    return self.classifier.predict_proba(matrix)[0]
def macaddr(self, iface):
    """Return the upper-cased MAC address of ``iface``, or None.

    None is returned when ``ip link`` prints no ``link/ether`` line for the
    interface or when the address consists only of zeros.
    """
    link_line = self.machine.run("ip link show dev %s | grep link/ether" % iface).strip()
    if link_line:
        mac = tokenize(link_line)[1].upper()
        # Whatever remains after dropping "0" and ":" is a non-zero digit.
        if mac.replace("0", "").replace(":", ""):
            return mac
    return None
def find_similar_articles(corpus_name, method, content, data_dir=os.getcwd(), index=None):
    """
    Return the 5 articles of a corpus that are closest to the given text.

    - corpus_name : name of the corpus to work on (.tsv file name without
      the .tsv extension)
    - method : ldan (n = number of topics), lsin or tfidf
    - content : a text
    - data_dir : directory that holds the corpus, index, dictionary and
      model files
    - index : an already loaded similarity index; loaded from data_dir
      when not given

    Returns a JSON string: a list of {'id': ..., 'score': ...} objects,
    best match first, scores rounded to 2 decimals.

    Raises IOError when the corpus, index or dictionary file cannot be
    loaded and ValueError when the method is not supported.
    """
    corpus_file = os.path.join(data_dir, corpus_name + '_' + method + '.mm')
    index_file = os.path.join(data_dir, corpus_name + '_' + method + '_index')
    docid_file = os.path.join(data_dir, corpus_name + '_docid.txt')

    # Load the corpus (only to check that it is available).
    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s. Avez-vous bien appliqué le script corpus_to_matrix.py ?' % (corpus_file))

    # Load the index file unless an index was passed in.
    if not index:
        try:
            index = similarities.docsim.Similarity.load(index_file)
        except Exception:
            # File name first, script name second - matching the message.
            raise IOError("Impossible de charger le fichier %s. Avez-vous bien appliqué le script %s avec l'option --saveindex ?" % (index_file, method))

    # Load the dictionary.
    dico_file = os.path.join(data_dir, corpus_name + '_wordids.txt')
    try:
        id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
    except Exception:
        raise IOError("Impossible de charger le fichier %s" % (dico_file))

    # Load the model for the requested method.  The ``method`` parameter is
    # used for the file name; the function must not depend on a global
    # ``args`` object.
    if method == 'tfidf':
        model_file = os.path.join(data_dir, corpus_name + '_tfidf_model')
        model = models.tfidfmodel.TfidfModel.load(model_file)
    elif method.startswith('lsi'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.lsimodel.LsiModel.load(model_file)
    elif method.startswith('lda'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.ldamodel.LdaModel.load(model_file)
    else:
        raise ValueError("Méthode inconnue : %s" % (method))

    tokens = model[id2word.doc2bow(utils.tokenize(content))]

    # Return the 5 closest articles.
    sims = index[tokens]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # float() makes NumPy scalars serialisable by json on Python 3.
    return json.dumps([{'id': utils.get_article_by_corpus_number(x[0], docid_file),
                        'score': round(float(x[1]), 2)}
                       for x in sims[:5]])
def word_freq(filenames, stopset):
    """Count word frequencies per labelled text and save them to disk.

    Every line of every input file is decoded from UTF-8, split by
    ``proc_line`` into ``(id, label, text)``, tokenised and stripped of the
    words in ``stopset``.  Texts labelled ``'1'`` are stored as positive,
    ``'-1'`` as negative; others are reported and skipped (their words still
    enter the word set).

    The word set is written to ``./Training/WordSet.txt`` and the raw
    frequencies to ``./Training/WordFreq_Orig.txt``.

    NOTE: lines are ``.decode()``-d after reading and ``.encode()``-d before
    writing to files opened in text mode, i.e. this relies on Python 2 byte
    strings.

    Args:
        filenames: paths of the input files.
        stopset: stopwords passed to ``del_stopwords``.

    Returns:
        ``(wordset, freqset_list)`` where ``freqset_list[0]`` holds the
        negative and ``freqset_list[1]`` the positive texts, each as
        ``[id, label, {word: count}]``.
    """
    wordset = set()           # every word seen
    freqset_list = [[], []]   # word frequencies of negative / positive texts

    for filename in filenames:
        # ``with`` closes the file even when a line fails to parse.
        with open(filename, 'r') as fr:
            for raw_line in fr:
                line = raw_line.decode("utf-8")
                doc_id, label, text = proc_line(line)
                token_list = del_stopwords(tokenize(text), stopset)

                wordfreq_dict = {}
                for token in token_list:
                    wordset.add(token)
                    wordfreq_dict[token] = wordfreq_dict.get(token, 0) + 1

                # Each text is recorded as a list of id, label and frequencies.
                doc = [doc_id, label, wordfreq_dict]
                if label == '1':
                    freqset_list[1].append(doc)
                elif label == '-1':
                    freqset_list[0].append(doc)
                else:
                    print('tag-unknown text')

    # Save the feature words.
    with open('./Training/WordSet.txt', 'w') as f:
        for word in wordset:
            string = word + '\n'
            f.write(string.encode("utf-8"))

    # Save the raw word frequencies (this file used to be left unclosed).
    with open('./Training/WordFreq_Orig.txt', 'w') as f:
        for docs in freqset_list:
            for doc_id, label, freq_list in docs:
                string = doc_id + '\t' + label + '\t'
                string += ''.join(word + ',' + str(freq_list[word]) + ';'
                                  for word in freq_list)
                string += '\n'
                f.write(string.encode('utf-8'))

    return wordset, freqset_list
def v4addr(self, iface):
    """Return the IPv4 addresses of ``iface`` as reported by ``ip addr``.

    The second token of every ``inet`` line is returned; ``/32`` is appended
    to addresses that carry no prefix length.
    """
    output = self.machine.run("ip addr show dev %s | grep '^ *inet '" % iface).strip()
    tokenized_lines = [tokenize(l) for l in splitlines(output)]
    addresses = []
    for toks in tokenized_lines:
        if len(toks) == 0:
            continue
        addr = toks[1].strip()
        addresses.append(addr if "/" in addr else addr + "/32")
    return addresses
def find_word_freq(li):
    """Count the normalised (case-preserving) tokens of the given elements.

    Returns a ``defaultdict(int)`` mapping each normalised token of the text
    of every element of ``li`` to its number of occurrences.
    """
    freq = defaultdict(int)
    for aff in li:
        for t in tokenize(text_in_element(aff), split_alphanum=split_alphanum):
            freq[normalize(t, lowercase=False)] += 1
    return freq
def dict_from_file(filename, match_case=True):
    """Index the lines of a dictionary file by their tokens.

    Each line of ``DICTS_DIR + filename`` (read as UTF-8) is normalised
    (lower-cased unless ``match_case``) and tokenised.  Every token maps to
    the list of ``(tokens_of_line, position_of_token)`` pairs in which it
    occurs.

    Returns:
        ``(index, match_case)`` where ``index`` is a ``defaultdict(list)``.
    """
    entries = defaultdict(list)
    path = DICTS_DIR + filename
    with codecs.open(path, 'rb', encoding='utf8') as f:
        for line in f:
            normalized = normalize(line, lowercase=(not match_case))
            tokens = tokenize(normalized, split_alphanum=split_alphanum)
            for nb, token in enumerate(tokens):
                entries[token].append((tokens, nb))
    return (entries, match_case)
def predict_answers(data, word2vec, N):
    """Predict the answer (A-D) whose word-vector sum best matches the question.

    Each text is reduced to the L2-normalised sum of ``word2vec[token]`` over
    its lower-cased tokens that are in ``word2vec`` and are not English
    stopwords.  The predicted answer has the largest dot product with the
    question vector.

    Args:
        data: table with ``question`` and ``answerA`` .. ``answerD`` columns,
            addressed as ``data[column][i]`` for ``i`` in
            ``range(data.shape[0])``.
        word2vec: mapping from lower-cased word to its vector.
        N: dimensionality of the word vectors.

    Returns:
        A list with the predicted letter per row.
    """
    # A set makes the per-token stopword test O(1).
    stop = set(stopwords.words('english'))

    def text_vector(text):
        # Normalised sum of the vectors of the usable tokens of ``text``.
        vec = np.zeros(N)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec += word2vec[word]
        norm = linalg.norm(vec)
        # Keep the zero vector for texts without usable tokens: a division
        # by zero would give NaN scores, which argmax would pick as the best.
        return vec / norm if norm > 0 else vec

    pred_answs = []
    for i in range(data.shape[0]):
        q_vec = text_vector(data['question'][i])
        answer_vecs = np.array([
            text_vector(data['answerA'][i]),
            text_vector(data['answerB'][i]),
            text_vector(data['answerC'][i]),
            text_vector(data['answerD'][i]),
        ])
        # Unit vectors: the dot product is the cosine similarity.
        idx = answer_vecs.dot(q_vec).argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
    return pred_answs
def build_vocab(docs, save_as):
    """Build the vocabulary of a corpus and pickle it.

    Args:
        docs: corpus location passed to ``utils.iterate_corpus``, which
            yields file paths.
        save_as: path of the pickle file to write.

    Returns:
        The vocabulary as a list of distinct tokens (in set order).
    """
    start = time.time()
    vocab = set()
    for path in utils.iterate_corpus(docs):
        with open(path, 'r') as f:
            # Stream the file line by line instead of loading all lines.
            for line in f:
                vocab.update(utils.tokenize(line))
    vocab = list(vocab)
    # ``with`` guarantees the pickle is flushed and the handle closed.
    with open(save_as, 'wb') as out:
        pkl.dump(vocab, out)
    print('Built vocabulary and saved it to "%s" in %s' % (save_as, utils.strtime(time.time() - start)), file=sys.stderr)
    return vocab
def predict(data, docs_per_q):
    """Answer every question by tf-idf overlap and dump the scores to CSV.

    The ``docs_per_q`` most important documents are retrieved for each
    question and every answer is scored by summing ``tf * idf`` of its words
    over those documents.  The four scores per row are written to
    ``features_ck12.csv`` (columns ``id, fA, fB, fC, fD``).

    Args:
        data: DataFrame with ``id``, ``question`` and ``answerA`` ..
            ``answerD`` columns.
        docs_per_q: passed to ``utils.get_docs_importance_for_question``.

    Returns:
        A list with the predicted letter ('A'-'D') per row.
    """
    # index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    res = []
    f = []
    for index, row in data.iterrows():
        # get answers words
        answer_words = [set(utils.tokenize(row[column]))
                        for column in ('answerA', 'answerB', 'answerC', 'answerD')]

        ranked = list(zip(*utils.get_docs_importance_for_question(
            row['question'], docs_tf, words_idf, docs_per_q)))
        # list(zip(...)) is subscriptable on Python 3 as well; an empty
        # ranking leaves all scores at 0 instead of raising IndexError.
        top_docs = ranked[0] if ranked else ()

        scores = [
            sum(1. * docs_tf[d][w] * words_idf[w]
                for d in top_docs for w in words if w in docs_tf[d])
            for words in answer_words
        ]
        res.append(['A', 'B', 'C', 'D'][np.argmax(scores)])
        f.append(scores)

    # reshape keeps the (rows, 4) layout even when ``data`` has no rows.
    features = np.array(f).reshape(-1, 4)
    pd.DataFrame({'id': list(data['id']),
                  'fA': features[:, 0],
                  'fB': features[:, 1],
                  'fC': features[:, 2],
                  'fD': features[:, 3]})[['id', 'fA', 'fB', 'fC', 'fD']].to_csv('features_ck12.csv', index=False)
    return res
def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
    """Compute document frequencies and select the feature vocabulary.

    Args:
        data: iterable of ``(label, text)`` pairs.
        minimum_vocab_fraction: not used by this method.
        include_ngrams: passed to ``utils.tokenize``.

    Sets ``doc_freq`` (documents per word), ``doc_count``,
    ``min_vocab_freq`` / ``max_vocab_freq`` and ``features`` (the sorted
    words accepted by ``self._is_valid_feature``).
    """
    self.doc_freq = FreqDist()
    # Pre-set so that empty ``data`` yields a count of 0, not a NameError.
    count = 0
    for count, (label, text) in enumerate(data, start=1):
        # set(): a word counts once per document.
        for word in set(utils.tokenize(text, include_ngrams, limit_ngrams=True)):
            self.doc_freq.inc(word)
    self.doc_count = count
    self.min_vocab_freq = 1
    self.max_vocab_freq = .95 * self.doc_count
    # Same output as the Python 2 print statement, valid on Python 3 too.
    print('Min/max vocabulary frequency: %s %s' % (self.min_vocab_freq, self.max_vocab_freq))
    self.features = sorted(filter(self._is_valid_feature, self.doc_freq))
def load_mol_data(calc_set, opt_set, struct_set, prop_set=None):
    '''
    Load data from data sets and return lists of structure names, full paths
    to the geometry data, the properties, and the meta data.

    One file ``<DATA_BASE_DIR>/mol_data/<opt>/<struct>/<calc>.txt`` is read
    per combination of the three sets.  Each non-empty line holds a
    structure name followed by its numeric properties.

    Returns ``(names, datasets, geom_paths, prop_out, meta, lengths)``:
    ``datasets`` holds ``(opt, calc, struct)`` per structure, ``prop_out``
    holds ``(name, unit, values)`` per property, ``meta`` holds the one-hot
    encoded data set membership plus a bias term, and ``lengths`` the number
    of ARYL tokens in each name.  ``prop_set`` is only printed.
    '''
    print("Dataset options used")
    print("\tCalculation methods: %s" % (calc_set,))
    print("\tOptimization methods: %s" % (opt_set,))
    print("\tStructure sets: %s" % (struct_set,))
    print("\tProperties: %s" % (prop_set,))

    names = []
    datasets = []
    geom_paths = []
    properties = []
    meta = []
    lengths = []

    for opt_idx, base_path in enumerate(opt_set):
        for calc_idx, file_path in enumerate(calc_set):
            for struct_idx, atom_set in enumerate(struct_set):
                path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, atom_set, file_path)
                with open(path + ".txt", 'r') as f:
                    for line in f:
                        temp = line.split()
                        if not temp:
                            # Skip blank lines instead of failing on temp[0].
                            continue
                        name, props = temp[0], temp[1:]

                        names.append(name)
                        datasets.append((base_path, file_path, atom_set))

                        geom_path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, 'geoms', 'out', name + '.out')
                        geom_paths.append(geom_path)

                        properties.append([float(x) for x in props])

                        # One-hot position of the optimization method.  The
                        # index must be the one that iterates opt_set (it
                        # used to be the calc_set index, and vice versa).
                        base_part = [opt_idx == k for k in range(len(opt_set))]
                        # One-hot position of the calculation method.
                        method_part = [calc_idx == k for k in range(len(calc_set))]
                        # One-hot position of the structure set.
                        atom_part = [struct_idx == k for k in range(len(struct_set))]
                        # Add bias feature
                        bias = [1]
                        meta.append(base_part + method_part + atom_part + bias)

                        tokens = tokenize(name, explicit_flips=True)
                        aryl_count = sum([1 for x in tokens if x in ARYL])
                        lengths.append(aryl_count)

    # "HOMO" pairs with "LUMO"; the label had been mangled to "H**O".
    prop_desc = (("HOMO", "eV"), ("LUMO", "eV"), ("Excitation", "eV"))
    prop_vals = zip(*properties)
    prop_out = [(x, y, z) for ((x, y), z) in zip(prop_desc, prop_vals)]
    return names, datasets, geom_paths, prop_out, meta, lengths
def quagga_info(self):
    """Query Quagga on the machine for its version and OSPF neighbours.

    Returns:
        A dict with ``daemon`` ("Quagga"), ``version`` (last token of the
        first line of ``zebra --version``) and ``ospf.neighbours``, a list
        of dicts with ``routerid``, ``v4addr`` and ``ifname``.
    """
    version_line = splitlines(self.machine.run("zebra --version"))[0]
    info = {
        "daemon": "Quagga",
        "version": tokenize(version_line)[-1],
        "ospf": {}
    }
    log.info("[%s] getting OSPF neighbours" % self.hostname())
    output = self.machine.run("echo show ip ospf neighbor | vtysh | grep '^[1-9]'")
    tokenized_lines = [tokenize(l) for l in splitlines(output)]
    # Blank lines produce no tokens and are skipped; the interface column
    # keeps only the part before the first ":".
    info["ospf"]["neighbours"] = [
        {
            "routerid": toks[0],
            "v4addr": toks[4],
            "ifname": toks[5].split(":")[0]
        }
        for toks in tokenized_lines
        if len(toks) != 0
    ]
    return info
def build_index(docs, vocab, save_as):
    """Build a term-frequency / document-frequency index and pickle it.

    Args:
        docs: corpus location passed to ``utils.iterate_corpus``, which
            yields file paths.
        vocab: sequence of words; a word's position is its id.
        save_as: path of the pickle file to write.

    Returns:
        ``(tf, df, files)``: ``tf`` maps each word id to a list of
        ``(weight, document_number)`` sorted by decreasing weight, ``df``
        counts the documents containing each word id, and ``files`` lists
        the document paths in document-number order.

    Raises:
        KeyError: if a document contains a token that is not in ``vocab``.
    """
    start = time.time()
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    tf = {idx: [] for idx in range(len(vocab))}
    df = Counter()
    # Walk the corpus once; the list provides both the count and the paths.
    files = list(utils.iterate_corpus(docs))
    n_docs = len(files)
    for i, path in enumerate(files):
        print('%d/%d %s' % (i+1, n_docs, utils.strtime(time.time() - start)), file=sys.stderr, end='\r')
        with open(path, 'r') as f:
            text = f.read()
        # Tokenise once: the keys of word_counts are the distinct word ids.
        word_counts = Counter(word2idx[w] for w in utils.tokenize(text))
        df.update(word_counts.keys())
        n_words = utils.counter_sum(word_counts)
        if not word_counts:
            continue
        # log(1) == 0 for single-token documents; fall back to 1.0 so the
        # weight stays finite instead of raising ZeroDivisionError.
        norm = math.log(n_words) or 1.0
        for word, count in word_counts.items():
            tf[word].append((count / norm, i))
    for postings in tf.values():
        postings.sort(key=lambda x: x[0], reverse=True)
    tfidf = tf, df, files
    # ``with`` guarantees the pickle is flushed and the handle closed.
    with open(save_as, 'wb') as out:
        pkl.dump(tfidf, out)
    print('Processed %d documents in %s' % (n_docs, utils.strtime(time.time() - start)), file=sys.stderr)
    return tfidf
def tag(self, text):
    """
    Tags the given text.

    :param text: a string or unicode object. Strings assumed to be utf-8
    :returns: one entry per sentence, pairing each token of the sentence
        with its tag via ``zip``.
    """
    sentences = utils.tokenize(text, clean=False)
    return [zip(sent, self.tag_tokens(sent)) for sent in sentences]
def clusterSentence(self, sentence):
    """
    Clusters the given sentence via ``self.clusterize`` and saves the
    clusters.

    sentence - sentence to be clustered
    """
    # Lower-case, tokenise, lemmatise and drop stop words, then count terms.
    terms = utils.filterStopWords(utils.lemmatize(utils.tokenize(sentence.lower())))
    term_counts = dict(Counter(terms))
    self.clusterize(term_counts, sentence)
    # Every time a new sentence is clusterized, save latest clusters
    self.saveClusters()
def predict(data, docs_per_q):
    """Answer every question by tf-idf overlap with its top documents.

    The ``docs_per_q`` most important documents are retrieved for each
    question and every answer is scored by summing ``tf * idf`` of its words
    over those documents.

    Args:
        data: DataFrame with ``question`` and ``answerA`` .. ``answerD``
            columns.
        docs_per_q: passed to ``utils.get_docs_importance_for_question``.

    Returns:
        ``(res, doc_score)``: the predicted letter per row, and the score
        rows preceded by the header ``["A", "B", "C", "D"]``.
    """
    # index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    res = []
    doc_score = [["A", "B", "C", "D"]]
    for index, row in data.iterrows():
        # get answers words
        answer_words = [
            set(utils.tokenize(row['answerA'])),
            set(utils.tokenize(row['answerB'])),
            set(utils.tokenize(row['answerC'])),
            set(utils.tokenize(row['answerD'])),
        ]
        ranked = utils.get_docs_importance_for_question(row['question'], docs_tf, words_idf, docs_per_q)
        top_docs = list(zip(*ranked))[0]

        totals = [0, 0, 0, 0]
        for d in top_docs:
            for slot, words in enumerate(answer_words):
                for w in words:
                    if w in docs_tf[d]:
                        totals[slot] += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A', 'B', 'C', 'D'][np.argmax(totals)])
        doc_score.append(totals)
    return res, doc_score
def find_similar_articles(corpus_name, method, id=None, content=None):
    """Return the 5 corpus articles closest to an article id or to a text.

    Args:
        corpus_name: prefix of the corpus, index, docid, dictionary and
            model file names.
        method: ``tfidf``, or a name starting with ``lsi`` or ``lda``.
        id: id of the article to compare with; takes precedence over
            ``content``.
        content: text to compare with, used when ``id`` is None.

    Returns:
        A list of ``(article, similarity)`` pairs, best match first.

    Raises:
        IOError: if the corpus, index or dictionary file cannot be loaded.
        ValueError: if ``content`` is used with an unsupported ``method``.
        Exception: if neither ``id`` nor ``content`` is given.
    """
    corpus_file = corpus_name + '_' + method + '.mm'
    index_file = corpus_name + '_' + method + '_index'
    docid_file = corpus_name + '_docid.txt'

    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s' % (corpus_file))

    try:
        index = similarities.docsim.Similarity.load(index_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s' % (index_file))

    if id is not None:
        corpus_id = utils.get_article_by_id(id, docid_file)
        tokens = corpus[corpus_id]
    elif content is not None:
        dico_file = corpus_name + '_wordids.txt'
        try:
            id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
        except Exception:
            raise IOError("Impossible de charger le fichier %s" % (dico_file))

        # The model file name is derived from the ``method`` parameter; the
        # function must not depend on a global ``args`` object.
        if method == 'tfidf':
            model_file = corpus_name + '_tfidf_model'
            model = models.tfidfmodel.TfidfModel.load(model_file)
        elif method.startswith('lsi'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.lsimodel.LsiModel.load(model_file)
        elif method.startswith('lda'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.ldamodel.LdaModel.load(model_file)
        else:
            raise ValueError("Méthode inconnue : %s" % (method))

        tokens = model[id2word.doc2bow(utils.tokenize(content))]
    else:
        raise Exception("Il faut fournir un id ou un contenu")

    sims = index[tokens]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return [(utils.get_article_by_corpus_number(x[0], docid_file), x[1]) for x in sims[:5]]
def tag(self, text, no_repeats=False):
    """
    Runs the SRL process on the given text.

    :param text: unicode or str encoded in utf-8.
    :param no_repeats: whether to prevent repeated argument labels
    :returns: a list with the result of ``self.tag_tokens`` for every
        sentence of the text.
    """
    # NOTE(review): no_repeats is accepted but not forwarded to tag_tokens
    # in this method - confirm whether that is intended.
    sentences = utils.tokenize(text, clean=False)
    return [self.tag_tokens(sent) for sent in sentences]
from collections import Counter import nltk import numpy as np import pandas # noinspection PyUnresolvedReferences from utils import tokenize # importing corpus as resume resume_file = open('../assets/resume.txt', 'r') resume = resume_file.read().lower() resume_file.close() # tokenizing the resume tokens = tokenize(resume) # dividing corpus into 6 documents k = len(tokens) // 6 documents = [] for i in range(5): documents.append(tokens[i * k:(i + 1) * k]) documents.append(tokens[5 * k:]) # calculating most common 5 tokens from each document and storing frequency tables for each document most_common = set() document_frequencies = [] for document in documents: frequencies = Counter(document) document_frequencies.append(frequencies) for word, frequency in frequencies.most_common(5):
def read_training_dataset(self, input_path):
    """Load a JSON training set into the sentence, intent and slot buffers.

    The file maps ``"0"``, ``"1"``, ... to entries with ``text``, ``intent``
    and ``slots`` (slot name -> words of the text).  For every entry the
    token ids (followed by ``</s>``), the intent id and one slot id per
    token id are appended to ``stcs``, ``intents`` and ``slots``; the raw
    tokens go to ``stcs_literals`` and the id-sequence length to
    ``lengths``.  Finally ``X`` / ``Y`` are bound to ``stcs`` / ``slots``.

    Args:
        input_path: path of the JSON file.

    Raises:
        IndexError: if a slot word is neither a token of the text nor a
            substring of one.
    """
    with open(input_path) as f:
        data = json.load(f)
    self.no_samples = len(data)

    # for padding.
    self.words_converter.T2id('<PAD>')
    self.words_converter.T2id('<SOS>')
    self.slots_converter.T2id('<PAD>')
    self.slots_converter.T2id('<SOS>')
    self.slots_converter.T2id('-')

    for sample_no in tqdm(range(self.no_samples)):
        entry = data[str(sample_no)]
        tokens = tokenize(normalizeString(entry["text"]))
        self.stcs_literals.append(tokens)
        tokens_id = [self.words_converter.T2id(token) for token in tokens]
        tokens_id.append(self.words_converter.T2id('</s>'))
        self.stcs.append(tokens_id)
        self.lengths.append(len(tokens_id))

        self.intents.append(self.intent_converter.T2id(entry["intent"]))

        # One slot id per entry of tokens_id: '-' everywhere, <SOS> first.
        slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
        slots_id[0] = self.slots_converter.T2id('<SOS>')
        no_slots_in_stc = 0
        for slot, target_words in entry["slots"].items():
            for word in tokenize(normalizeString(target_words)):
                no_slots_in_stc += 1
                try:
                    idx = tokens.index(word)
                except ValueError:
                    # No exact token: use the first token containing the
                    # word.  The comprehension variable no longer reuses the
                    # name of the sample index.
                    idx = [pos for pos, tok in enumerate(tokens) if word in tok][0]
                # +1 account for <SOS>
                slots_id[idx + 1] = self.slots_converter.T2id(slot)

        # keep count of no slots (presumably T2id tracks how often a label
        # is requested - confirm against the converter)
        for _ in range(len(tokens_id) - no_slots_in_stc):
            self.slots_converter.T2id('-')
        self.slots.append(slots_id)

    self.X = self.stcs
    self.Y = self.slots
# Builds a word cloud image from all disaster messages stored in the database.
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine
from wordcloud import WordCloud

from utils import tokenize

print('Loading data...')
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('disaster_message_category', engine)

print('Tokenizing words...')
# All messages are joined into one string, tokenised, and joined again so
# that the word cloud receives the tokenised text.
word_string = " ".join(df['message'])
word_string_final = " ".join(tokenize(word_string))

print('Creating wordcloud...')
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=300,
).generate(word_string_final)

print('Generating png image...')
# Render the cloud without axes or padding and save it for the web app.
plt.figure(figsize=(8, 4), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('app/static/images/wordcloud.png', dpi=105)
def qe(self, sourceLang, targetLang, sourceText, targetText):
    """
    Performs translation quality estimation on sourceText to targetText
    using QuEst++ and fast_align.

    The texts are sentence-aligned and tokenised, word alignments and
    word-level features are computed in ``data/tmp``, and the model
    configured for the language pair predicts one score per feature line.
    All temporary files are removed on success.

    Returns ``{'status': 'OK', 'qe': [score, ...]}``.

    It's ok to raise Exceptions here. They are handled upstream.
    """
    if [sourceLang, targetLang] not in self.supportedPairs:
        raise Exception("{}-{} language pair not supported".format(
            sourceLang, targetLang))

    os.makedirs('data/tmp', exist_ok=True)

    # Sanitize input
    aligned = hunalign(sourceText, targetText)
    sourceText = [tokenize(x[0], sourceLang, False) for x in aligned]
    # The target side is tokenised with the TARGET language (it used to be
    # tokenised with sourceLang).
    targetText = [tokenize(x[1], targetLang, False) for x in aligned]

    sourceTextPlain = '\n'.join([' '.join(x) for x in sourceText])
    targetTextPlain = '\n'.join([' '.join(x) for x in targetText])

    alignments = fast_align.FastAlign().align(
        sourceLang, targetLang, sourceTextPlain, targetTextPlain)['alignment']

    with open('data/tmp/alignments', 'w') as fileAlignments:
        fileAlignments.write(alignments)
    with open('data/tmp/source', 'w') as fileSource:
        fileSource.write(sourceTextPlain)
    with open('data/tmp/target', 'w') as fileTarget:
        fileTarget.write(targetTextPlain)

    with DirCrawler('qe/questplusplus'):
        print("Extracting features")
        # NOTE(review): "-lang english spanish" is fixed regardless of the
        # requested language pair - confirm this is intended.
        (_output, _error) = bash(
            " java -cp QuEst++.jar:lib/* shef.mt.WordLevelFeatureExtractor"
            " -lang english spanish"
            " -input ../../data/tmp/source ../../data/tmp/target"
            " -alignments ../../data/tmp/alignments"
            " -config ../questplusplus-config/config.word-level.properties "
        )

        outputFile = 'output/test/output.txt'
        if not os.path.isfile(outputFile):
            raise Exception('Server Processing Error')

        with open(outputFile, 'r') as outputFileR:
            features = outputFileR.readlines()

    os.remove('data/tmp/alignments')
    os.remove('data/tmp/source')
    os.remove('data/tmp/target')

    # Every feature line is a tab-separated list of name=value pairs; only
    # the values are kept.
    features = [[
        x.split('=')[1]
        for x in line.rstrip('\n').rstrip('\t').split('\t')
    ] for line in features]

    with open('data/tmp/features', 'w') as fileFeatures:
        fileFeatures.write('\n'.join(['\t'.join(x) for x in features]))
    # One label "1" per feature line.
    with open('data/tmp/labels', 'w') as fileLabels:
        fileLabels.write('\n'.join(['1'] * len(features)))

    with DirCrawler('qe/questplusplus'):
        print("Removing output directory structure for feature extractor")
        os.remove(outputFile)
        os.rmdir('output/test')
        os.rmdir('output')

        print("Machine Learning")
        (_output, _error) = bash(
            f" python learning/src/learn_model.py ../questplusplus-config/svr_{sourceLang}_{targetLang}.cfg "
        )

        # The score is the second tab-separated column of every line.
        with open('predicted.csv', 'r') as predictedFile:
            output = [
                float(x.rstrip('\n').split('\t')[1])
                for x in predictedFile.readlines()
            ]
        os.remove('predicted.csv')

    os.remove('data/tmp/features')
    os.remove('data/tmp/labels')
    os.rmdir('data/tmp')

    return {'status': 'OK', 'qe': output}
# Evaluation setup 'sample': '如', 'max_sample_length': 50, 'sample_range': 2 # how many words in the dictionary to be considered when sampling } # -------------------------Data feeding preparation--------------- # Read and tokenize data texts = [ './data/qts_tab.txt', './data/qsc_tab.txt', './data/qtais_tab.txt', './data/qss_tab.txt' ] # max and min length of poem sequence maxlen = 100 minlen = 7 poems = [] # for t in texts: # poems.extend(utils.read_poem(t)) for t in texts: poems.extend(utils.read_regular_poem(t)) poems = utils.chop_poems(poems, maxlen, minlen) data, count, dictionary, reverse_dictionary = utils.tokenize( poems, params['vocabulary_size']) rnnlm = language_model.RNNLM(params, data, count, dictionary, reverse_dictionary) rnnlm.train(sample_interval=100, save_interval=5000, logger=None) # rnnlm.sample(sample_len=100, checkpoint_dir='./tmp/rnndata/')
# Load the candidate pairs and keep confirmed typos at edit distance 1 that
# look or sound like the registered name.
data = pd.read_csv('../data/data.csv', skiprows=0)
filtered = data[[
    'REGI', 'TYPO', 'VISUAL_SIMILARITY', 'SOUNDEX_DISTANCE'
]][(data['EDIT_DISTANCE'] == 1)
   & (data['IS_TYPO'] == 1)
   & ((data['VISUAL_SIMILARITY'] >= 0.8) | (data['SOUNDEX_DISTANCE'] <= 1))]

# Keep only names that contain exactly two dots.
filtered = filtered[filtered.TYPO.map(lambda x: x.count('.')) == 2]
filtered = filtered[filtered.REGI.map(lambda x: x.count('.')) == 2]
filtered.reset_index(drop=True, inplace=True)

# The part before the first dot of every registered / typo name.
reg_list = []
typo_list = []
# NOTE(review): ``t`` is not defined in this fragment; the loop indexes
# ``filtered``, so ``filtered.shape[0]`` may have been intended - confirm.
for i in range(t.shape[0]):
    reg_list.append(filtered['REGI'][i].split('.')[0])
    typo_list.append(filtered['TYPO'][i].split('.')[0])

in_list, out_list = utils.tokenize(reg_list, typo_list, token_size)

# Character vocabularies of the encoder (input) and decoder (output) side.
in_vocab = set()
for name in in_list:
    in_vocab.update(char for char in name)
out_vocab = set()
for name in out_list:
    out_vocab.update(char for char in name)
vocab = in_vocab.union(out_vocab)

num_encoder_tokens = len(in_vocab)
num_decoder_tokens = len(out_vocab)
max_encoder_seq_length = max(len(name) for name in in_list)
max_decoder_seq_length = max(len(name) for name in out_list)
def html_to_json(url):
    """Scrape the tables of the page at *url* into a nested dict.

    ``tokenize(url)`` yields a category that selects the schema file
    ``schema/<category>.json``.  The schema's keys are the table titles to
    look for; each value configures how rows below that title are matched
    (``'ignore image'``, ``'column index'``, ``'strict match'``,
    ``'regex match'`` and ``'terminate on mismatch'``).

    Returns a dict mapping every table title found to a dict of matched row
    key -> list of ``{'text': ..., 'link': ...}`` cells.
    """
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)
    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Title of the table currently being read; None while outside any table.
    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue
        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                # A cell whose text is a schema key starts a new table.
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            # Keep the href of the last anchor in the cell ('' if none).
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})
        if is_title_row:
            continue
        if not row_content or not table_title:
            continue
        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']
        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            # Exact titles are stored once, under their own name.
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    # NOTE(review): a plain substring hit is filed under
                    # 'Certified Votes' -- confirm this is intended.
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    # A regex hit is keyed by the id parsed from the link of
                    # the cell following the title cell.
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            # Leave the current table until the next title row is seen.
            table_title = None
            ignore_image = True
    return result
args = parser.parse_args() logging.basicConfig(filename=args.log_filepath, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) if args.command == "construct_vocab": dictionary = corpora.Dictionary() count = 0 line = sys.stdin.readline() while line: tokens = tokenize(line) dictionary.add_documents([tokens], prune_at=None) count += 1 if count % 100000 == 0: print_err("line %d %d" % (count, len(dictionary))) line = sys.stdin.readline() dictionary.save(args.vocabulary_filename) dictionary.save_as_text(args.vocabulary_filename + ".txt") elif args.command == "construct_corpus": # use glob to recurse under data/TXTs directory
def convert_filename(filename):
    """Return the tokenized *filename* wrapped in the file start/end markers.

    Path separators (``/``) are replaced with dots before tokenizing; the
    result is a single newline-terminated line.
    """
    dotted_name = filename.replace('/', '.')
    tokens = tokenize(dotted_name)
    return '{} {} {}\n'.format(constants.FILE_START, tokens, constants.FILE_END)
from nltk.corpus import semcor
import utils

# Evaluate utils.find_synset on sentences of one SemCor tag file: for every
# sentence, the last "NN" word (other than "anyone") is disambiguated and the
# result is compared with the gold `lexsn` annotation.
count = 0
num_sentences = 0

# Parse the tag file once instead of once per sentence.
sentences = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')

# Look at no more than the first 100 sentences (fewer if the file is shorter).
for sent in sentences[:100]:
    sentence = ""
    name = ""
    # Iterating an Element yields its children; Element.getchildren() was
    # removed in Python 3.9.
    for wordform in sent:
        sentence += wordform.text + " "
        if wordform.get('pos') == "NN" and wordform.text != "anyone":
            name = wordform.text
            sense_key = wordform.get('lexsn')
    context = utils.tokenize(sentence)
    # Compare by value: `is not ""` tests object identity, not string content.
    if name != "":
        best_sense = utils.find_synset(context, name)
        num_sentences += 1
        # The last 9 characters of the lemma key are compared with `lexsn`.
        if sense_key == best_sense.lemmas()[0].key()[-9:]:
            count += 1
    # Stop after 50 evaluated sentences.
    if num_sentences == 50:
        break

if num_sentences:
    print("accuracy = " + str(count*100/num_sentences) + " %")
else:
    # Avoid a ZeroDivisionError when no sentence contained a usable noun.
    print("accuracy = n/a (no sentences evaluated)")
# http://arxiv.org/abs/1410.4615 # "Sequence to Sequence Learning with Neural Networks" # https://arxiv.org/abs/1409.3215 reverse = True data_path = './data' train_books = [ 'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt', 'war_and_peace.txt', 'botanical_2.txt', 'botanical_terms.txt' ] val_books = ['wonderland.txt', 'botanical_1.txt'] if __name__ == '__main__': # Prepare training data. text = read_text(data_path, train_books) vocab = tokenize(text) vocab = list(filter(None, set(vocab))) # `maxlen` is the length of the longest word in the vocabulary # plus two SOS and EOS characters. maxlen = max([len(token) for token in vocab]) + 2 train_encoder, train_decoder, train_target = transform( vocab, maxlen, error_rate=error_rate, shuffle=False) print(train_encoder[:10]) print(train_decoder[:10]) print(train_target[:10]) input_chars = set(' '.join(train_encoder)) target_chars = set(' '.join(train_decoder)) nb_input_chars = len(input_chars) nb_target_chars = len(target_chars)
# .replace('ё', 'ё') \ # .strip() line = utils.norm_text2(re2.sub('', line)) if line: lines.append(' '.join(line.split())) if len(lines) >= _utils.MIN_TEXT_LINES: texts_total += 1 if link_no > start_link_idx: with open(page_fn, 'wt', encoding='utf-8') as f: print(link, file=f) f.write(page) with open(text_fn, 'wt', encoding='utf-8') as f: print(link, file=f) print(header, file=f) f.write('\n'.join(lines)) print('\r{} (of {})'.format(texts_total, min(utils.TEXTS_FOR_SOURCE, num_links)), end='') need_enter = True #exit() if need_enter: print() '''=========================================================================== Chunks creation ===========================================================================''' _utils.make_chunks(num_links) '''=========================================================================== Tokenization ===========================================================================''' utils.tokenize(num_links, isdialog=False)
def generate_repo_dataset(fullname, branch, sha_list, repo_dir, writer):
    """Write one dataset record for every usable commit in *sha_list*.

    For each commit, the first sentence of its message is cleaned and
    tokenized, and the commit's line diff and word diff are cleaned.  A commit
    is skipped when its message is empty, mentions "revert", starts with
    "merge ", or is longer than ``constants.TARGET_SEQ_LEN_MAX`` words; when
    its line diff is missing, shares no overlap with the message
    (``overlap_two_seq``) or is longer than ``constants.SOURCE_SEQ_LEN_MAX``
    words; or when its word diff is missing or empty after cleaning.

    Args:
        fullname: Repository name; used in the record index and progress line.
        branch: Unused by this function.
        sha_list: Commit SHAs to process.
        repo_dir: Path of the local checkout.
        writer: Object whose ``write`` receives the record fields.

    Returns:
        The number of records written.
    """
    repo = Repo(repo_dir)
    # The *_skip counters record why commits were dropped; they are not
    # reported anywhere in this function.
    total_cnt, current_cnt, msg_skip, diff_skip, word_skip = 0, 0, 0, 0, 0
    for sha in sha_list:
        commit = repo.commit(sha)
        total_cnt += 1

        # --- commit message: keep only the first sentence ---
        sentences = split_sentence(commit.message)
        if not sentences:
            continue
        commit_msg = sentences[0].strip()
        commit_msg_lower = commit_msg.lower()
        if 'revert' in commit_msg_lower or commit_msg_lower.startswith('merge '):
            msg_skip += 1
            continue
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        origin_commit_msg = commit_msg
        if not commit_msg:
            msg_skip += 1
            continue
        commit_msg = tokenize(commit_msg)
        commit_msg = remove_last_special_char(commit_msg.strip())
        commit_msg = remove_no_english_str(commit_msg)
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        commit_msg = commit_msg.strip()
        if not commit_msg:
            msg_skip += 1
            continue
        commit_words = commit_msg.split()
        if len(commit_words) > constants.TARGET_SEQ_LEN_MAX:
            msg_skip += 1
            continue

        # --- line-level diff ---
        line_diff = get_line_diff(repo_dir, sha)
        if not line_diff:
            diff_skip += 1
            continue
        origin_line_diff = line_diff
        line_diff = remove_no_english_str(line_diff)
        line_diff = remove_redundant_white_space(line_diff.strip())
        line_diff_words = line_diff.split()
        if not overlap_two_seq(line_diff_words, commit_words):
            diff_skip += 1
            continue
        if len(line_diff_words) > constants.SOURCE_SEQ_LEN_MAX:
            diff_skip += 1
            continue

        # --- word-level diff ---
        word_diff = get_word_diff(repo_dir, sha)
        if not word_diff:
            word_skip += 1
            continue
        origin_word_diff = word_diff
        word_diff = remove_no_english_str(word_diff)
        word_diff = remove_redundant_white_space(word_diff.strip())
        if not word_diff:
            word_skip += 1
            continue

        index = f'{fullname} {sha}'
        writer.write(index, origin_commit_msg, commit_msg,
                     origin_line_diff, line_diff, origin_word_diff, word_diff)
        current_cnt += 1

    print(f'{fullname}: {current_cnt}/{total_cnt}')
    return current_cnt
def search(*arguments):
    """Run every query of a query file against the on-disk inverted index.

    Args:
        *arguments: ``arguments[0]`` is the path of the query file.  Each
            non-blank line has the form ``<numResults>, <query>``.  A query
            containing one of the prefixes ``t: b: i: c: r: l:`` is run as a
            field query; any other query is run as a plain query.

    Side effects:
        Writes, for each query, the titles of the top ``numResults`` documents
        followed by a timing line (total seconds, seconds per requested
        result) to ``./query_op.txt``.
    """
    key_words = ('t:', 'b:', 'i:', 'c:', 'r:', 'l:')

    print("Loading Files")
    # Every file is opened through a context manager so that the handles are
    # released even if a query raises.
    with open("./query_op.txt", 'w') as outfile:
        with open(arguments[0], 'r') as f:
            queries = f.readlines()
        with open("./inverted_index/titleOffset.txt", 'r') as f:
            titleOffSet = [int(line.strip()) for line in f]
        offset = []
        with open("./inverted_index/offset.txt", 'r') as f:
            for line in f:
                try:
                    offset.append(int(line.strip()))
                except ValueError:
                    # Ignore lines that do not hold an integer offset.
                    continue
        with open("./inverted_index/fileNumbers.txt", 'r') as f:
            nFiles = int(f.read().strip())

        with open("./inverted_index/vocab.txt", 'r') as vocabFile, \
                open("./inverted_index/title.txt", 'r') as titleFile:
            print("Starting Queries")
            for query in queries:
                startTime = time.time()
                query = query.strip().lower()
                if not query:
                    # Blank lines carry no query.
                    continue
                # Split on the first comma only, so the query text itself may
                # contain commas.
                numResults, query = query.split(",", 1)
                query = query.strip()
                numResults = int(numResults)

                if any(w in query for w in key_words):
                    # Field query: the split keeps the prefixes (capturing
                    # groups), so after dropping None/empty pieces the list
                    # alternates prefix, text, prefix, text, ...
                    q = re.split("(t:)|(b:)|(i:)|(c:)|(r:)|(l:)", query)
                    q = [part.strip() for part in q if part]
                    queryDict = defaultdict(list)
                    for idx in range(0, len(q), 2):
                        data = tokenize(q[idx + 1].lower())
                        data = [w for w in data if w not in stopWords]
                        data = stemmer.stemWords(data)
                        queryDict[q[idx].split(":")[0]].extend(data)
                    results, docFreq = fieldQuery(queryDict, vocabFile, offset)
                else:
                    q = tokenize(query)
                    q = [w for w in q if w not in stopWords]
                    q = stemmer.stemWords(q)
                    t = simpleQuery(q, vocabFile, offset)
                    results, docFreq = t[0], t[1]
                results = rank(results, docFreq, nFiles)

                if len(results) > 0:
                    # Highest score first, truncated to the requested count.
                    results = sorted(results, key=results.get, reverse=True)
                    results = results[:numResults]
                for key in results:
                    title, _ = fileBinarySearch(
                        0, len(titleOffSet), titleOffSet, key, titleFile, 'int')
                    print(','.join([key] + [' '.join(title)]), file=outfile)

                elapsed = time.time() - startTime
                # Guard the per-result average against a request for 0 results.
                per_result = elapsed / numResults if numResults else 0.0
                print("{0}, {1}".format(elapsed, per_result), file=outfile)
                print('\n', file=outfile)
text = None break if not res: if not SILENT: if not text: print('no text') #if nop: # exit() else: print('text beyond limits:') print(text) continue texts_total += 1 with open(text_fn, 'wt', encoding='utf-8') as f: print(link, file=f) f.write(text) print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE), end='') need_enter = True #exit() if need_enter: print() '''=========================================================================== Chunks creation ===========================================================================''' _utils.make_chunks(utils.TEXTS_FOR_SOURCE) '''=========================================================================== Tokenization ===========================================================================''' utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)
model.cuda() gen_text = generation(embedding, model, state, options.n, options.primer) print(gen_text) else: lr = model_settings['learning_rate'] layers = model_settings['layers'] batch_size = model_settings['batch_size'] rnn_size = model_settings['rnn_size'] embed_size = model_settings['embed_size'] seq_length = model_settings['seq_length'] dropout = model_settings['dropout'] data_size = 256 # ??? train_x = utils.tokenize(options.train_data) train_x = utils.batchify(train_x, batch_size) num_batches = train_x.size(0) // seq_length if len(options.load_model) > 0: checkpoint = torch.load(options.load_model) embedding = checkpoint['embed'] model = checkpoint['rnn'] else: embedding = nn.Embedding(256, embed_size) model = Stacked_mLSTM(mLSTM, layers, embed_size, rnn_size, data_size, dropout) loss_fn = nn.CrossEntropyLoss() embed_optimizer = optim.Adam(embedding.parameters(), lr=lr) model_optimizer = optim.Adam(model.parameters(), lr=lr)
logger = utils.get_logger() logger.info('Reading model') sess = tf.InteractiveSession() model = multimlp.MultiFeedForward.load(args.load, sess) word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab, generate=False, load_extra_from=args.load) embeddings = utils.normalize_embeddings(embeddings) model.initialize_embeddings(sess, embeddings) number_to_label = {v: k for (k, v) in utils.label_map.items()} while True: sent1 = raw_input('Type sentence 1: ') sent2 = raw_input('Type sentence 2: ') tokens1 = utils.tokenize(sent1) tokens2 = utils.tokenize(sent2) vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1) vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2, prepend=word_dict[utils.GO]) feeds = {model.sentence1: vector1, model.sentence2: vector2, model.sentence1_size: [len(tokens1)], model.sentence2_size: [len(tokens2)+1], model.dropout_keep: 1.0} answer = sess.run(model.answer, feed_dict=feeds) print('Model answer:', number_to_label[answer[0]]) print()
#!/usr/bin/python
from utils import tokenize, stdin

# Tally every token read from standard input; lines are split on spaces,
# tabs and hyphens.
words_count = {}
for line in stdin():
    for token in tokenize(line, [' ', '\t', '-']):
        if token in words_count:
            words_count[token] += 1
        else:
            words_count[token] = 1

# Most frequent tokens first.
sorted_words_count = sorted(words_count.items(),
                            key=lambda pair: pair[1],
                            reverse=True)

# One "<count> <token>" line per distinct token.
for token, occurrences in sorted_words_count:
    print("%i %s" % (occurrences, token))
from model import NerModel import tensorflow_addons as tf_ad import os import numpy as np from args_help import args from my_log import logger if not (os.path.exists(args.vocab_file) and os.path.exists(args.tag_file)): logger.info("building vocab file") build_vocab([args.train_path], args.vocab_file, args.tag_file) else: logger.info("vocab file exits!!") vocab2id, id2vocab = read_vocab(args.vocab_file) tag2id, id2tag = read_vocab(args.tag_file) text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id) train_dataset = tf.data.Dataset.from_tensor_slices( (text_sequences, label_sequences)) train_dataset = train_dataset.shuffle(len(text_sequences)).batch( args.batch_size, drop_remainder=True) logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format( args.hidden_num, len(vocab2id), len(tag2id))) model = NerModel(hidden_num=args.hidden_num, vocab_size=len(vocab2id), label_size=len(tag2id), embedding_size=args.embedding_size) optimizer = tf.keras.optimizers.Adam(args.lr) ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
# for tweet in tokenized_tweets: # tweets.append(tweet['clean']) # labels.append(tweet['class']) # train = pd.read_csv("../Data/imdb/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) train = pd.read_csv("../Data/imdb/train.tsv", header=0, delimiter="\t", quoting=3) # test = pd.read_csv("../Data/imdb/testData.tsv", header=0, delimiter="\t", quoting=3) tokenized_train = [] for idx, text in train.iterrows(): # tokenized_train.append(ut.tokenize(text['review'], text['sentiment'])) # for labeledTrainData.tsv tokenized_train.append(ut.tokenize(text['Phrase'], text['Sentiment'])) # for train.tsv tweets = [] labels = [] for tweet in tokenized_train: tweets.append(tweet['clean']) labels.append(tweet['class']) partition = 5 train_tweets, test_tweets, train_labels, test_labels = ut.crossValidation2( tweets, labels, partition) # kf = cv.KFold(n=len(tweets), n_folds=3, shuffle=True, indices=False) accuracyLR, precisionLR, recallLR, f_measureLR = [], [], [], [] accuracyRF, precisionRF, recallRF, f_measureRF = [], [], [], []
from torch.utils.data import Dataset, DataLoader
from model import NeuralNet

with open('intents.json', 'r') as f:
    intents = json.load(f)

# print(intents)
all_words = []  # every token of every pattern (flat list)
tags = []       # tag of every intent
xy = []         # (tokenized pattern, tag) pairs

# NOTE(review): the top-level key is 'intent' here -- confirm it matches the
# key actually used in intents.json.
for intent in intents['intent']:
    # The tag belongs to the current intent entry, not to the top-level dict.
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        # extend rather than append: we want one flat list, not a list of lists
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = ['?', '!', '[', ']', '.', ',']
# Stem every token that is not punctuation, then de-duplicate and sort.
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (sen, tag) in xy:
    bag = bow(sen, all_words)
def main(args):
    """Encode questions and their noun-chunk spans, then pickle the result.

    Steps:
      1. Load the questions from ``args.input_questions_json``.
      2. Build the vocabulary (when ``args.input_vocab_json`` is empty or
         ``args.expand_vocab == 1``) and/or load it from
         ``args.input_vocab_json``; with ``expand_vocab`` the newly seen
         question tokens are appended to the loaded vocabulary.
      3. Optionally dump the vocabulary to ``args.output_vocab_json``.
      4. Encode every question, collect up to ``max_entity_length`` noun-chunk
         start/end positions per question, and pad everything to fixed size.
      5. Pickle the arrays to ``args.output_pt_file``.
    """
    nlp = spacy.load('en')

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Start from an empty answer vocab so that data without answers does
        # not leave the name unbound when the vocab dict is assembled below.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'],
            add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged in.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and entities
    print('Encoding data')
    questions_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    questions_len = []
    questions_mask = []
    noun_chunk_starts = []
    noun_chunk_ends = []
    entity_masks = []
    max_entity_length = 5  # at most this many noun chunks are kept per question
    for orig_idx, q in enumerate(questions):
        # Drop '?' and '.', and detach ';' and ',' from the preceding word.
        question = q['question'].replace('?', '').replace('.', '').replace(
            ';', ' ;').replace(',', ' ,')
        doc = nlp(question)
        start, end = find_noun_chunks(doc)
        noun_chunk_starts.append(start[:max_entity_length])
        noun_chunk_ends.append(end[:max_entity_length])

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question)
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)  # placeholder for unlabeled questions

    # Pad encoded questions and entities
    max_question_length = max(len(x) for x in questions_encoded)
    null_idx = vocab['question_token_to_idx']['<NULL>']
    for st, ed, qe in zip(noun_chunk_starts, noun_chunk_ends,
                          questions_encoded):
        # Masks are computed from the lengths *before* padding.
        entity_masks.append(
            (np.arange(max_entity_length) < len(st)).astype(int))
        if len(st) < max_entity_length:
            # Missing entity starts are filled with the last token index of
            # the (still unpadded) question.
            st += [len(qe) - 1] * (max_entity_length - len(st))
        if len(ed) < max_entity_length:
            # Missing entity ends are filled with the unpadded length.
            ed += [len(qe)] * (max_entity_length - len(ed))
        questions_mask.append(
            (np.arange(max_question_length) < len(qe)).astype(int))
        if len(qe) < max_question_length:
            qe += [null_idx] * (max_question_length - len(qe))

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)
    entity_starts = np.asarray(noun_chunk_starts, dtype=np.int32)
    entity_ends = np.asarray(noun_chunk_ends, dtype=np.int32)
    print(entity_starts.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'answers': answers,
        'questions_len': questions_len,
        'questions_mask': questions_mask,
        'e_starts': entity_starts,
        'e_ends': entity_ends,
        'e_masks': entity_masks
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
all_words = data["all_words"] tags = data["tags"] model_state = data["model_state"] model = NeuralNet(input_size, hidden_size, output_size).to(device) model.load_state_dict(model_state) model.eval() bot_name = "Bryant's Coffee shop" print('Type quit to exit') while True: sentence = input("You: ") if sentence == "quit": break sentence = tokenize(sentence) X = bag_of_words(sentence, all_words) X = X.reshape(-1, X.shape[0]) X = torch.from_numpy(X) output = model(X) # print(output) _, predicted = torch.max(output, dim=1) tag = tags[predicted.item()] probs = torch.softmax(output, dim=1) prob = probs[0][predicted.item()] if prob.item() > 0.75: for intent in intents["intents"]: if tag == intent["tag"]: print(f"{bot_name}: {random.choice(intent['responses'])}")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result, build_embedding_matrix
import tensorflow_addons as tf_ad
from args_help import args
import json
import numpy as np

# For the test set: build the vocabulary dict, the tag dict, the text
# sequences and the initial word vectors.
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)
# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
def main(args):
    """Encode questions and programs, then pickle the padded arrays.

    Steps:
      1. Load the questions from ``args.input_questions_json``.
      2. Build the vocabulary (when ``args.input_vocab_json`` is empty or
         ``args.expand_vocab == 1``) and/or load it from
         ``args.input_vocab_json``; with ``expand_vocab`` the newly seen
         question tokens are appended to the loaded vocabulary.
      3. Optionally dump the vocabulary to ``args.output_vocab_json``.
      4. Encode every question and, when present, its program and program
         inputs; pad all sequences with ``<NULL>``.
      5. Pickle the arrays to ``args.output_pt_file``.
    """
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Start from an empty answer vocab so that data without answers does
        # not leave the name unbound when the vocab dict is assembled below.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'],
            add_special=True)
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_strs(q['program'], args.mode)[0]
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged in.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    # value_inputs, encoded by question_token_to_idx in CLEVR
    # because all valid inputs are in question vocab
    program_inputs_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str, input_str = program_to_strs(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
            # program value_inputs
            input_tokens = tokenize(input_str)
            input_encoded = encode(input_tokens,
                                   vocab['question_token_to_idx'])
            # input should have the same len with func
            assert len(input_encoded) == len(program_encoded)
            program_inputs_encoded.append(input_encoded)
        else:
            # Placeholder rows keep the three lists aligned per question.
            programs_encoded.append([-1])
            program_inputs_encoded.append([-1])

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)  # placeholder for unlabeled questions

    # Pad encoded questions and programs
    question_null = vocab['question_token_to_idx']['<NULL>']
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        qe.extend([question_null] * (max_question_length - len(qe)))

    if programs_encoded:
        program_null = vocab['program_token_to_idx']['<NULL>']
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            pe.extend([program_null] * (max_program_length - len(pe)))
        # Program inputs are padded with the *question* <NULL> index because
        # they were encoded with the question vocabulary.
        for ie in program_inputs_encoded:
            ie.extend([question_null] * (max_program_length - len(ie)))

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    program_inputs_encoded = np.asarray(program_inputs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    print(program_inputs_encoded.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'programs': programs_encoded,
        'program_inputs': program_inputs_encoded,
        'answers': answers,
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
default='dictionary.pkl', type=str, help='path to the dictionary') args = parser.parse_args() # Turns on logging. import logging root = logging.getLogger() root.setLevel(logging.DEBUG) dictionary, rev_dict = utils.get_dictionary(args.text, args.dictionary) num_classes = len(dictionary) iterator = utils.tokenize(args.text, dictionary, batch_size=args.batch_size, seq_len=args.seq_len) sess = tf.Session() model = SeqGAN(sess, num_classes, logdir=args.logdir, learn_phase=args.learn_phase, only_cpu=args.only_cpu) model.build() model.load(ignore_missing=True) for epoch in range(1, args.num_epochs + 1): for step in range(1, args.num_steps + 1): logging.info('epoch %d, step %d', epoch, step) model.train_batch(next(iterator))
def add_code_into_document(document, body):
    """Add the code-derived fields of *body* to the index *document*.

    ``transform_body(body)`` yields a list of AST summaries and a list of code
    hints.  Each AST summary contributes one field per non-empty entry of its
    ``typed_method_call``, ``extends``, ``used_classes``, ``methods``,
    ``methods_called``, ``class_instance_creation`` and ``literals`` lists,
    plus its ``comments`` when present.  The tokens of the code hints are
    added as ``code_hints`` fields.

    Returns:
        True if at least one ``typed_method_call``, ``methods``,
        ``methods_called`` or ``class_instance_creation`` field was added,
        otherwise False.
    """
    asts, code_hints = transform_body(body)
    flag = False
    #typed_method_call = set()
    for ast in asts:
        for mc in ast["typed_method_call"]:
            if mc:
                document.add(
                    Field("typed_method_call", mc, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True
        # `extends` and `used_classes` are indexed but do not set the flag.
        for e in ast["extends"]:
            if e:
                document.add(
                    Field("extends", e, Field.Store.YES, Field.Index.ANALYZED))
        for c in ast["used_classes"]:
            if c:
                document.add(
                    Field("used_classes", c, Field.Store.YES,
                          Field.Index.ANALYZED))
        for m in ast["methods"]:
            if m:
                document.add(
                    Field("methods", m, Field.Store.YES, Field.Index.ANALYZED))
                flag = True
        for m in ast["methods_called"]:
            if m:
                document.add(
                    Field("methods_called", m, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True
        #comment
        # Comments are indexed (HTML-unescaped) but not stored.
        if "comments" in ast:
            for c in ast["comments"]:
                document.add(
                    Field("comments", utils.unescape_html(c), Field.Store.NO,
                          Field.Index.ANALYZED))
        for i in ast["class_instance_creation"]:
            if i:
                document.add(
                    Field("class_instance_creation", i, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True
        # Literals use StringField rather than an analyzed Field.
        for l in ast["literals"]:
            if l:
                document.add(StringField("literals", l, Field.Store.YES))
    #finally all the split words
    # for s in camel_case:
    # document.add( Field("camel_case_words", s.lower(), Field.Store.NO, Field.Index.NOT_ANALYZED))
    # Keep only hint tokens of 2 to 19 characters; the set() below removes
    # duplicates before indexing.
    hints = []
    for h in code_hints:
        for token in utils.tokenize(h):
            if 1 < len(token) < 20:
                hints.append(token)
    for hint in set(hints):
        document.add(
            Field("code_hints", hint, Field.Store.YES, Field.Index.ANALYZED))
    return flag
error_rate = 0.6 reverse = True model_path = './models/seq2seq.h5' hidden_size = 512 sample_mode = 'argmax' data_path = './data' books = [ 'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt', 'war_and_peace.txt' ] test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.' if __name__ == '__main__': text = read_text(data_path, books) vocab = tokenize(text) vocab = list(filter(None, set(vocab))) # `maxlen` is the length of the longest word in the vocabulary # plus two SOS and EOS characters. maxlen = max([len(token) for token in vocab]) + 2 train_encoder, train_decoder, train_target = transform( vocab, maxlen, error_rate=error_rate, shuffle=False) tokens = tokenize(test_sentence) tokens = list(filter(None, tokens)) nb_tokens = len(tokens) misspelled_tokens, _, target_tokens = transform(tokens, maxlen, error_rate=error_rate, shuffle=False)
def tokenize(line, lower=True, flat=False, clean=True):
    """Tokenize *line* with ``U.tokenize``, optionally cleaning it first.

    When *clean* is true the line is first passed through
    ``Vocab.clean_line``; *lower* and *flat* are forwarded unchanged to
    ``U.tokenize``.  Returns whatever ``U.tokenize`` returns.
    """
    text = Vocab.clean_line(line) if clean else line
    return U.tokenize(text, lower=lower, flat=flat)
# choose the first half of files based on a deterministic random range robj = random.Random(12345) robj.shuffle(files) if args.command == 'train': fileSubset = files[:len(files) / 2] elif args.command == 'test': fileSubset = files[len(files) / 2:] else: fileSubset = files[len(files) / 2:] if args.command == 'train' or args.command == 'test': for i, name in enumerate(fileSubset): if i % 1000 == 0: print '%d files done' % i filesAndTokens.append((name, utils.tokenize(name))) print len(filesAndTokens) print sum([len(tokens) for name, tokens in filesAndTokens]) # model = PositionDependentVectorModel(keywords, winSize=args.win, # wdim=args.dim, stepsize=args.lr, # reg=args.reg) # model = ConstantAttentionVectorModel(keywords, winSize=args.win, # wdim=args.dim, stepsize=args.lr, # reg=args.reg) # model = NonLinearVectorModel(keywords, winSize=args.win, # wdim=args.dim, zdim=args.zdim, # stepsize=args.lr, # reg=args.reg) # model = RnnDense(keywords, winSize=args.win, # wdim=args.dim, zdim=args.zdim,