def main():
    tweaker = AGFLTweaker()
    toks = word_tokenize("@John I am goin' to the #store, *catfeesh*?")
    tweaker.prune(toks)
    print toks
    tweaker.deprune(toks)
    print toks
    print pos_tag(word_tokenize("I expect it to go: omg, diaf and stuff."))
    print pos_tag(word_tokenize("Foo dogs by way of stillonlyjacks."))
    print pos_tag(word_tokenize("If I wore a new band's shirt to the band's concert, does that make me lame?"))
    print pos_tag(word_tokenize("|Stinging jets, part words, part pictures, kept shooting at his brain.... .... .."))
def agfl_join(self, agfl_tags, tokens):
    # AGFL can join or split words/urls. Rejoin split ones.
    offset = 0
    n = 0
    did_replace = False
    while n < len(tokens):
        if n-offset >= len(agfl_tags):
            break
        word_chunk = agfl_tags[n-offset][0].lower()
        tags_joined = [agfl_tags[n-offset][1]]
        add = len(word_tokenize(" "+agfl_tags[n-offset][0]+" "))-1
        #add = (len(agfl_tags[n-offset][0].split())-1)
        #if add: print "Adding "+str(add)+" for "+agfl_tags[n-offset][0]
        for a in xrange(n+1-offset, len(agfl_tags)):
            tags_joined.append(agfl_tags[a][1])
            word_chunk += agfl_tags[a][0].lower()
            #print word_chunk+" == "+tokens[n].lower()
            if tokens[n].lower() == word_chunk:
                for i in xrange(n-offset, a+1):
                    agfl_tags.pop(n-offset)
                tag = (tokens[n], self.tag_vote(tags_joined))
                agfl_tags.insert(a-1, tag)
                did_replace = True
                break
        offset += add
        n += add
        n += 1
    return did_replace
def UnitTest(cls, norm=None):
    # Does it make me disturbed that these are the first sentences that came
    # to mind? Somewhat troubling...
    strings = [
        "Hi there. Gonna getcha. I've decided you'll die tonight.",
        "r u scared yet? B/c Ill rip our ur guts.",
        "Whatcha up2? We're gonna go on a killin' /spree/.",
        "Holy crap dood.",
        "Are you going out?",
        "#Hi @I love/hate $0 http://yfrog.com/a3ss0sa *always* don't /you/....",
        "r u going out?",
    ]
    if not norm:
        norm = TokenNormalizer()
    tokens = []
    norm_tokens = []
    for s in strings:
        t = word_tokenize(s)
        tokens.append(t)
        print s
    print ""
    for t in tokens:
        nt = norm.normalize_tokens(t)
        norm_tokens.append(nt)
        print nt
    print ""
    for nt in norm_tokens:
        norm._count_tokens(nt)
    denorm_tokens = []
    for nt in norm_tokens:
        dt = norm.denormalize_tokens(nt)
        denorm_tokens.append(dt)
        print dt
    for dt in denorm_tokens:
        print word_detokenize(dt)
def tokenize(self):
    # Tokenize the corpus into sentences of tokens and add boundary markers.
    text = open("brown.txt", "r").read()
    self.sents = word_tokenize(text)
    for sent in self.sents:
        sent[:0] = ['START', 'START']  # Prepend two START tokens to each sentence
        sent.append('STOP')            # Append a STOP token to the end
def gen_all_paraphrases(lv):
    with open(source) as fin:
        sents = dict()
        for line in fin:
            l = line.rstrip('\n ')
            if l not in sents:
                sents[l] = dict()
                sents[l]['num'] = 0
            sents[l]['num'] += 1
    if conservative:
        baseline = lv.sent_rescore([['', x] for x in sents])
        for s in baseline:
            sents[s[2]]['baseline'] = s[0]
        print('baseline complete')
    for i, sent in enumerate(sents):
        sys.stdout.write('\rParaphrasing {}/{}'.format(i + 1, len(sents)))
        words = tokenizer.word_tokenize(sent)
        if conservative:
            lines = lv.fst_alter_sent(words, n_best, cutoff=sents[sent]['baseline'])
        else:
            lines = lv.fst_alter_sent(words, n_best)
        sents[sent]['para'] = lines
    print()
    with open('dstc6_100_parafst.pickle', 'wb') as pout:
        pickle.dump(sents, file=pout)
    with open('dstc6_100_parafst.txt', 'w') as fout:
        for line in sents:
            for x in sents[line.rstrip('\n ')]['para']:
                fout.write(x[2] + '\n')
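# A minimal sketch (not part of the original script) of reading back the pickle
# written by gen_all_paraphrases above. The structure mirrors what the function
# stores: sents[sentence] -> {'num': count, 'para': n-best list, optionally
# 'baseline'}, where each n-best entry carries the paraphrase text at index 2.
import pickle

with open('dstc6_100_parafst.pickle', 'rb') as pin:
    sents = pickle.load(pin)
for sent, info in list(sents.items())[:3]:
    # Print the source sentence, its frequency, and its top two paraphrases.
    print(sent, info['num'], [x[2] for x in info['para'][:2]])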
def extract_summary(json_data, summary_key, entity_dict):
    summary_list = []
    for game in json_data:
        summary = game.get(summary_key, None)
        assert summary is not None
        words = ' '.join(summary).strip().split()
        result = []
        idx = 0
        while idx < len(words):
            if words[idx] in entity_dict:
                # Greedily extend the span while it still matches a known entity.
                length = 1
                while idx + length <= len(words) and ' '.join(words[idx:idx + length]) in entity_dict:
                    length += 1
                length -= 1
                result.append('_'.join(words[idx:idx + length]))
                idx += length
            else:
                result.append(words[idx])
                idx += 1
        result_tokens = word_tokenize(' '.join(result), language='english')
        summary_list.append(result_tokens)
    return summary_list
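# A hedged usage sketch for extract_summary above. The data shapes are
# assumptions inferred from the function body: each game is a dict whose
# summary_key maps to a list of tokens, and entity_dict keys include every
# prefix of the multi-word entities to be merged with underscores. It also
# assumes word_tokenize here is NLTK's (the language= keyword suggests so).
example_games = [{'summary': ['Kobe', 'Bryant', 'scored', '40', 'points']}]
example_entities = {'Kobe': 1, 'Kobe Bryant': 1}
# Expected under these assumptions:
#   [['Kobe_Bryant', 'scored', '40', 'points']]
print(extract_summary(example_games, 'summary', example_entities))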
def search_book(update: Update, context: CallbackContext):
    sess = search.get_session(update.message.from_user.id)
    words = tokenizer.word_tokenize(
        update.message.text, tokenizer.guess_language(update.message.text))
    sess.search(words)
    text = "\n".join(
        map(lambda x: f"{x.authors} - {x.title} /info{x.book_name}",
            sess.search_result))
    update.message.reply_text(text)
def agfl_split(self, agfl_tags):
    new_agfl_tags = []
    for i in xrange(len(agfl_tags)):
        if not agfl_tags[i][1]:
            # No tag: split the chunk back into individual tokens.
            toks = word_tokenize(" "+agfl_tags[i][0]+" ")
            for t in toks:
                new_agfl_tags.append((t, ""))
        else:
            new_agfl_tags.append(agfl_tags[i])
    return new_agfl_tags
def pos_tag(tokens, try_agfl=True, reject_agfl_fails=True, nltk_fallback=True):
    if try_agfl and agfl.agfl_ok():
        detoked = word_detokenize(tokens)
        sentences = nltk.sent_tokenize(detoked)
        all_tags = []
        for s in sentences:
            stokens = word_tokenize(s)
            tweaker = AGFLTweaker()
            tweaker.prune(stokens)
            nltk_tags = nltk.pos_tag(stokens)
            tweaker.agfl_fix(stokens, nltk_tags)
            s = word_detokenize(stokens)
            if not s:
                print "Empty string for: "+str(stokens)
                continue
            #print "Parsing: |"+s+"|"
            agfl_tree = agfl.parse_sentence(s)
            # XXX: We can re-try failed '?' with '.'..
            if not agfl_tree:
                print "AGFL Parse fail for |"+s+"|"
                if not reject_agfl_fails:
                    all_tags.extend(tweaker.deprune(nltk.pos_tag(stokens)))
                else:
                    return None
            else:
                tags = agfl_tree.pos_tag()
                tags = tweaker.agfl_split(tags)
                did_join = tweaker.agfl_join(tags, stokens)
                if nltk_fallback:
                    tweaker.agfl_repair(tags, nltk_tags)
                tweaker.deprune(tags)
                # Verify that we have labels for everything.
                # If some are still missing, drop.
                if tags:
                    for t in tags:
                        if not t[1]:
                            print "Tag fail for: |"+s+"|"
                            print str(tags)
                            if did_join:
                                print "Failed with attempted join: "+str(stokens)
                            return None
                    all_tags.extend(tags)
                else:
                    print "Tag fail for |"+s+"|"
                    return None
        return all_tags
    else:
        if try_agfl:
            print "AGFL not found/functional. Falling back to nltk.pos_tag()"
        return nltk.pos_tag(tokens)
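# A minimal usage sketch for pos_tag above, mirroring the calls in main():
# it takes a token list and returns (token, tag) pairs, or None when the AGFL
# parse or tagging is rejected. The retry shown here simply flips try_agfl off
# (keyword taken from the signature above); it is an illustration, not part of
# the original module.
tags = pos_tag(word_tokenize("Are you going out?"))
if tags is None:
    # AGFL rejected the sentence; fall back to plain NLTK tagging.
    tags = pos_tag(word_tokenize("Are you going out?"), try_agfl=False)
print tags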
def main(argv):
    fstfname = ''
    fname = ''
    try:
        opts, args = getopt.getopt(argv, "hu:f:")
    except getopt.GetoptError:
        print("awer.py -u <word_vectors_txt> -f <language_model_fst>")
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print("awer.py -u <word_vectors_txt> -f <language_model_fst>")
            sys.exit()
        elif opt == '-u':
            fname = arg
        elif opt == '-f':
            fstfname = arg
    if fname == '' or fstfname == '':
        print("awer.py -u <word_vectors_txt> -f <language_model_fst>")
        sys.exit(1)
    lv = AlterSent(fname, fstfname, 50000)
    #print("Ready")
    totalerr = 0
    linecnt = 0
    for line in sys.stdin:
        linecnt += 1
        words = tokenizer.word_tokenize(line)
        lines = lv.fst_alter_sent(words, 1)
        toks = lines[0][1].split()
        err = 0
        for i in range(len(words)):
            if words[i] != toks[i]:
                err += 1
        if len(words) > 0:
            totalerr += err / len(words)
    if linecnt > 0:
        totalerr = totalerr / linecnt
    print("AWER: %.5f" % totalerr)
def main(argv):
    fname = ''
    try:
        opts, args = getopt.getopt(argv, "hv:")
    except getopt.GetoptError:
        print("lexalter.py -v <word_vectors_txt>")
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print("lexalter.py -v <word_vectors_txt>")
            sys.exit()
        elif opt == '-v':
            fname = arg
    if fname == '':
        print("lexalter.py -v <word_vectors_txt>")
        sys.exit(1)
    lv = AlterLex(fname, 50000)
    # Get a main word and some context words from stdin.
    for line in sys.stdin:
        print()
        words = tokenizer.word_tokenize(line)
        mainword = words.pop(0)
        # Get the alternatives.
        nearlist = lv.alter(mainword, words, 10)
        # Print the alternatives.
        if nearlist is not None:
            for (idx, w) in enumerate(nearlist):
                print(w[1])
        print()
        print('--------------')
        print()
def main():
    params = parser.parse_args()
    print('Processing...')
    lv = AlterSent(params.vectors, params.fst_lm, params.onmt_dir, params.onmt_lm,
                   params.kenlm, 50000)
    print("Ready")
    try:
        while True:
            line = input()
            if line.rstrip(' \n') == '':
                continue
            print()
            words = tokenizer.word_tokenize(line)
            lines = lv.fst_alter_sent(words, 100)
            for i, (newscore, score, sent) in enumerate(lines):
                print(i, ':', '%.3f' % newscore, ':', '%.3f' % score, ':', sent.encode())
            print()
    except EOFError:
        pass
def process(input_folder, type, output_folder):
    updated_json = open(os.path.join(output_folder, type + ".json"),
                        mode="w", encoding="utf-8")
    file_list = os.listdir(input_folder)
    for filename in file_list:
        if type in filename:
            print("filename", filename)
            json_file = open(os.path.join(input_folder, filename),
                             mode="r", encoding="utf-8")
            data = json.load(json_file)
            upd_trdata = []
            for entry_index, entry in enumerate(data):
                summary = entry['summary']
                summary = detokenize(summary)
                summary = " ".join(word_tokenize(summary))
                upd_entry = entry
                upd_entry['summary'] = summary
                upd_trdata.append(upd_entry)
                if entry_index % 50 == 0:
                    print(entry_index)
            json.dump(upd_trdata, updated_json)
def create_json(input_folder, input_summaries, output_folder):
    for filename in os.listdir(input_folder):
        d = None
        with codecs.open(input_folder + filename) as json_data:
            d = json.load(json_data)
            print('filename', input_folder + filename)
        output = []
        for entry in d:
            datetime_object = datetime.strptime(entry['day'], '%m_%d_%y')
            html_file_name = []
            html_file_name.append(datetime_object.strftime("%Y%m%d"))
            visname_homename = entry['vis_name'].replace(" ", "_") + "-" + entry['home_name'].replace(" ", "_")
            visname_homename = visname_homename.replace('D-backs', 'Diamondbacks')
            html_file_name.append(visname_homename)
            html_file_name.append(str(entry['vis_line']['team_runs']) + "-" + str(entry['home_line']['team_runs']))
            files = glob.glob(input_summaries + "*" + "_".join(html_file_name))
            if len(files) < 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " not found")
            elif len(files) > 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " multiple found")
            else:
                fname = files[0]
                with codecs.open(fname, encoding='utf-8') as f:
                    content = f.readlines()
                updated_content = []
                for line in content:
                    words = word_tokenize(detokenize(line.strip().split()))
                    updated_content.append(" ".join(words))
                text = " *NEWPARAGRAPH* ".join(updated_content)
                entry['summary'] = text.split()
                output.append(entry)
        if len(output) > 0:
            with codecs.open(output_folder + 'combined_' + filename, 'w+') as outfile:
                json.dump(output, outfile)
def __get_words(self):
    text = self.authors + " " + self.title + " " + self.annotation
    self.words = tokenizer.word_tokenize(text, self.lang)
    if self.words is None:
        self.words = set()
model = Embeddings('../product2vec2/embeddings/all/vecs.npy')
dimension = model._vecs.shape[1]
n_max = len(products)

j = 0
fwrite('Going through products to extract text embeddings... \n')
for line in csv_file:
    j += 1
    if not j % 100000:
        fwrite('\t%d\n' % j)
        sys.stdout.flush()
    L = line.lower().split(';')
    idx = L[0]
    if idx in products:
        raw_product = dict([(k, v.decode('utf-8')) for k, v in zip(all_keys, L)])
        product = dict([(k, word_tokenize(raw_product[k])) for k in product_keys])
        vecs = text_embedding(product)
        products[idx]["text_emb"] = vecs
        products[idx]['product'] = json.dumps(raw_product)
fwrite('Done\n')

images_path = 'images/img/training'
break_all = False
K = 0
fwrite('Retrieving image paths... ')
for (dirpath, dirnames, filenames) in walk(images_path):
    for f in filenames:
        idx = f.split('.')[0]
        if idx in products:
            products[idx]['image_path'] = join(dirpath, f)
#!/usr/bin/env python3
import sys
import tokenizer

v = {}
oov = 0

with open(sys.argv[1], 'r', encoding='utf-8') as f:
    for line in f:
        toks = tokenizer.word_tokenize(line)
        for t in toks:
            if t not in v:
                v[t] = 1
print("Types in training set:", len(v))

with open(sys.argv[2], 'r', encoding='utf-8') as f:
    for line in f:
        toks = tokenizer.word_tokenize(line)
        for t in toks:
            if t not in v:
                oov += 1
print("OOVs:", oov)
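# Usage sketch for the OOV counter above (the script and file names are
# placeholders, not part of the original):
#
#   python3 count_oov.py train.txt test.txt
#
# argv[1] is the training text used to build the vocabulary; argv[2] is the
# text whose out-of-vocabulary token count is reported.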
    ids_test.append(L[0])
    for cat in range(3):
        y_tests[cat].append(L[1+cat])
testset.close()

data = np.load('r_similarity_data').all()

# Feature Extraction
tf_train = []
df = Counter()
X_train_img = []
for idx in ids_train:
    product = json.loads(data[idx]['product'])
    X_train_img.append(data[idx]['image_emb'])
    description = product['Description']
    tokenized = word_tokenize(description)
    tfs = {}
    for w, c in Counter(tokenized).iteritems():
        tfs[w] = float(c) / len(tokenized)
    for w in set(tokenized):
        df[w] += 1
    tf_train.append(tfs)
X_train_img = np.array(X_train_img)

D = len(ids_train)
idfs = dict((k, np.log(float(D)/df[k])) for k in df)
del df
vocab = idfs.keys()
vocab_dict = dict((k, v) for v, k in enumerate(vocab))
vocab_size = len(vocab)

X_train_txt = []
from time import time
from tokenizer import word_tokenize

text = open("brown.txt", "r").read()
t1 = time()
tokens = word_tokenize(text)
t2 = time()
print("Time taken to tokenize: ", t2 - t1)
input("Press Enter to view tokens")
print(tokens)
if params.output:
    f = open(params.output, 'w')
    myprint = lambda x: f.write(str(x) + '\n')
else:
    f = None
    myprint = print
eprint = lambda x: print(x, file=sys.stderr)

with open(params.input) as fin:
    sents = dict()
    for line in fin:
        l = line.rstrip('\n ')
        if l not in sents:
            sents[l] = 0
        sents[l] += 1

i = 1
for sent in sents:
    eprint('Sentence {} of {}'.format(i, len(sents)))
    i += 1
    words = tokenizer.word_tokenize(sent)
    lines = lv.fst_alter_sent(words, params.num)
    for j in range(sents[sent]):
        for x in lines:
            myprint(x[2])

eprint('Output file should be randomly shuffled before use.')
if f:
    f.close()
        tokenizerCount += 1
    return (nltkCount, tokenizerCount)

print("Size of Brown corpus in bytes: ", file_size("brown.txt"))
text = open("brown.txt", "r").read()  # Read the Brown corpus

t0 = time()
nltkTokens = tokenize(text)  # Tokenize with NLTK
t1 = time()
nltkTime = t1 - t0
print("Time taken by NLTK's word_tokenize to tokenize text: ", nltkTime)
print("Number of tokens generated by NLTK's word_tokenize: ", len(nltkTokens))

t2 = time()
tokenizerTokens = word_tokenize(text)  # Tokenize with tokenizer
t3 = time()
tokenizerTime = t3 - t2
print("Time taken by tokenizer's word_tokenize to tokenize text: ", tokenizerTime)
print("Number of tokens generated by tokenizer's word_tokenize: ",
      len([j for i in tokenizerTokens for j in i]))

functionSpeed = speed(nltkTime, tokenizerTime)
print(functionSpeed[0], "is faster than", functionSpeed[1], "by", functionSpeed[2], "seconds")

numberOfTokens = num_tokens(nltkTokens, tokenizerTokens)
print(numberOfTokens[0], "generated", numberOfTokens[1], "more tokens than", numberOfTokens[2])
def baseline_tagger(self):
    from nltk.corpus import brown
    from nltk.tag import TrigramTagger

    print("Number of words in Brown corpus: 1333212")
    print("Number of unique tags in Brown corpus: 474")
    f = open("input.txt", "r").read()
    file_info = stat("input.txt")
    print("Size of test file: ", file_info.st_size)
    sents_tokens = word_tokenize(f)
    print("Number of tokens to be tagged: ", len([j for i in sents_tokens for j in i]))

    t0 = time()
    tagger = TrigramTagger(brown.tagged_sents()[:55000])
    t1 = time()
    nltk_train_time = t1 - t0
    print("Time taken by NLTK for training: ", nltk_train_time)

    nltk_tags = []
    t0 = time()
    for sent in sents_tokens:
        nltk_tags.append(tagger.tag(sent))
    t1 = time()
    nltk_tag_time = t1 - t0
    print("Time taken by NLTK to tag text: ", nltk_tag_time)

    t0 = time()
    self.tokenize()
    self.init_tags()
    self.init_words_tags()
    self.init_dict()
    self.calc_Q()
    self.calc_R()
    t1 = time()
    pos_train_time = t1 - t0
    print("Time taken by pos_tagger to train: ", pos_train_time)

    pos_tagger_tags = []
    t0 = time()
    for sent in sents_tokens:
        pos_tagger_tags.append(self.viterbi(sent))
    t1 = time()
    pos_tag_time = t1 - t0
    print("Time taken by pos_tagger to tag: ", pos_tag_time)

    if nltk_train_time < pos_train_time:
        print("Training time of NLTK is less than pos_tagger by: ", abs(nltk_train_time - pos_train_time))
    else:
        print("Training time of pos_tagger is less than NLTK by: ", abs(nltk_train_time - pos_train_time))
    if nltk_tag_time < pos_tag_time:
        print("Tagging time of NLTK is less than pos_tagger by: ", abs(nltk_tag_time - pos_tag_time))
    else:
        print("Tagging time of pos_tagger is less than NLTK by: ", abs(nltk_tag_time - pos_tag_time))

    nltk_tag_count = defaultdict(int)
    for i in nltk_tags:
        for j in i:
            nltk_tag_count[j[1]] += 1
    pos_tag_count = defaultdict(int)
    for i in pos_tagger_tags:
        for j in i:
            pos_tag_count[j[1]] += 1

    print("POS tags generated by NLTK: ")
    for i in nltk_tag_count.items():
        print(i)
    print("POS tags generated by pos_tagger: ")
    for i in pos_tag_count.items():
        print(i)
    print("Number of unique tags generated by NLTK: ", len([i for i in nltk_tag_count.keys()]))
    print("Number of unique tags generated by pos_tagger: ", len([i for i in pos_tag_count.keys()]))
    print("NLTK failed to tag", nltk_tag_count[None], "tokens")
    print("pos_tagger failed to tag", pos_tag_count[''], "tokens")
    if nltk_tag_count[None] > pos_tag_count['']:
        print("pos_tagger tagged", abs(nltk_tag_count[None] - pos_tag_count['']), "more tokens than NLTK")
    else:
        print("NLTK tagged", abs(nltk_tag_count[None] - pos_tag_count['']), "more tokens than pos_tagger")

    tagged_sents = open("input_tagged.txt", "r").read().splitlines()
    tags = []
    for sent in tagged_sents:
        words = sent.split()
        for word in words:
            m = re.search('(.*)_(.*)', word)
            tags.append(m.group(2))

    n_tags = [j[1] for i in nltk_tags for j in i]
    nltk_count = 0
    for x, y in zip(n_tags, tags):
        if x == y:
            nltk_count += 1
    len_tokens = len([j for i in sents_tokens for j in i])
    print("NLTK accurately tagged", nltk_count, "tokens")
    print("NLTK accuracy score: ", float(nltk_count) / float(len_tokens))

    p_tags = [j[1] for i in pos_tagger_tags for j in i]
    pos_count = 0
    for x, y in zip(p_tags, tags):
        if x == y:
            pos_count += 1
    print("pos_tagger accurately tagged", pos_count, "tokens")
    print("pos_tagger accuracy score: ", float(pos_count) / float(len_tokens))

    if nltk_count > pos_count:
        print("NLTK accurately tagged", abs(nltk_count - pos_count), "more tokens than pos_tagger")
    else:
        print("pos_tagger accurately tagged", abs(nltk_count - pos_count), "more tokens than NLTK")
dim_img = d['image_emb'].shape[0]
dim_multi = 150

image_embeddings = []
product_ids = []

fwrite('Loading data ...\n')
product_ids = data.keys()
tf = []
df = Counter()
for idx in product_ids:
    d = data.pop(idx)
    if not len(data) % 5000:
        fwrite('%d\n' % len(data))
    image_embeddings.append(d['image_emb'].astype(dtype))
    tokenized = word_tokenize(json.loads(d['product'])['Description'])
    tfs = {}
    for w, c in Counter(tokenized).iteritems():
        tfs[w] = float(c) / len(tokenized)
    for w in set(tokenized):
        df[w] += 1
    tf.append(tfs)
del data

D = len(product_ids)
idfs = dict((k, np.log(float(D)/df[k])) for k in df if df[k] > 5)
del df
vocab = idfs.keys()
vocab_dict = dict((k, v) for v, k in enumerate(vocab))
vocab_size = len(vocab)
dim_txt = vocab_size
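# A hedged sketch (not from the original script) of how the pieces built above
# are typically combined: each product's term frequencies in `tf`, weighted by
# `idfs` and placed at the column given by `vocab_dict`, yield one TF-IDF row
# per product. The name X_txt is illustrative only.
X_txt = np.zeros((len(tf), vocab_size), dtype=dtype)
for row, tfs in enumerate(tf):
    for w, freq in tfs.items():
        if w in vocab_dict:  # terms filtered out of idfs (df <= 5) are skipped
            X_txt[row, vocab_dict[w]] = freq * idfs[w]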