def _get_emb(self, w2v):
    """ Get the average w2v embedding """
    toks = tokenizer(self.opn.lower()) + tokenizer(self.asp.lower())
    self.is_valid = len(toks) > 0
    if not self.is_valid:
        return None
    embs = torch.stack([self._get_tok_emb(tok, w2v) for tok in toks], dim=0)
    return torch.mean(embs, dim=0)
def PairLoader(pairs, batch_size, w2cloader, max_seq_length=3):
    left = [pair[0] for pair in pairs]
    right = [pair[1] for pair in pairs]
    left_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row]
                for row in left]
    right_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row]
                 for row in right]
    dataset = torch.utils.data.TensorDataset(torch.tensor(left_ids), torch.tensor(right_ids))
    data_iter = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    return data_iter
def semanticSimilarity(sentenceA, sentenceB, infoContentNorm):
    wordsA = tokenizer(sentenceA)
    wordsB = tokenizer(sentenceB)
    wordSet = set(wordsA).union(set(wordsB))
    wordVectorA = semanticVector(wordsA, wordSet, infoContentNorm)
    wordVectorB = semanticVector(wordsB, wordSet, infoContentNorm)
    semSimilarity = dot(wordVectorA, wordVectorB) / (linalg.norm(wordVectorA) * linalg.norm(wordVectorB))
    return semSimilarity
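# Hedged aside (not from the source): the final step of semanticSimilarity is plain
# cosine similarity; `dot` and `linalg` are assumed to come from numpy. Shown in
# isolation so the formula is easy to verify:
import numpy as np

vec_a = np.array([0.0, 1.0, 1.0])
vec_b = np.array([1.0, 1.0, 0.0])
cosine = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))  # 0.5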
def wordSimilarity(sentenceA, sentenceB):
    wordsA = tokenizer(sentenceA)
    wordsB = tokenizer(sentenceB)
    wordSet = list(set(wordsA).union(set(wordsB)))
    index = {word[1]: word[0] for word in enumerate(wordSet)}
    # sr = r1 - r2
    r1 = wordOrder(wordsA, wordSet, index)
    r2 = wordOrder(wordsB, wordSet, index)
    srTemp = linalg.norm(r1 - r2) / linalg.norm(r1 + r2)
    return 1 - srTemp
def ppindexer(test, gold, PREPS):
    ppindex = -100
    correctionpair = None  # initialized so the return never raises UnboundLocalError
    test_words = tokenizer(test)
    gold_words = tokenizer(gold)
    assert len(test_words) == len(gold_words)
    for i, pair in enumerate(zip(test_words, gold_words)):
        if not (pair[0] == pair[1]) and (pair[0] in PREPS and pair[1] in PREPS):
            correctionpair = (pair[0], pair[1])
            ppindex = i
    return ppindex, test_words, gold_words, correctionpair
def __call__(self, x):
    bow = []
    for sent in splitter(x):
        for i, token in enumerate(tokenizer(sent)):
            if self.remove_nonalpha and not token.isalpha():
                continue
            if (self.remove_entities and i and token[0] != token[0].lower()):
                continue
            if (self.remove_stopwords and token.lower() in STOPWORDS):
                continue
            bow.append(token if not self.lowercase else token.lower())
    _bow = []
    prev = None
    while bow:
        token = bow.pop(0)
        if token == prev:
            continue
        _bow.append(token)
        prev = token
    return ' '.join(_bow)
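# Hedged aside (assumption, not the project's code): the trailing while-loop above only
# drops *consecutive* duplicate tokens, which is equivalent to keeping the first element
# of each run of adjacent equal items:
from itertools import groupby

tokens = ["very", "very", "good", "good", "movie", "good"]
deduped = [tok for tok, _ in groupby(tokens)]  # ['very', 'good', 'movie', 'good']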
def __call__(self, doc):
    tokens = [
        token for token in tokenizer(doc, self.lang, True)
        if token.isalnum() and len(token) > 0 and not token.isspace()
    ]  # we can eliminate punctuation as well
    tokens = [token.lower() for token in tokens]
    if self.remove_numbers:
        number_pattern = r"[a-zA-Z]{,3}\d{6,}"
        tokens = [re.sub(number_pattern, "", token) for token in tokens]
    if self.eliminate_stopwords:
        stopwords = stopword_lists.get_stopwords(lang="tr")
        tokens = [token for token in tokens if token not in stopwords]
    if self.apply_stemming:
        tokens = [tr_stemmer.stem2(token) for token in tokens]
    if self.deasciify:
        tokens = [Deasciifier(token).convert_to_turkish() for token in tokens]
    tokens = [token.strip() for token in tokens]
    tokens = [token for token in tokens if len(token) > 0]  # or not token.isspace()
    return tokens
def extract(self, name, name_list, text, limit=1, is_wordnet=False):
    if not self.is_name_in_text(name_list, text):
        return {}, 0
    tagList = self.tag_list
    stopwords = self.stops
    wordDict = {}
    filterd_dict = {}
    sents = segmenter(text)
    wordcount = 0
    for sent in sents:
        tokens = tokenizer(sent.lower())
        terms = tagger(tokens)
        for t in terms:
            wordcount += 1
            key = '.'.join(t)
            try:
                wordDict[key] += 1
            except KeyError:
                wordDict[key] = 1
    for term_s, count in wordDict.items():
        try:
            word, pos = term_s.split('.')
        except ValueError:
            continue
        if pos[:2] in tagList and word.lower() not in stopwords and len(word) >= 3:
            print(word, pos)
            if is_wordnet:
                meanList = self.abstract(word, pos, limit)
                for w in meanList:
                    filterd_dict[term_s] = count
            else:
                filterd_dict[term_s] = count
    return filterd_dict, wordcount
def parse_comments(comments):
    lines_final = []
    syn = ['Syntax', 'SYNTAX']
    others = ['NOTE', 'Note', 'Comments']
    if comments is not None:
        comm_list = comments.text
        if comm_list:
            split_list = comm_list.split('\n')
            leading_space_stripped = [my_str.strip() for my_str in split_list]
            # Persists across lines so continuation lines of a Syntax block are kept.
            is_syn_struct = False
            for line in leading_space_stripped:
                if line:
                    is_split = False
                    tokens = tokenizer(line)
                    if tokens[0] in syn:
                        is_syn_struct = True
                        is_split = True
                    elif tokens[0] in others:
                        is_syn_struct = False
                        is_split = True
                    elif is_syn_struct:
                        is_split = True
                    if is_split:
                        lines_final.append(line)
                    elif not lines_final:
                        lines_final.append(line)
                    else:
                        lines_final[-1] += ' ' + line
            return lines_final
        return ''
    return ''
def get_emb_vec(self, row):
    emb = []
    for field in row:
        for tok in tokenizer(field):
            emb.append(self.tok2emb(tok))
    if len(emb) == 0:
        emb = [self.w_emb[self.unknown_idx]]
    return torch.norm(torch.tensor(emb), dim=0)
def snippet_tokenize(self, snippet):
    snippet = snippet.lower()
    snippet_tokens = [
        self.clean_word(t) for t in tokenizer(snippet)
        if t not in self.stops and len(t) >= 3 and t != '...'
    ]
    snippet_tokens = [e for e in snippet_tokens if e]
    snippet_freq = dict(FreqDist(snippet_tokens))
    return snippet_freq
def title_tokenize(self, title):
    title = title.lower()
    title_tokens = [
        self.clean_word(t) for t in tokenizer(title)
        if t not in self.stops and len(t) >= 3
    ]
    title_tokens = [e for e in title_tokens if e]
    title_freq = dict(FreqDist(title_tokens))
    return title_freq
def ar():
    # Take sys.argv[2] as input for which data set should be run.
    # Create a list of the files in the chosen directory.
    directory = sys.argv[2]
    files = os.listdir("../out/" + directory + "/ar/")
    # Create counter, set to zero.
    counter = 0
    # Open file with Arabic stop words.
    sw_in = open("../data/arstoplist.txt")
    sw = sw_in.read().splitlines()
    sw_in.close()
    # Open file with Arabic punctuation.
    punctlist = open("../data/arabpunct.txt").read().splitlines()
    # Stemming is currently disabled:
    # st = ISRIStemmer()
    # Loop over files:
    for f in files:
        counter += 1
        print("Beginning file: " + str(counter))
        allwords = []
        if "txt" in f:
            # Open and read in file.
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            # Loop over the lines in the file.
            for line in lines:
                words = []
                # Tokenize the line.
                tokens = tokenizer(line)
                # Loop over the words in the line:
                # skip tokens that appear in the stop-word or punctuation lists,
                # keep all other tokens (stemming is disabled).
                for t in tokens:
                    if t in sw:
                        pass
                    elif t in punctlist:
                        pass
                    else:
                        words.append(t)
                allwords.append(words)
            # Write the tokenized words to file.
            f_out = open("../out/tokenized/" + directory + "/ar/" + f, 'w')
            for item in allwords:
                f_out.write("\n".join(item))
            f_out.close()
def is_name_in_text(self, name_list, text):
    lower_text = text.lower()
    sents = segmenter(lower_text)
    word_set = set()
    for sent in sents:
        tokens = tokenizer(sent)
        for t in tokens:
            word_set.add(t)
    for e in name_list:
        if e in word_set:
            return True
    return False
def en():
    # Take sys.argv[2] as input for which data set should be run.
    # Create a list of the files in the chosen directory.
    directory = sys.argv[2]
    files = os.listdir("../out/" + directory + "/en/")
    # Create counter, set to zero.
    counter = 0
    # Import stop words.
    sw_en = stopwords.words('english')
    # Stemming is currently disabled:
    # st = PorterStemmer()
    # Loop over files:
    for f in files:
        counter += 1
        print("Beginning file: " + str(counter))
        allwords = []
        if "txt" in f:
            # Open and read in file.
            f_in = open("../out/" + directory + "/en/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            # Loop over the lines in the opened file:
            # lowercase and tokenize words.
            for line in lines:
                words = []
                line = line.lower()
                tokens = tokenizer(line)
                # Loop over the words in tokens:
                # skip stop words and keep everything else (stemming is disabled).
                for t in tokens:
                    try:
                        if t in sw_en:
                            pass
                        else:
                            words.append(t)
                    except IndexError:
                        words.append(t)
                allwords.append(words)
            # Write the tokenized words to file.
            f_out = open("../out/tokenized/" + directory + "/en/" + f, 'w')
            for item in allwords:
                f_out.write("\n".join(item))
            f_out.close()
def DataLoader(target_rows, aux_rows, batch_size, w2cloader, max_seq_length=5):
    assert len(target_rows) == len(aux_rows)
    aux_cols = [list(set([r[i] for r in aux_rows])) for i in range(len(aux_rows[0]))]
    aux_c_sizes = [len(c) for c in aux_cols]
    aux_c_counts = [[sum(1 for row in aux_rows if row[i] == val) for val in aux_cols[i]]
                    for i in range(len(aux_cols))]
    aux_weights = [[min(3, val / (min(col) + 1.)) for val in col] for col in aux_c_counts]
    # Update batch
    target_ids = [[w2cloader.toks2ids(tokenizer(col), max_seq_length) for col in row]
                  for row in target_rows]
    label_ids = [[aux_cols[i].index(row[i]) for i in range(len(row))] for row in aux_rows]
    dataset = torch.utils.data.TensorDataset(torch.tensor(target_ids), torch.tensor(label_ids))
    data_iter = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    return data_iter, aux_c_sizes, aux_weights
def word_tokenize(text_block, stemmer, stop_words):
    sentences = SENT_RE.findall(text_block)
    sense_phrases = []
    for sentence in sentences:
        sentence = sentence.replace('\'', '').replace('(', ' ') \
            .replace(')', ' ').replace("/", " or ").replace("-", "")
        sentence = TAG_RE.sub('', sentence)
        sentence = "".join(c for c in sentence if 0 < ord(c) < 127)
        sentence_words = [stemmer.stem(word) for word in tokenizer(sentence)
                          if word not in stop_words and re.match(alpha_numeric, word)]
        sense_phrases.append(sentence_words)
        logger.info("Will sense tokenize : %s" % sentence)
    return sense_phrases
def tag_sentences(sentences, pos_symbol=False):
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))
    processed_list = tagger(tokenized)
    if not pos_symbol:
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list
    return output_list
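# Hedged usage sketch: tag_sentences assumes tokenizer returns one token list per
# sentence and tagger takes a list of token lists, behaviour that matches NLTK's
# word_tokenize / pos_tag_sents (an assumption; the bound functions are not shown here).
# A bare-NLTK equivalent, requiring the punkt and averaged_perceptron_tagger resources:
import nltk

sents = ["The cat sat on the mat.", "Dogs bark loudly."]
tagged = nltk.pos_tag_sents([nltk.word_tokenize(s) for s in sents])
# tagged[0] -> [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ...]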
def make_cluster_name(self):
    if self.cluster_name:
        return self.cluster_name
    all_titles = [[w.lower() for w in tokenizer(self.titles[docid])]
                  for docid in self.items()]
    gram_dist = Counter()
    for title_tokens in random.sample(
            all_titles, 50 if 50 < len(all_titles) else len(all_titles)):
        for n in [1, 2, 3, 4, 5]:
            for gram in ngrams(title_tokens, n):
                tfidf_weight = sum([self.__get_tfidf_weight(token.lower()) for token in gram])
                if (gram[0].lower() in STOPWORDS
                        or gram[-1].lower() in STOPWORDS
                        or [token for token in gram if not token.isalpha()]):
                    continue
                gram_dist[gram] += tfidf_weight * n
    if not gram_dist.most_common():
        return None
    self.cluster_name = ' '.join(gram_dist.most_common()[0][0])
    return self.cluster_name
def _read_reviews(self, source_file):
    """ Read reviews from file and conduct initial pruning """
    entities = set([])
    reviews = []
    num_exts = 0
    with open(source_file, "r", encoding="utf-8") as file:
        for _, line in enumerate(tqdm(file, desc="reviews")):
            review = json.loads(str(line))
            # Process sentences & extractions
            sents = review["sentences"]
            exts = review["extractions"]
            # Filter sentences with NO extractions
            if self.filter_empty:
                sents = [sents[i] for i in set([e["sid"] for e in exts])]
            # Prune by number of sentences
            if len(sents) < self.s_min or len(sents) > self.s_max:
                continue
            # Prune by number of extractions
            if len(exts) < self.e_min or len(exts) > self.e_max:
                continue
            # Process extractions & sentences
            for ext in review["extractions"]:
                ext["opinion"] = self._process_span(ext["opinion"])
                ext["aspect"] = self._process_span(ext["aspect"])
            sents = [self.detokenizer.detokenize(toks) for toks in sents]
            # Validate number of tokens per review
            num_tokens = len(tokenizer(" ".join(sents)))
            if num_tokens > self.t_max:
                continue
            review["sentences"] = sents
            reviews.append(review)
            entities.add(review["ty_id"])
            num_exts += len(exts)
    print("Average number of extractions per review: {}".format(
        num_exts / (0.0 + len(reviews))))
    return reviews, entities
def main(root_path):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--json', dest='json', default='kvret_train_public.json',
                        help='process json file')
    args = parser.parse_args()
    task = args.json.split('_')[1]
    with open(os.path.join(root_path, args.json)) as f:
        dialogues = json.load(f)
    with open(os.path.join(root_path, 'kvret_entities.json')) as f:
        entities_dict = json.load(f)
    # drop poi and poi_type here.
    global_kb_type = ['distance', 'traffic_info', 'location', 'weather_attribute',
                      'temperature', "weekly_time", 'event', 'time', 'date', 'party',
                      'room', 'agenda']
    global_temp = []
    di = {}
    # connect infos with '_' and map from original str to str with '_'
    for e in global_kb_type:
        for p in map(lambda x: str(x).lower(), entities_dict[e]):
            if "_" in p and p.replace("_", " ") != p:
                di[p.replace("_", " ")] = p
            else:
                if p != p.replace(" ", "_"):
                    di[p] = p.replace(" ", "_")
    global_temp.append(di)
    example_kbs = []
    for d in dialogues:
        roots = []
        if (d['scenario']['task']['intent'] == "navigate"):  # "schedule" "navigate"
            print("#navigate#")
            temp = []
            names = {}
            # iterate through all kb infos.
            for el in d['scenario']['kb']['items']:
                poi = " ".join(tokenizer(el['poi'].replace("'", " "))).replace(" ", "_").lower()
                slots = ['poi', 'distance', 'traffic_info', 'poi_type', 'address']
                # remove "'" and convert to lower
                for slot in slots:
                    el[slot] = " ".join(tokenizer(el[slot].replace("'", " "))).lower()
                names[el['poi']] = poi
                di = {
                    el['distance']: el['distance'].replace(" ", "_"),
                    el['traffic_info']: el['traffic_info'].replace(" ", "_"),
                    el['poi_type']: el['poi_type'].replace(" ", "_"),
                    el['address']: el['address'].replace(" ", "_"),
                }
                print("0 " + di[el['distance']] + " " + di[el['traffic_info']] + " "
                      + di[el['poi_type']] + " poi " + poi)
                print("0 " + poi + " distance " + di[el['distance']])
                print("0 " + poi + " traffic_info " + di[el['traffic_info']])
                print("0 " + poi + " poi_type " + di[el['poi_type']])
                print("0 " + poi + " address " + di[el['address']])
                temp.append(di)
                # construct tree root for each kb item
                root = Node(poi, 'poi', layer=0)
                # skip poi itself
                for slot in slots[1:]:
                    root.children.append(Node(di[el[slot]], slot, layer=1))
                roots.append(root)
            # used for later entity matching
            temp += global_temp
            # drop last one.
            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()
            j = 1
            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                # replace entity names with names joined by "_"
                bot, user = entity_replace(temp, bot, user, names)
                navigation = global_kb_type  # ['distance','traffic_info']
                nav_poi = ['address', 'poi', 'type']
                gold_entity = []
                for key in bot.split(' '):
                    for e in navigation:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)
                    for e in entities_dict['poi']:
                        for p in nav_poi:
                            if (key == str(e[p]).lower()):
                                gold_entity.append(key)
                            elif (key == str(e[p]).lower().replace(" ", "_")):
                                gold_entity.append(key)
                # gold entity for each turn of dialogue.
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1
            print("")
        elif (d['scenario']['task']['intent'] == "weather"):  # "weather"
            print("#weather#")
            temp = []
            j = 1
            print("0 today " + d['scenario']['kb']['items'][0]["today"])
            today = d['scenario']['kb']['items'][0]["today"]
            for el in d['scenario']['kb']['items']:
                for el_key in el.keys():
                    el[el_key] = " ".join(tokenizer(el[el_key])).lower()
                loc = el['location'].replace(" ", "_")
                di = {el['location']: loc}
                temp.append(di)
                days = ["monday", "tuesday", "wednesday", "thursday", "friday",
                        "saturday", "sunday"]
                for day in days:
                    print("0 " + loc + " " + day + " "
                          + el[day].split(',')[0].rstrip().replace(" ", "_"))
                    print("0 " + loc + " " + day + " " + el[day].split(',')[1].split(" ")[1]
                          + " " + el[day].split(',')[1].split(" ")[3])
                    print("0 " + loc + " " + day + " " + el[day].split(',')[2].split(" ")[1]
                          + " " + el[day].split(',')[2].split(" ")[3])
                # construct tree root for each kb item
                # root = Node(loc, 'location', layer=0)
                slots = ['weather', 'high', 'low']
                for day in days:
                    root = Node(loc, 'location', layer=0)
                    '''
                    tmp = Node(el[day], day, layer=1)
                    val = el[day]
                    splits = [item.strip() for item in val.split(',')]
                    tmp.children.append(Node(splits[0], 'weather', layer=2))
                    tmp.children.append(Node(splits[1], splits[1].split()[0], layer=2))
                    tmp.children.append(Node(splits[2], splits[2].split()[0], layer=2))
                    root.children.append(tmp)
                    '''
                    # change weather to a 1-layer tree.
                    val = el[day]
                    splits = [item.strip() for item in val.split(',')]
                    root.children.append(Node(day, 'date', layer=1))
                    # more delicate for vals
                    root.children.append(Node(splits[1], splits[1].split()[0], layer=1))
                    root.children.append(Node(splits[2], splits[2].split()[0], layer=1))
                    # missing in the original dataset...
                    if today == day:
                        root.children.append(Node('yes', 'today', layer=1))
                    else:
                        root.children.append(Node('no', 'today', layer=1))
                    roots.append(root)
            temp += global_temp
            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()
            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                bot, user = entity_replace(temp, bot, user)
                weather = global_kb_type  # ['location', 'weather_attribute','temperature',"weekly_time"]
                gold_entity = []
                for key in bot.split(' '):
                    for e in weather:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1
            print("")
        if (d['scenario']['task']['intent'] == "schedule"):  # "schedule"
            print("#schedule#")
            temp = []
            names = {}
            j = 1
            # for all kb triples
            if (d['scenario']['kb']['items'] != None):
                for el in d['scenario']['kb']['items']:
                    for el_key in el.keys():
                        el[el_key] = " ".join(tokenizer(el[el_key])).lower()
                    ev = el['event'].replace(" ", "_")
                    names[el['event']] = ev
                    slots = ['time', 'date', 'party', 'room', 'agenda']
                    di = {}
                    for slot in slots:
                        if el[slot] == "-":
                            continue
                        if slot == "time":
                            print("0 " + ev + " " + slot + " " + el[slot].replace(" ", ""))
                            di[el[slot]] = el[slot].replace(" ", "")
                        else:
                            print("0 " + ev + " " + slot + " " + el[slot].replace(" ", "_"))
                            di[el[slot]] = el[slot].replace(" ", "_")
                    temp.append(di)
                    root = Node(ev, 'event', layer=0)
                    for slot in slots:
                        tmp = Node(el[slot], slot, layer=1)
                        root.children.append(tmp)
                    roots.append(root)
            temp += global_temp
            if (len(d['dialogue']) % 2 != 0):
                d['dialogue'].pop()
            for i in range(0, len(d['dialogue']), 2):
                user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
                bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
                bot, user = entity_replace(temp, bot, user, names)
                calendar = global_kb_type  # ['event','time', 'date', 'party', 'room', 'agenda']
                gold_entity = []
                for key in bot.split(' '):
                    for e in calendar:
                        for p in map(lambda x: str(x).lower(), entities_dict[e]):
                            if (key == p):
                                gold_entity.append(key)
                            elif (key == str(p).replace(" ", "_")):
                                gold_entity.append(key)
                gold_entity = list(set(gold_entity))
                if bot != "" and user != "":
                    print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
                    j += 1
            print("")
        # add to example kbs.
        example_kbs.append(roots)
    # next step: save to file.
    with open(os.path.join(root_path, '{}_example_kbs.dat'.format(task)), 'wb') as f:
        pickle.dump(example_kbs, f)
def main():
    # Open all files related to removing stop words or punctuation from the data.
    sw_in = open(r"../data/arstoplist.txt")
    stopwords = sw_in.read().splitlines()
    punctlist = open("../data/arabpunct.txt").read().splitlines()
    directory = sys.argv[1]  # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")
    st = ISRIStemmer()
    # rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []
    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f, 'r')
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)
    print("Files read.")

    stemmed = {}
    types = {}
    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []
    for line in filelist:
        # line = line.strip()
        # tokenize = word_tokenize(line)
        # Tokenize the text.
        tokenize = tokenizer(line)
        # tokenize.sort()  # Comment this out after the test set has been used?
        # Define all patterns that shall be excluded.
        # This excludes Arabic words that have numbers attached to them.
        rx_ar = re.compile(u'^[\u0621-\u064A]+$')
        rx_ar2 = re.compile(u'^(\u0622{2,})')
        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()  # add back in

    for w in compl_list:
        types[w] = 0
    # if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
    #     if len(w) > 1:  # ERROR
    #         new_w = w[:-1]  # ERROR! This strips off Arabic letters although they are not in the punctlist
    #         types[new_w] = 0
    #         tokens.append(new_w)
    #     else:
    #         types[w] = 0
    #         tokens.append(w)
    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # Print the stemmed words and their unstemmed versions to a file.
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        # Don't save words that are longer than 4 letters. Verbs in Arabic are usually
        # 3 letters long; in very rare cases they can be 2 or 4 letters long as well.
        if len(w) > 4:
            pass
        else:
            wordlist.append(w)
            # f_out.write(w + "\t" + stemmed[w])
            # f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs:" + str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # Handle some corpus stats.
    corp_stat = Counter(tokens)
    for w in list(corp_stat.keys())[0:11]:
        print("token: " + w + "\tno.: " + str(corp_stat[w]))
def __init_tools(self):
    test = "Just a test not for printing out or other use "
    tagger(tokenizer(test))
    segmenter(test)
    test = wn.synsets('test')
def extract_words(string, lowercase=True, rm_num=True):
    return [
        w.lower() if lowercase else w
        for w in tokenizer(string)
        if not rm_num or w.isalpha()
    ]
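# Hedged usage sketch (assumes tokenizer is nltk.word_tokenize, which splits off
# punctuation as separate tokens): digit and punctuation tokens are dropped when
# rm_num is True, and everything is lowercased by default.
print(extract_words("3 Cats chased 2 dogs!"))  # -> ['cats', 'chased', 'dogs']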
for c in column_names:
    entity_set.append(str(kb[c]).lower())
    if c != "name":
        print("0 " + str(kb['name']).lower() + " " + c + " "
              + str(kb[c]).lower() + " name " + c)
entity_set = list(set(entity_set))
# dialog
if (len(d['dialogue']) % 2 != 0):
    d['dialogue'].pop()
j = 1
for i in range(0, len(d['dialogue']), 2):
    user = " ".join(cleaner(tokenizer(str(d['dialogue'][i]['data']['utterance']).lower())))
    bot = " ".join(cleaner(tokenizer(str(d['dialogue'][i + 1]['data']['utterance']).lower())))
    gold_entity = []
    for key in bot.split(' '):
        if key in entity_set:
            gold_entity.append(key)
    gold_entity = list(set(gold_entity))
    if user != "" and bot != "":
        print(str(j) + " " + user + '\t' + bot + '\t' + str(gold_entity))
        j += 1
print("")
def tokenize_string(s):
    return tokenizer().tokenize(s)
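# Note the different convention here: tokenizer is a tokenizer *class* that gets
# instantiated on every call. A hedged, self-contained equivalent using a concrete
# NLTK tokenizer class (an assumption, not necessarily the class used in the source):
from nltk.tokenize import TreebankWordTokenizer

def tokenize_string_example(s):
    return TreebankWordTokenizer().tokenize(s)

# tokenize_string_example("Don't panic.") -> ['Do', "n't", 'panic', '.']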
import sys

from nltk import word_tokenize as tokenizer

with open(sys.argv[1]) as f_in, open(sys.argv[2], "w") as f_out:
    for c, l in enumerate(f_in):
        f_out.write(" ".join(tokenizer(l)) + "\n")
        # if c % 1000 == 0:
        #     print(c)
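# Hedged usage note (the script name below is an assumption): invoked as
#   python tokenize_lines.py raw_corpus.txt tokenized_corpus.txt
# the script above writes one whitespace-joined, NLTK-tokenized line per input line.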
def main():
    # Define which corpora to work with via sys.argv[1].
    corpora = sys.argv[1]
    # Define input data.
    k50 = "../out/mallet/testdez/" + corpora + "-50.txt"
    k100 = "../out/mallet/testdez/" + corpora + "-100.txt"
    k200 = "../out/mallet/testdez/" + corpora + "-200.txt"
    # Load ISRIStemmer.
    st = ISRIStemmer()
    # Create lists: all_plots, all_means.
    all_plots = []
    all_means = []
    # Loop over the three files.
    for i in (k50, k100, k200):
        # Open file, read it into variable f, close file.
        f_in = open(i)
        f = f_in.readlines()
        f_in.close()
        # Create lists: words, stemlist.
        words = []
        stemlist = []
        # Loop over the lines in f. Tokenize words, delete the numbers at the
        # beginning of each line (0:4). Append line to words.
        for line in f:
            line = tokenizer(line)
            del line[0:4]
            words.append(line)
        # Loop over words. Stem each word and append to stemlist.
        for listitem in words:
            stems = []
            for w in listitem:
                r = st.stem(w)
                stems.append(r)
            stemlist.append(stems)
        # Create lists: score, plotdata.
        score = []
        plotdata = []
        # Loop over the topics in stemlist. Count each stem in a dictionary d:
        #   if the stem is in d: add 1 to its value.
        #   else: add the stem to d with value 1.
        for topic in stemlist:
            d = {}
            for item in topic:
                if item in d:
                    d[item] += 1
                else:
                    d[item] = 1
            # Get the count of the most frequent stem and append it to plotdata.
            maximum = max(d, key=d.get)
            plotdata.append(d[maximum])
            # Calculate the score: 1 / len(d).
            # Append each d_score to score.
            d_score = 1 / len(d)
            score.append(d_score)
        # Calculate the mean of score. Append to all_means.
        mean = np.mean(score)
        all_means.append(mean)
        # Append plotdata to all_plots.
        all_plots.append(plotdata)
        print(plotdata)
    # Create figure: boxplot with data from "all_plots".
    xtick50 = "k=50, mean score over \n all topics: " + str(round(all_means[0], 4))
    xtick100 = "k=100, mean score over \n all topics: " + str(round(all_means[1], 4))
    xtick200 = "k=200, mean score over \n all topics: " + str(round(all_means[2], 4))
    fig = plt.figure(1, figsize=(9, 6))
    ax = fig.add_subplot(111)
    ax.boxplot(all_plots)
    ax.set_xticklabels([xtick50, xtick100, xtick200])
    ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ax.set_ylabel("Highest value of root repetition per topic", rotation='vertical')
    ax.set_xlabel("k = topics")
    ax.set_title("UN")
    fig.savefig('../out/mallet/figures/testdez/un.png', bbox_inches='tight')
def word_dist(text):
    return Counter(
        [w for w in tokenizer(text.lower()) if w.isalpha() and len(w) > 3]
    )
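# Hedged usage sketch, assuming tokenizer is nltk.word_tokenize: non-alphabetic tokens
# and tokens of three characters or fewer are discarded before counting.
# word_dist("Spam, spam, spam, eggs and spam!")  ->  Counter({'spam': 4, 'eggs': 1})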