def is_partial_match(query, table_names):
    """Return the lemmatized table name containing the query lemma,
    or False unless exactly one table matches."""
    query = lemma(query)
    table_names = [[lemma(x) for x in names.split(' ')] for names in table_names]
    same_count = 0
    result = None
    for names in table_names:
        if query in names:
            same_count += 1
            result = names
    return result if same_count == 1 else False
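# Usage sketch for is_partial_match (the table names below are invented; the
# lemmatized outputs assume pattern's rule-based lemma()):
from pattern.en import lemma

print(is_partial_match('singers', ['singer concert', 'stadium']))
# ['singer', 'concert']  -- exactly one table contains the query lemma
print(is_partial_match('singers', ['singer', 'singer concert']))
# False                  -- two tables match, so the result is ambiguous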
def _read_errorprob(self):
    # Look up the error probability of the verb, falling back to its lemma.
    try:
        prob_v = FeatureExtractor.dic_errorprobs[self.v]
    except KeyError:
        prob_v = FeatureExtractor.dic_errorprobs[en.lemma(self.v)]
    # The original computed prob_v without using it; returning it matches
    # the method's name.
    return prob_v
def get_rhyme_word(old_word, candidates, pos):
    # Find the candidates in the lexicon
    lemmas = [lemma(candidate['word']) for candidate in candidates]
    filtered_lemmas = filter_candidates(lemmas, pos)
    options = [candidate for candidate in candidates
               if lemma(candidate['word']) in filtered_lemmas]
    if not options:
        return ''
    best_options = [option['word'] for option in options
                    if option['score'] == options[0]['score']]
    closest, score = most_similar(old_word, best_options, pos)
    if score > 2.5:
        return ''
    return closest
def agg_n_grams_by_line(poems, template):
    logging.info('Starting aggregator: agg_n_grams_by_line')
    # First extend all poems to the length of the longest poem
    max_len = max(len(poem.poem) for poem in poems)
    extended_poems = [poem.poem + [''] * (max_len - len(poem.poem)) for poem in poems]
    # Then zip them together and look at one line at a time
    # (the first line of each poem, the second line of each poem, etc.)
    poem_lines = zip(*extended_poems)
    n_grams_by_line = []
    for line in poem_lines:
        n_grams = []
        for poem_line in line:
            # Get the n-grams of this line for all n up to the line length,
            # keeping only those that are not just stop words
            split_line = get_tokenized_words(poem_line)
            split_line = [lemma(word) for word in split_line]
            for n in range(1, len(split_line)):
                grams = ngrams(split_line, n)
                n_grams.extend(gram for gram in grams if len(set(gram) - stop_words))
        n_grams_by_line.append(n_grams)
    # Now keep only the n-grams that occur with some significant frequency
    min_num_occurrences = round(len(poems) * 0.1)
    for n_grams_line in n_grams_by_line:
        counts = Counter(n_grams_line)
        template.n_grams_by_line.append([(' '.join(gram), count)
                                         for gram, count in counts.items()
                                         if count > min_num_occurrences])
    reduced_n_grams_by_line = [remove_redundant_substring_occurences(entry, min_num_occurrences)
                               for entry in template.n_grams_by_line]
    template.n_grams_by_line = reduced_n_grams_by_line
    logging.info('Aggregator finished: agg_n_grams_by_line')
def _collect_words_d(word_list):
    cleaned_list = []
    for w in word_list:  # type: str
        # Strip leading/trailing whitespace, then lemmatize
        w = w.strip()
        w = lemma(w)
        cleaned_list.append(w)
    return cleaned_list
def _find_verb_idx(self):
    # First try to match the verb's surface form directly.
    verbpos = [idx for idx, sufpos in enumerate(zip(self.SUF, self.POS))
               if sufpos[0] == self.v and "VB" in sufpos[1]]
    if verbpos:
        return verbpos[0]
    # Otherwise retry after lemmatizing each surface form.
    SUF_l = [en.lemma(w) for w in self.SUF]
    verbpos = [idx for idx, sufpos in enumerate(zip(SUF_l, self.POS))
               if sufpos[0] == self.v and "VB" in sufpos[1]]
    if verbpos:
        return verbpos[0]
    return None
def access(self, uri, lemma):
    # Count word frequencies from a one-word-per-line file,
    # optionally lemmatizing each entry.
    with open(uri, 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                if lemma is not False:
                    line = en.lemma(line)
                self.vocab_dict[line] = self.vocab_dict.get(line, 0) + 1
    self.size = len(self.vocab_dict)
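# Minimal harness for access() above; the Vocab class and temp file are
# invented for illustration. Note the `lemma` parameter shadows pattern's
# lemma() inside the method, which is why the body calls en.lemma explicitly.
import tempfile

class Vocab:
    def __init__(self):
        self.vocab_dict = {}
        self.size = 0

v = Vocab()
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
    tmp.write('cats\ncat\nran\n')
access(v, tmp.name, lemma=True)  # call the unbound function with v as self
print(v.vocab_dict)              # {'cat': 2, 'run': 1}
print(v.size)                    # 2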
def word_element(file_urls=()):
    # For each file, write "word lemma" pairs for words not in the simple-word list.
    for url in file_urls:
        result = []
        with open(const.clean_path + url, 'r', encoding='utf-8') as file:
            for line in file:
                for word in re.split(r'\s', line):
                    if word.strip() != '':
                        s = en.lemma(word)
                        if s not in const.simple_word:
                            result.append(word + " " + s)
        with open(const.element_path + url, 'w', encoding='utf-8') as fp:
            fp.writelines([entry + '\n' for entry in result])
def srl(self, v_idx=None):
    try:
        if not v_idx:
            v_idx = self.v_idx
        self.tmp_ARG0 = []
        self.tmp_ARG1 = []
        self.tmp_PRED = defaultdict(dict)
        ARGS = [(l[FeatureExtractor.col_srlrel], l[FeatureExtractor.col_suf],
                 l[FeatureExtractor.col_pos], l[FeatureExtractor.col_netag])
                for l in self.tags
                if l[FeatureExtractor.col_srl] != "_"
                and int(l[FeatureExtractor.col_srl]) - 1 == v_idx]
        if ARGS:
            # Semantic-role features keyed on the lemmatized argument head.
            srlf = {FeatureExtractor.gen_fn(["SRL", t[0], en.lemma(t[1])]): 1 for t in ARGS}
            # srlp = {FeatureExtractor.gen_fn(["SRL", t[0], en.lemma(t[1]) + "/" + t[2]]): 1 for t in ARGS}
            srln = {FeatureExtractor.gen_fn(["SRL", t[0], t[3]]): 1 for t in ARGS if not t[3] == "_"}
            self.features.update(srlf)
            # self.features.update(srlp)
            self.features.update(srln)
    except Exception as e:  # Python 3 syntax (was the Python 2 form `except Exception, e:`)
        logging.debug(pformat(e))
def agg_n_grams(poems, template):
    logging.info('Starting aggregator: agg_n_grams')
    n_grams_by_poem = []
    for poem in poems:
        full_poem = ' '.join(poem.poem)
        n_grams = []
        split_poem = get_tokenized_words(full_poem)
        split_poem = [lemma(word) for word in split_poem]
        for n in range(1, len(split_poem)):
            grams = ngrams(split_poem, n)
            n_grams.extend(gram for gram in grams if len(set(gram) - stop_words))
        n_grams_by_poem.extend(n_grams)
    # Now keep only the n-grams that occur with some significant frequency
    min_num_occurrences = round(len(poems) * 0.1)
    counts = Counter(n_grams_by_poem)
    template.n_grams.extend([(' '.join(gram), count)
                             for gram, count in counts.items()
                             if count > min_num_occurrences + 1])
    template.n_grams = remove_redundant_substring_occurences(template.n_grams, min_num_occurrences)
    logging.info('Aggregator finished: agg_n_grams')
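# Both aggregators share the same core: lemmatize the tokens, collect every
# n-gram size, and drop grams made up entirely of stop words. A self-contained
# sketch of that core, assuming NLTK's ngrams and a toy stop-word set:
from nltk import ngrams
from pattern.en import lemma

stop_words = {'the', 'a', 'at'}
tokens = [lemma(w) for w in 'the dogs barked at the mailman'.split()]

n_grams = []
for n in range(1, len(tokens)):
    n_grams.extend(g for g in ngrams(tokens, n) if set(g) - stop_words)

print(n_grams[:3])  # [('dog',), ('bark',), ('mailman',)]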
def _collect_words(self, word_list):
    """
    Data cleaning.
    :param word_list:
    :return:
    """
    for w in word_list:
        # TODO: detect numbers
        if len(re.findall(is_num, w)) > 0:
            continue
        w_len = len(w)
        if w_len == 0:
            continue
        # Some keywords such as C and R must not be lowercased and are kept
        # as-is, because lemma() lowercases every word.
        # if w_len == 1:
        #     rank_list[w] += 1
        # Lowercase first so the word can be filtered against the stop list
        if w_len > 1:
            w = w.lower()
        # Remove stop words before lemmatizing...
        if w in stopwords:
            continue
        w = lemma(w)
        # ...and again afterwards, since a lemma can itself be a stop word
        if w in stopwords:
            continue
        # Alternative normalizers, kept for reference:
        # w = stemmer.stem(w)
        # w = lemmatizer.lemmatize(w)
        # w = singularize(w)
        self.rank_list[w] += 1
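# Why filter stop words both before and after lemma(): lemmatization can
# collapse a non-stop surface form onto a stop word. A toy demonstration
# with an invented stop list:
from pattern.en import lemma

stopwords = {'do', 'be', 'have'}
w = 'doing'
print(w in stopwords)          # False -- survives the first check
print(lemma(w))                # 'do'
print(lemma(w) in stopwords)   # True  -- caught by the second check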
from pattern.text.en import conjugate, lemma, lexeme

print(lemma('gave'))   # 'give'
print(lexeme('gave'))  # ['give', 'gives', 'giving', 'gave', 'given']
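# Caveat, hedged: pattern 3.6 is known to trip over PEP 479 on Python 3.7+,
# where the first call into its inflection code can raise
# RuntimeError ("generator raised StopIteration"). A common workaround is to
# absorb one warm-up call:
from pattern.text.en import lemma

try:
    lemma('gave')
except RuntimeError:
    pass
print(lemma('gave'))  # 'give' once the warm-up has run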
def partial_match(query, table_name):
    query = [lemma(x) for x in query]
    table_name = [lemma(x) for x in table_name]
    # Membership test: True only when the whole lemmatized query list
    # appears as a single element of table_name.
    return query in table_name
def alter_column0(datas):
    """
    Attach column * table
    :return: model_result_replace
    """
    zero_count = 0
    count = 0
    result = []
    for d in datas:
        if 'C(0)' in d['model_result']:
            pattern = regex.compile(r'C\(.*?\) T\(.*?\)')
            result_pattern = list(set(pattern.findall(d['model_result'])))
            ground_col_labels = []
            for pa in result_pattern:
                pa = pa.split(' ')
                if pa[0] != 'C(0)':
                    index = int(pa[1][2:-1])
                    ground_col_labels.append(index)
            ground_col_labels = list(set(ground_col_labels))
            question_arg_type = d['question_arg_type']
            question_arg = d['question_arg']
            table_names = [[lemma(x) for x in names.split(' ')]
                           for names in d['table_names']]
            origin_table_names = [[wordnet_lemmatizer.lemmatize(x.lower()) for x in names.split(' ')]
                                  for names in d['table_names']]
            count += 1
            easy_flag = False
            for q_ind, q in enumerate(d['question_arg']):
                q = [lemma(x) for x in q]
                q_str = " ".join(" ".join(x) for x in d['question_arg'])
                if 'how many' in q_str or 'number of' in q_str or 'count of' in q_str:
                    easy_flag = True
            if easy_flag:
                # Check whether the preceding words form "how many" / "number of" / "count of"
                for q_ind, q in enumerate(d['question_arg']):
                    if (q_ind > 0 and q == ['many'] and d['question_arg'][q_ind - 1] == ['how']) or \
                            (q_ind > 0 and q == ['of'] and d['question_arg'][q_ind - 1] == ['number']) or \
                            (q_ind > 0 and q == ['of'] and d['question_arg'][q_ind - 1] == ['count']):
                        match = multi_equal(question_arg_type, q_ind, ['table'], 2)
                        if match is not False:
                            # This step works for the "number of [table]" example
                            table_result = table_names[origin_table_names.index(question_arg[match])]
                            result.append((d['query'], d['question'], table_result, d))
                            break
                        else:
                            match = multi_option(question_arg, q_ind, d['table_names'], 2)
                            if match is not False:
                                table_result = match
                                result.append((d['query'], d['question'], table_result, d))
                            else:
                                match = multi_equal(question_arg_type, q_ind, ['table'], len(question_arg_type))
                                if match is not False:
                                    # This step works for the "number of [table]" example
                                    table_result = table_names[origin_table_names.index(question_arg[match])]
                                    result.append((d['query'], d['question'], table_result, d))
                                    break
                                table_result = random_choice(question_arg=question_arg,
                                                             question_arg_type=question_arg_type,
                                                             names=table_names,
                                                             ground_col_labels=ground_col_labels,
                                                             q_ind=q_ind,
                                                             N=2,
                                                             origin_name=origin_table_names)
                                result.append((d['query'], d['question'], table_result, d))
                                zero_count += 1
                            break
            else:
                M_OP = False
                for q_ind, q in enumerate(d['question_arg']):
                    if M_OP is False and q in [['than'], ['least'], ['most'], ['msot'], ['fewest']] or \
                            question_arg_type[q_ind] == ['M_OP']:
                        M_OP = True
                        match = multi_equal(question_arg_type, q_ind, ['table'], 3)
                        if match is not False:
                            # This step works for the "number of [table]" example
                            table_result = table_names[origin_table_names.index(question_arg[match])]
                            result.append((d['query'], d['question'], table_result, d))
                            break
                        else:
                            match = multi_option(question_arg, q_ind, d['table_names'], 3)
                            if match is not False:
                                table_result = match
                                # print(table_result)
                                result.append((d['query'], d['question'], table_result, d))
                            else:
                                # zero_count += 1
                                match = multi_equal(question_arg_type, q_ind, ['table'], len(question_arg_type))
                                if match is not False:
                                    # This step works for the "number of [table]" example
                                    table_result = table_names[origin_table_names.index(question_arg[match])]
                                    result.append((d['query'], d['question'], table_result, d))
                                    break
                                table_result = random_choice(question_arg=question_arg,
                                                             question_arg_type=question_arg_type,
                                                             names=table_names,
                                                             ground_col_labels=ground_col_labels,
                                                             q_ind=q_ind,
                                                             N=2,
                                                             origin_name=origin_table_names)
                                result.append((d['query'], d['question'], table_result, d))
                if M_OP is False:
                    table_result = random_choice(question_arg=question_arg,
                                                 question_arg_type=question_arg_type,
                                                 names=table_names,
                                                 ground_col_labels=ground_col_labels,
                                                 q_ind=q_ind,
                                                 N=2,
                                                 origin_name=origin_table_names)
                    result.append((d['query'], d['question'], table_result, d))
    for entry in result:
        table_names = [[lemma(x) for x in names.split(' ')]
                       for names in entry[3]['table_names']]
        origin_table_names = [names.split(' ') for names in entry[3]['table_names']]
        if entry[2] in table_names:
            entry[3]['rule_count'] = table_names.index(entry[2])
        else:
            entry[3]['rule_count'] = origin_table_names.index(entry[2])
    for data in datas:
        if 'rule_count' in data:
            str_replace = 'C(0) T(' + str(data['rule_count']) + ')'
            replace_result = regex.sub(r'C\(0\) T\(.\)', str_replace, data['model_result'])
            data['model_result_replace'] = replace_result
        else:
            data['model_result_replace'] = data['model_result']
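# The final rewrite step of alter_column0 is plain regex substitution; this
# sketch shows the intended transformation on an invented model_result string
# (using the stdlib re module in place of regex):
import re

model_result = 'R(0) Sel(0) N(0) A(3) C(0) T(1)'
rule_count = 2
print(re.sub(r'C\(0\) T\(.\)', 'C(0) T(' + str(rule_count) + ')', model_result))
# 'R(0) Sel(0) N(0) A(3) C(0) T(2)'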
def re_lemma(string):
    # Fall back to the lowercased input when lemma() returns an empty string.
    lemmatized = lemma(string.lower())
    if len(lemmatized) > 0:
        return lemmatized
    return string.lower()
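# Quick check of re_lemma's two paths (outputs assume pattern's lemmatizer):
print(re_lemma('Gave'))  # 'give' -- the irregular verb is resolved by lemma()
print(re_lemma(''))      # ''     -- an empty lemma falls back to the lowercased input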
def _replace_modifier(modifiers, attr, wn_pos, old_word, candidates):
    """Swap any modifier whose `attr` equals the lemma of old_word for the
    closest-matching candidate, falling back to a random candidate.
    (As in the original, the replacement is always wrapped in phrase_spec.ADJ,
    even for adverbs.)"""
    for index, modifier in enumerate(modifiers):
        if attr in modifier.__dict__ and getattr(modifier, attr) == lemma(old_word):
            option_nodes = [get_node(candidate['word'], wn_pos) for candidate in candidates]
            replacement_nodes = list(closest_matching([get_node(old_word, wn_pos)], option_nodes))
            if replacement_nodes:
                replacement = random.choice(replacement_nodes).id.split()[0]
            else:
                replacement = random.choice(candidates)['word']
            modifiers[index] = phrase_spec.ADJ(replacement)


def replace(old_word, candidates, phrases):
    new_phrases = []
    # Find the word among the phrases and replace it with a candidate of the same POS
    for phrase in phrases:
        if 'noun' in phrase.__dict__ and phrase.noun == lemma(old_word):
            replacement = get_rhyme_word(old_word, candidates, 'N')
            if not replacement:
                phrase.post_modifiers.append(phrase_spec.ADJ(get_rhyme_mod(old_word, candidates, 'A', 'N')))
            else:
                phrase = phrase_spec.NP(replacement)
        if 'verb' in phrase.__dict__ and phrase.verb == lemma(old_word):
            replacement = get_rhyme_word(old_word, candidates, 'V')
            if not replacement:
                phrase.post_modifiers.append(phrase_spec.ADV(get_rhyme_mod(old_word, candidates, 'AVP', 'V')))
            else:
                phrase = phrase_spec.VP(replacement)
        if 'np' in phrase.__dict__ and phrase.np.noun == lemma(old_word):
            replacement = get_rhyme_word(old_word, candidates, 'N')
            if not replacement:
                phrase.np.post_modifiers.append(phrase_spec.ADJ(get_rhyme_mod(old_word, candidates, 'A', 'N')))
            else:
                phrase.np = phrase_spec.NP(replacement)
        # The same adjective/adverb replacement applies to all three modifier slots
        for modifier_list in (phrase.pre_modifiers, phrase.modifiers, phrase.post_modifiers):
            _replace_modifier(modifier_list, 'adjective', 'a', old_word, candidates)
            _replace_modifier(modifier_list, 'adverb', 'adv', old_word, candidates)
        new_phrases.append(phrase)
    if not new_phrases:
        new_phrases = phrases
    return new_phrases
def collate_text(batch):
    batch_posts, batch_response = list(zip(*batch))
    max_post_len_list = [max(len(posts[i]) for posts in batch_posts) + 2 for i in range(4)]
    max_response_len = max(len(response) for response in batch_response) + 2
    post_1, post_2, post_3, post_4 = [], [], [], []
    post_length_1, post_length_2, post_length_3, post_length_4 = [], [], [], []
    response = []
    response_length = []

    def padding(sent, length):
        """Add sos and eos tokens, then pad sentence to length."""
        return ['_SOS'] + sent + ['_EOS'] + (['_PAD'] * (length - len(sent) - 2))

    for posts in batch_posts:
        post_1.append(padding(posts[0], max_post_len_list[0]))
        post_2.append(padding(posts[1], max_post_len_list[1]))
        post_3.append(padding(posts[2], max_post_len_list[2]))
        post_4.append(padding(posts[3], max_post_len_list[3]))
        post_1[-1] = list(map(transform, post_1[-1]))
        post_2[-1] = list(map(transform, post_2[-1]))
        post_3[-1] = list(map(transform, post_3[-1]))
        post_4[-1] = list(map(transform, post_4[-1]))
        post_length_1.append(len(posts[0]) + 2)
        post_length_2.append(len(posts[1]) + 2)
        post_length_3.append(len(posts[2]) + 2)
        post_length_4.append(len(posts[3]) + 2)

    for i in range(len(batch_response)):
        sample_response = batch_response[i]
        response.append(padding(sample_response, max_response_len))
        response[-1] = list(map(transform, response[-1]))
        response_length.append(len(sample_response) + 2)

    entity = [[], [], [], []]
    for posts in batch_posts:
        for i in range(4):
            entity[i].append([])
            for j in range(len(posts[i])):
                word = posts[i][j]
                try:
                    lemmatized = lemma(word)
                except UnicodeEncodeError:
                    lemmatized = word
                if lemmatized in relation:
                    entity[i][-1].append([list(map(transform, triple))
                                          for triple in relation[lemmatized]])
                else:
                    entity[i][-1].append([[4, 4, 4]])  # naf_h, naf_r, naf_t
    # entity[i][j][k][l]: l-th triple whose head entity is the k-th word
    # of the i-th post in the j-th sample

    max_triple_len = [0, 0, 0, 0]
    for i in range(4):
        for j in range(len(entity[i])):
            for k in range(len(entity[i][j])):
                if len(entity[i][j][k]) > max_triple_len[i]:
                    max_triple_len[i] = len(entity[i][j][k])

    entity_list = []
    entity_mask_list = []
    entity_length_list = []
    for i in range(4):
        entity_list.append(np.array(list(zip_longest(*entity[i], fillvalue=[[4, 4, 4]]))).T)
        entity_list[i] = np.array([
            np.pad(triples,
                   pad_width=((0, max_triple_len[i] - len(triples)), (0, 0)),
                   mode='constant',
                   constant_values=4)
            for sample in entity_list[i] for triples in sample
        ])
        entity_list[i] = entity_list[i].reshape((len(batch), -1, max_triple_len[i], 3))
        pre_post_fix = np.full((len(batch), 1, max_triple_len[i], 3), 4)
        entity_list[i] = np.concatenate((pre_post_fix, entity_list[i], pre_post_fix), axis=1)

    for i in range(4):
        entity_list[i] = torch.tensor(entity_list[i])
        entity_mask_list.append(entity_list[i][:, :, :, 0] == 4)
        entity_length_list.append(torch.sum((entity_list[i][:, :, :, 0] != 4), dim=2))

    batched_data = {
        'post_1': torch.tensor(post_1),  # (batch_size, max_post_1_len)
        'post_2': torch.tensor(post_2),
        'post_3': torch.tensor(post_3),
        'post_4': torch.tensor(post_4),
        'post_length_1': torch.tensor(post_length_1),  # (batch_size,)
        'post_length_2': torch.tensor(post_length_2),
        'post_length_3': torch.tensor(post_length_3),
        'post_length_4': torch.tensor(post_length_4),
        'response': torch.tensor(response),  # (batch_size, max_response_len)
        'response_length': torch.tensor(response_length),  # (batch_size,)
        'entity_1': entity_list[0],  # (batch_size, max_post_1_len, max_triple_num, 3)
        'entity_2': entity_list[1],
        'entity_3': entity_list[2],
        'entity_4': entity_list[3],
        'entity_mask_1': entity_mask_list[0],  # (batch_size, max_post_1_len, max_triple_num)
        'entity_mask_2': entity_mask_list[1],
        'entity_mask_3': entity_mask_list[2],
        'entity_mask_4': entity_mask_list[3],
        'entity_length_1': entity_length_list[0],  # (batch_size, max_post_1_len)
        'entity_length_2': entity_length_list[1],
        'entity_length_3': entity_length_list[2],
        'entity_length_4': entity_length_list[3],
    }
    return batched_data
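# The padding convention is the part of collate_text most worth sanity-checking;
# the same helper in isolation:
def padding(sent, length):
    """Add sos and eos tokens, then pad sentence to length."""
    return ['_SOS'] + sent + ['_EOS'] + (['_PAD'] * (length - len(sent) - 2))

print(padding(['hello', 'world'], 6))
# ['_SOS', 'hello', 'world', '_EOS', '_PAD', '_PAD']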
def write_hypo(parent, count, list_of_neighbors):
    return_dict = {}
    for index in range(0, len(list_of_neighbors)):
        s = wordnet.synsets(list_of_neighbors[index])
        if len(s) > 0:
            s = s[0]
            synonyms = s.synonyms
            hypernyms = s.hypernyms()
            hyponyms = s.hyponyms()
            holonyms = s.holonyms()
            meronyms = s.meronyms()
            singulars = [singularize(list_of_neighbors[index])]
            plurals = [pluralize(list_of_neighbors[index])]
            comparatives = [comparative(list_of_neighbors[index])]
            superlatives = [superlative(list_of_neighbors[index])]
            lemmas = [lemma(list_of_neighbors[index])]
            # lexeme() already returns a list of word forms, so don't wrap it
            lexemes = lexeme(list_of_neighbors[index])
            # note: tenses() yields tense descriptors, not word forms
            tensess = [tenses(list_of_neighbors[index])]
            # suggest() returns (word, confidence) pairs; keep only the words
            suggests = [w for w, _ in suggest(list_of_neighbors[index])]
            neighbors_with_link_string = None
            if parent in synonyms:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[SYNO]"
            elif parent in hypernyms:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[HYPER]"
            elif parent in hyponyms:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[HYPO]"
            elif parent in holonyms:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[HOLO]"
            elif parent in meronyms:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[MERO]"
            elif parent in singulars:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[PLURAL]"
            elif parent in plurals:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[SINGULAR]"
            elif parent in comparatives:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[COMPA]"
            elif parent in superlatives:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[SUPERLA]"
            elif parent in lemmas:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[LEMMA]"
            elif parent in lexemes:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[LEXEME]"
            elif parent in tensess:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[TENSE]"
            elif parent in suggests:
                neighbors_with_link_string = str(list_of_neighbors[index]) + "[MISPELL]"
            if neighbors_with_link_string:
                # The original indexed return_dict with an undefined `word`;
                # keying on `parent` appears to be the intent.
                try:
                    return_dict[parent][1].append(neighbors_with_link_string)
                except KeyError:
                    return_dict[parent] = (count, [neighbors_with_link_string])
    return return_dict
def extend_phrase(phrases, target_num_syllables, num_syllables):
    logging.info('Extending phrase')
    used = []
    # While the phrase is short, add adjectives and adverbs as modifiers
    # worth at most the missing number of syllables
    while num_syllables < target_num_syllables:
        added_specifier = False
        if target_num_syllables == num_syllables + 1:
            # Exactly one syllable missing: try adding the specifier 'the'
            for phrase in phrases:
                if 'noun' in phrase.__dict__.keys():
                    if phrase.specifier is None:
                        phrase.specifier = 'the'
                        added_specifier = True
                        break
                elif 'np' in phrase.__dict__.keys():
                    if phrase.np.specifier is None:
                        phrase.np.specifier = 'the'
                        added_specifier = True
                        break
        if added_specifier:
            break
        changeable_phrases = []
        for phrase in phrases:
            # Skip capitalized nouns (proper nouns)
            try:
                if 'noun' in phrase.__dict__.keys() and phrase.noun[0].isupper():
                    continue
                changeable_phrases.append(phrase)
            except IndexError:
                changeable_phrases.append(phrase)
        phrase_to_change = phrases.index(random.choice(changeable_phrases))
        pos = 'A'
        target_pos = 'N'
        if 'verb' in phrases[phrase_to_change].__dict__.keys():
            target_word = phrases[phrase_to_change].verb
            target_pos = 'V'
            pos = 'AVP'
        elif 'np' in phrases[phrase_to_change].__dict__.keys():
            target_word = phrases[phrase_to_change].np.noun
        else:
            target_word = phrases[phrase_to_change].noun
        # The modifier must fit within the required number of syllables
        word = ''
        added_syllables = 0
        tries = 10
        while tries:
            try:
                word = get_property(lemma(target_word.split()[-1]), target_pos, used)
            except IndexError:
                word = get_random_word(pos)
            used.append(word)
            added_syllables = count_syllables([word])[0]
            if added_syllables <= (target_num_syllables - num_syllables):
                break
            tries -= 1
        if pos == 'A':
            modifier_phrase = phrase_spec.ADJ(word)
        else:
            modifier_phrase = phrase_spec.ADV(word)
        phrases[phrase_to_change].pre_modifiers.append(modifier_phrase)
        num_syllables += added_syllables
    return phrases