def find_ngrams_by_tagged_words(tagged_ngrams, tagged_words, window_size=None):
    # return the sentences of the tagged n-grams whose (flattened) tokens
    # contain the flattened tagged_words as a contiguous sublist
    ngrams = []
    # flatten the query once, dropping empty entries; materialize the filter
    # because Python 3 filters are one-shot iterators
    w = list(filter(None, utils.flatten_list(tagged_words)))
    if window_size:
        window_size = int(window_size)
    for tagged_ngram in tagged_ngrams:
        if window_size:
            # strip the context window from both ends before matching
            t = list(filter(None, utils.flatten_list(
                tagged_ngram[window_size:-window_size])))
        else:
            t = list(filter(None, utils.flatten_list(tagged_ngram)))
        if utils.is_sublist_of(w, t):
            ngrams.append(utils.sentence_from_tagged_ngram(tagged_ngram))
    return ngrams
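
# Illustrative, standalone sketch of the contiguous-sublist test that
# find_ngrams_by_tagged_words relies on. `_is_sublist_of` here is a
# hypothetical stand-in; the repo's utils.is_sublist_of may be implemented
# differently.
def _is_sublist_of(needle, haystack):
    needle, haystack = list(needle), list(haystack)
    return any(haystack[i:i + len(needle)] == needle
               for i in range(len(haystack) - len(needle) + 1))

assert _is_sublist_of(['net', 'income'], ['the', 'net', 'income', 'rose'])
assert not _is_sublist_of(['net', 'rose'], ['the', 'net', 'income', 'rose'])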
def _load_aaer_test_data(self, doc_length, one_to_n=False):
    # data only contains test files, to save computing & memory costs
    self.save_dir = const.GENERATED_DATA_DIR
    if one_to_n:
        self.dict_save_fname = os.path.join(
            self.save_dir, "%s%s_1_to_%d.%s" %
            (const.DL_DOC_DICT_PREFIX, self.__class__.__name__, doc_length,
             const.PICKLE_FILE_EXTENSION))
    else:
        self.dict_save_fname = os.path.join(
            self.save_dir, "%s%s_%d.%s" %
            (const.DL_DOC_DICT_PREFIX, self.__class__.__name__, doc_length,
             const.PICKLE_FILE_EXTENSION))
    try:
        # load the cached doc-vector dict if a previous run saved one
        logging.info("loading saved data from %s" % self.dict_save_fname)
        with open(self.dict_save_fname, 'rb') as f:
            self._docvec_dict = pickle.load(f)
    except FileNotFoundError:
        logging.info("%s not found. Start building..." % self.dict_save_fname)
        test_files = ft.list_file_paths_under_dir(const.TEST_DIR, ['txt'])
        docs = []
        for test_file in test_files:
            if one_to_n:
                # 1..n grams: flatten the per-n lists into one list of docs
                docs += utils.flatten_list(
                    ex_parsing.one_to_n_grams_from_file(
                        ft.get_source_file_by_example_file(test_file),
                        n=doc_length))
            else:
                docs += ex_parsing.ngrams_from_file(
                    ft.get_source_file_by_example_file(test_file),
                    n=doc_length)
        self._make_docvec_dict(docs)
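
# Standalone sketch of the load-or-build pickle cache pattern used above.
# The path and builder function are hypothetical; in the real method the
# building and saving are delegated to self._make_docvec_dict.
import pickle

def load_or_build(cache_path, build_fn):
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        data = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(data, f)
        return data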
def score_by_rouge(words_found, test_entity_dict, entity_key):
    logging.info('score_by_rouge: words_found: ' + str(words_found))
    score = 0
    targets = 1
    if entity_key in test_entity_dict:
        answers = [util.flatten_list(test_entity_dict[entity_key])]
        words_found = [] if words_found is None else util.flatten_list(
            words_found)
        logging.debug('answers: %s' % answers)
        score += rouge.rouge_1(words_found, answers, alpha=0.5)
    elif not words_found:
        # neither the answer key nor the search found anything similar to
        # the example, so count this as a hit
        score += 1
    # rouge_2 is not used because it always returns 0 for a single word
    return score, targets
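
# Rough standalone sketch of the unigram-overlap F-score that a ROUGE-1
# call computes; the repo's rouge.rouge_1 (and its alpha convention) may
# differ in detail, so treat this only as intuition for the score above.
def rouge_1_sketch(candidate, references, alpha=0.5):
    cand = set(candidate)
    best = 0.0
    for ref in references:
        overlap = len(cand & set(ref))
        if not overlap:
            continue
        p, r = overlap / len(cand), overlap / len(set(ref))
        # weighted harmonic mean of precision and recall
        best = max(best, p * r / (alpha * r + (1 - alpha) * p))
    return best

print(rouge_1_sketch(['net', 'income'], [['net', 'income', 'rose']]))  # 0.8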
def score(self, key, gram, test_file_path, wv_dict, **kwargs):
    print('similar to: ' + str(gram))
    # random baseline: pick one token at random instead of searching for
    # grams similar to `gram`
    answers = [random.choice(util.flatten_list(self.test_tokens))]
    hits, targets = score_by_rouge(answers, self.test_entity_dict, key)
    print("rouge:", hits)
    self.score_dict[test_file_path] += hits
    return targets
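
# Hypothetical setup sketch: score_dict must already contain an entry for
# test_file_path before `+=` runs; seeding it with a defaultdict avoids a
# KeyError. This is illustrative only, not the repo's actual initializer.
from collections import defaultdict

score_dict = defaultdict(float)
score_dict['some/test/file.txt'] += 0.8  # hits accumulate per test file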
def make_wv_dict(self, file_path):
    sentences = ex_parsing.sentences_from_file(
        ft.get_source_file_by_example_file(file_path))
    tokens = list(self.phrases_model.get_bigrams(sentences))
    flat_grams = util.flatten_list(tokens)
    # gensim Phrases joins detected bigrams with a delimiter; split each
    # token back into a tuple of its component words
    flat_grams = [
        tuple(w.split(const.GENSIM_PHRASES_DELIMITER)) for w in flat_grams
    ]
    return self.doc_vector_to_dict_by_list(self.doc_vec_model, flat_grams)
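
# Standalone sketch of undoing gensim Phrases' token joining. gensim joins
# detected collocations with a delimiter ('_' by default); the repo's
# const.GENSIM_PHRASES_DELIMITER is assumed to play the same role.
grams = ['net_income', 'board']
print([tuple(w.split('_')) for w in grams])
# -> [('net', 'income'), ('board',)]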
def dir_to_file_without_punctuations(dir_path, extension='txt',
                                     file_name=None):
    file_names = ft.list_file_paths_under_dir(dir_path, [extension])
    tokens = []
    for fname in file_names:
        temp_tokens, _ = parse_file(fname)
        tokens.extend(util.flatten_list(temp_tokens))
    if not file_name:
        # default to the last two path segments, joined by an underscore
        file_name = '_'.join(dir_path.split('/')[-2:])
    print('saving to:', file_name)
    with open(file_name, 'w') as f:
        f.write(' '.join(tokens))
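
# Sketch of the default-filename rule with a hypothetical path. A trailing
# slash would leave an empty final segment, so callers are assumed to pass
# paths like 'data/aaer/train' rather than 'data/aaer/train/'.
dir_path = 'data/aaer/train'
print('_'.join(dir_path.split('/')[-2:]))  # -> 'aaer_train'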
def tagged_words_to_str(tagged_words):
    # flatten the (word, tag) pairs and join everything into a single
    # delimiter-separated string
    return const.UNIQUE_DELIMITER.join(util.flatten_list(tagged_words))
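
# Illustrative call, assuming a '|' delimiter purely for readability;
# const.UNIQUE_DELIMITER is repo-specific:
#   tagged_words_to_str([('net', 'NN'), ('income', 'NN')])
#   -> 'net|NN|income|NN'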
def make_vec_file_from_wiki_model(sentences, wiki_aaer_vec_name):
    # keep only the vectors of tokens that actually occur in `sentences`,
    # shrinking the full wiki fastText vector file
    flatten_tokens = util.flatten_list(sentences)
    ft.filter_vec_file_by_set(const.FASTTEXT_WIKI_PATH, set(flatten_tokens),
                              wiki_aaer_vec_name)
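
# Standalone sketch of filtering a word-vector file down to a vocabulary,
# which is presumably what ft.filter_vec_file_by_set does. Assumes the
# plain-text fastText .vec format ("word v1 ... vN" per line, usually
# preceded by a "count dim" header line that would also need handling).
def filter_vec_file(src_path, vocab, dst_path):
    with open(src_path, encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            if line.split(' ', 1)[0] in vocab:
                dst.write(line)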