def test_preprocess_corpus_no_lower_no_punc_with_special(self):
    # Flags under test: keep case, strip punctuation, keep special characters.
    # The expected text has no ',', '!' or '?', still starts with 'á', and has
    # the trie phrases joined with underscores.
    raw_text = ('á THIS, is a test string, where we test ! Whether join entity names is working properly '
                'on the test string?!')
    phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    cleaned = read_corpus_util.preprocess_corpus(
        raw_text,
        trie_util.Trie(phrases),
        to_lower=False,
        no_punctuations=True,
        no_special_char=False)
    wanted = ('á THIS is a test_string where we test Whether join_entity_names is working properly '
              'on the_test_string')
    self.assertEqual(cleaned, wanted)
def clean_raw_corpus(save_dir, start=0, end=5000, step=100, to_lower=False, no_punctuations=False,
                     no_special_char=True):
    """Write the preprocessed "Computer Science" papers corpus to text files, one per index chunk.

    The entity names and key phrases read from ``read_corpus_util.kCorpusDirectory``
    (with ``to_lower=False``) are merged into one trie. For every ``i`` in
    ``range(start, end, step)`` the text returned by
    ``read_corpus_util.read_papers_corpus`` for ``start_index=i`` and
    ``end_index=i + step`` is written, UTF-8 encoded, to
    ``save_dir + 'cs_corpus_max_matching_only_<i>.txt'``.

    Args:
        save_dir: Output directory prefix. Must end with "/" because file names
            are appended to it by plain string concatenation.
        start: First chunk start index.
        end: Exclusive upper bound for the chunk start indices.
        step: Distance between consecutive chunk start indices.
        to_lower: Forwarded unchanged to ``read_papers_corpus``.
        no_punctuations: Forwarded unchanged to ``read_papers_corpus``.
        no_special_char: Forwarded unchanged to ``read_papers_corpus``.

    Raises:
        AssertionError: If ``save_dir`` does not end with "/" (an empty string
            included).
    """
    # endswith() also covers the empty string, where save_dir[-1] would raise an
    # unrelated IndexError instead of the intended message.
    if not save_dir.endswith('/'):
        raise AssertionError('save_dir must end with a "/".')

    # This reads all phrases. Only the two phrase sets are used; the other
    # returned values are discarded.
    _, _, entity_names_set = read_corpus_util.read_ns_entities(
        read_corpus_util.kCorpusDirectory, to_lower=False)
    _, key_phrases_set, _ = read_corpus_util.read_key_phrases(
        read_corpus_util.kCorpusDirectory, to_lower=False)
    all_phrases = list(entity_names_set.union(key_phrases_set))
    trie = trie_util.Trie(all_phrases)

    for i in range(start, end, step):
        out_path = save_dir + 'cs_corpus_max_matching_only_' + str(i) + '.txt'
        # Field of study is either "Neuroscience" or "Computer Science"
        with open(out_path, 'w', encoding='utf8') as f:
            f.write(
                read_corpus_util.read_papers_corpus(
                    read_corpus_util.kCorpusDirectory,
                    field_of_study="Computer Science",
                    start_index=i,
                    end_index=i + step,
                    trie=trie,
                    to_lower=to_lower,
                    no_punctuations=no_punctuations,
                    no_special_char=no_special_char))
def test_join_entity_names(self):
    # Both the input and the expected text are lower-cased before use. The
    # expected text joins 'test string', 'join entity names', 'the test string'
    # and 'the test' with underscores, and leaves 'join entity name' untouched.
    text = (
        'This is a test string where we test whether join entity names is working properly on the test string '
        'the phrase join entity name should not be joined together but the phrase the test should'
    ).lower()
    wanted = (
        'This is a test_string where we test whether join_entity_names is working properly on the_test_string '
        'the phrase join entity name should not be joined together but the phrase the_test should'
    ).lower()
    phrase_trie = trie_util.Trie(
        ['test', 'test string', 'test case', 'the test string', 'join entity names', 'the test'])
    joined = read_corpus_util.join_entity_names(text, phrase_trie)
    self.assertEqual(joined, wanted)
def test_get_key_phrase_pairs_from_context_no_match_second(self):
    # Expects get_key_phrase_pairs_from_context to return (None, None) for the
    # two space-split token lists below.
    before_tokens = 'á THIS, is a test_string where we test! Whether'.split(' ')
    after_tokens = 'join entity names is working properly on the string?!'.split(' ')
    phrase_trie = trie_util.Trie(
        ['test', 'test string', 'test case', 'the test string', 'join entity names'])
    found = trie_util.get_key_phrase_pairs_from_context(before_tokens, after_tokens, phrase_trie)
    self.assertTupleEqual(found, (None, None))
def test_count_entity_names(self):
    # count_entity_names fills the dict passed as its third argument; the
    # expected tallies are keyed by the underscore-joined entity names.
    text = 'á THIS, is a test_string where we test ! Whether join_entity_names is working properly on the test string?!'
    phrase_trie = trie_util.Trie(
        ['test', 'test string', 'test case', 'the test string', 'join entity names'])
    counts = defaultdict(int)
    trie_util.count_entity_names(text, phrase_trie, counts)
    wanted = defaultdict(int, test=2, join_entity_names=1, test_string=1)
    self.assertDictEqual(counts, wanted)
def test_preprocess_corpus_no_lower_with_punc(self):
    # Flags under test: keep case, keep punctuation, drop special characters.
    # The expected text no longer contains the leading 'á'.
    raw_text = ('á THIS, is a test string, where we test ! Whether join entity names is working properly '
                'on the test string?!')
    phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    cleaned = read_corpus_util.preprocess_corpus(
        raw_text,
        trie_util.Trie(phrases),
        to_lower=False,
        no_punctuations=False,
        no_special_char=True)
    # With punctuation left in place, 'test string,' is not expected to become
    # 'test_string,'; only 'join entity names' is joined.
    wanted = ('THIS, is a test string, where we test ! Whether join_entity_names is working properly '
              'on the test string?!')
    self.assertEqual(cleaned, wanted)


def test_preprocess_corpus_no_lower_with_punc_with_special(self):
    # Flags under test: keep case, keep punctuation, keep special characters.
    # The expected text equals the input except that 'join entity names' is
    # joined with underscores.
    raw_text = ('á THIS, is a test string, where we test ! Whether join entity names is working properly '
                'on the test string?!')
    phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    cleaned = read_corpus_util.preprocess_corpus(
        raw_text,
        trie_util.Trie(phrases),
        to_lower=False,
        no_punctuations=False,
        no_special_char=False)
    wanted = ('á THIS, is a test string, where we test ! Whether join_entity_names is working properly '
              'on the test string?!')
    self.assertEqual(cleaned, wanted)
def clean_text_file(save_dir, text_dir, to_lower=False, no_punctuations=False, no_special_char=True):
    """Preprocess a text file line by line and write the result to another file.

    Every line of the input file is passed through
    ``read_corpus_util.preprocess_corpus`` together with a trie built from the
    key phrases found under ``read_corpus_util.kCorpusDirectory``; the returned
    text plus a ``'\\n'`` is written to the output file.

    Args:
        save_dir: Path of the output file. Despite the name, it is opened
            directly as a file for writing.
        text_dir: Path of the input text file. Despite the name, it is opened
            directly as a file for reading.
        to_lower: Forwarded to ``read_key_phrases_dict`` and ``preprocess_corpus``.
        no_punctuations: Forwarded to ``preprocess_corpus``.
        no_special_char: Forwarded to ``preprocess_corpus``.
    """
    # Only the phrase set is used; the accompanying dict is discarded.
    _, key_phrases_set = read_corpus_util.read_key_phrases_dict(
        read_corpus_util.kCorpusDirectory, to_lower=to_lower)
    # Build a real list: on Python 3 a bare map() is a one-shot iterator, which
    # would silently come up empty if the Trie walks its input more than once.
    # NOTE(review): phrases are stored underscore-joined here, whereas
    # clean_raw_corpus hands the raw phrases to the Trie - confirm this is intended.
    key_phrases_list = [phrase.replace(' ', '_') for phrase in key_phrases_set]
    trie = trie_util.Trie(key_phrases_list)
    # Explicit UTF-8 on both ends, matching the files written by clean_raw_corpus,
    # so non-ASCII text does not depend on the platform's default encoding.
    with open(text_dir, 'r', encoding='utf8') as fin, \
            open(save_dir, 'w', encoding='utf8') as fout:
        # Can't use csv_reader because line is too long.
        for line in fin:
            fout.write(
                read_corpus_util.preprocess_corpus(line, trie, to_lower, no_punctuations,
                                                   no_special_char))
            fout.write('\n')