예제 #1
0
 def test_preprocess_corpus_no_lower_no_punc_with_special(self):
     """Check preprocess_corpus with punctuation removal only.

     Called with ``to_lower=False``, ``no_punctuations=True`` and
     ``no_special_char=False``, the expected result keeps the original
     casing and the leading 'á', drops the punctuation marks, and joins
     the phrases found in the trie with underscores.
     """
     phrase_trie = trie_util.Trie(
         ['test', 'test string', 'test case', 'the test string', 'join entity names'])
     raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
     wanted = 'á THIS is a test_string where we test Whether join_entity_names is working properly on the_test_string'

     actual = read_corpus_util.preprocess_corpus(
         raw_text,
         phrase_trie,
         to_lower=False,
         no_punctuations=True,
         no_special_char=False,
     )

     self.assertEqual(actual, wanted)
예제 #2
0
def clean_raw_corpus(save_dir,
                     start=0,
                     end=5000,
                     step=100,
                     to_lower=False,
                     no_punctuations=False,
                     no_special_char=True):
    """Write the pre-processed "Computer Science" paper corpus to chunk files.

    A trie is built from the union of the entity names and the key phrases
    read from ``read_corpus_util.kCorpusDirectory``. For every ``i`` in
    ``range(start, end, step)`` the papers with indices ``i`` to ``i + step``
    are read through ``read_corpus_util.read_papers_corpus`` and the returned
    text is written to ``<save_dir>cs_corpus_max_matching_only_<i>.txt``
    (utf8 encoded).

    Args:
        save_dir: Output directory as a string; must end with "/".
        start: First paper index to process.
        end: Upper bound (exclusive) for the chunk start indices.
        step: Chunk size; also the stride between chunk start indices.
        to_lower: Forwarded to ``read_papers_corpus``.
        no_punctuations: Forwarded to ``read_papers_corpus``.
        no_special_char: Forwarded to ``read_papers_corpus``.

    Raises:
        AssertionError: If ``save_dir`` does not end with "/" (this includes
            the empty string).
    """
    # endswith() also covers the empty string, for which save_dir[-1] would
    # raise an unrelated IndexError instead of the intended error.
    if not save_dir.endswith('/'):
        raise AssertionError('save_dir must end with a "/".')

    # This reads all phrases. Only the phrase sets are needed here; the other
    # returned values are discarded.
    # NOTE(review): phrases are always read with to_lower=False, independent
    # of the to_lower argument -- confirm this is intended.
    _, _, entity_names_set = read_corpus_util.read_ns_entities(
        read_corpus_util.kCorpusDirectory, to_lower=False)
    _, key_phrases_set, _ = read_corpus_util.read_key_phrases(
        read_corpus_util.kCorpusDirectory, to_lower=False)
    trie = trie_util.Trie(list(entity_names_set.union(key_phrases_set)))

    for i in range(start, end, step):
        out_path = save_dir + 'cs_corpus_max_matching_only_' + str(i) + '.txt'
        with open(out_path, 'w', encoding='utf8') as f:
            # Field of study is either "Neuroscience" or "Computer Science"
            f.write(
                read_corpus_util.read_papers_corpus(
                    read_corpus_util.kCorpusDirectory,
                    field_of_study="Computer Science",
                    start_index=i,
                    end_index=i + step,
                    trie=trie,
                    to_lower=to_lower,
                    no_punctuations=no_punctuations,
                    no_special_char=no_special_char))
예제 #3
0
 def test_join_entity_names(self):
     """Check join_entity_names on a lower-cased sentence.

     The expected string has the trie phrases 'test string',
     'join entity names', 'the test string' and 'the test' joined with
     underscores, while 'join entity name' (not in the trie) stays as is.
     """
     text = (
         'This is a test string where we test whether join entity names is working properly on the test string '
         'the phrase join entity name should not be joined together but the phrase the test should'
     ).lower()
     wanted = (
         'This is a test_string where we test whether join_entity_names is working properly on the_test_string '
         'the phrase join entity name should not be joined together but the phrase the_test should'
     ).lower()
     phrase_trie = trie_util.Trie(
         ['test', 'test string', 'test case', 'the test string', 'join entity names', 'the test'])

     actual = read_corpus_util.join_entity_names(text, phrase_trie)

     self.assertEqual(actual, wanted)
예제 #4
0
 def test_get_key_phrase_pairs_from_context_no_match_second(self):
     """Check get_key_phrase_pairs_from_context returns (None, None) here.

     The before/after contexts are split on single spaces into token lists
     and passed together with the trie; the call is expected to return
     ``(None, None)`` for this input.
     """
     phrase_trie = trie_util.Trie(
         ['test', 'test string', 'test case', 'the test string', 'join entity names'])
     before_tokens = 'á THIS, is a test_string where we test! Whether'.split(' ')
     after_tokens = 'join entity names is working properly on the string?!'.split(' ')

     result = trie_util.get_key_phrase_pairs_from_context(
         before_tokens, after_tokens, phrase_trie)

     self.assertTupleEqual(result, (None, None))
예제 #5
0
 def test_count_entity_names(self):
     """Check the counts count_entity_names accumulates into a dict.

     count_entity_names is handed an empty ``defaultdict(int)`` which it is
     expected to fill in place; for this input the expected counts are
     'test': 2, 'join_entity_names': 1 and 'test_string': 1.
     """
     text = 'á THIS, is a test_string where we test ! Whether join_entity_names is working properly on the test string?!'
     phrase_trie = trie_util.Trie(
         ['test', 'test string', 'test case', 'the test string', 'join entity names'])

     wanted = defaultdict(int)
     wanted.update({'test': 2, 'join_entity_names': 1, 'test_string': 1})

     # The counter dict is passed in and mutated by the call.
     counts = defaultdict(int)
     trie_util.count_entity_names(text, phrase_trie, counts)

     self.assertDictEqual(counts, wanted)
예제 #6
0
    def test_preprocess_corpus_no_lower_with_punc(self):
        """Check preprocess_corpus when punctuation is kept and special chars removed.

        Called with ``to_lower=False``, ``no_punctuations=False`` and
        ``no_special_char=True``, the expected result drops the leading 'á',
        keeps casing and punctuation, and only joins 'join entity names'.
        """
        s = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
        trie = trie_util.Trie(['test', 'test string', 'test case', 'the test string', 'join entity names'])
        output = read_corpus_util.preprocess_corpus(s, trie, to_lower=False, no_punctuations=False, no_special_char=True)
        # Notice that when there is punctuation, we cannot connect 'test string,' to 'test_string,'.
        expected_output = \
            'THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!'
        self.assertEqual(output, expected_output)

    # This test used to be defined inside the body of the test above, where it
    # was never called and never collected by the test runner. It is a
    # sibling method now so that it actually runs.
    def test_preprocess_corpus_no_lower_with_punc_with_special(self):
        """Check preprocess_corpus when both punctuation and special chars are kept.

        Called with ``to_lower=False``, ``no_punctuations=False`` and
        ``no_special_char=False``, the expected result keeps the leading 'á',
        casing and punctuation, and only joins 'join entity names'.
        """
        s = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
        trie = trie_util.Trie(['test', 'test string', 'test case', 'the test string', 'join entity names'])
        output = read_corpus_util.preprocess_corpus(s, trie, to_lower=False, no_punctuations=False,
                                                    no_special_char=False)
        expected_output = \
            'á THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!'
        self.assertEqual(output, expected_output)
예제 #7
0
def clean_text_file(save_dir,
                    text_dir,
                    to_lower=False,
                    no_punctuations=False,
                    no_special_char=True):
    """Pre-process a text file line by line and write the result to a new file.

    The key phrases read from ``read_corpus_util.kCorpusDirectory`` have their
    spaces replaced by underscores and are stored in a trie. Every line of the
    input file is passed through ``read_corpus_util.preprocess_corpus`` with
    that trie, and the returned text is written to the output file followed
    by a newline.

    Args:
        save_dir: Path of the output file (opened for writing, utf8).
        text_dir: Path of the input file (opened for reading, utf8).
        to_lower: Forwarded to ``read_key_phrases_dict`` and
            ``preprocess_corpus``.
        no_punctuations: Forwarded to ``preprocess_corpus``.
        no_special_char: Forwarded to ``preprocess_corpus``.
    """
    # Only the phrase set is needed; the accompanying dict is discarded.
    _, key_phrases_set = read_corpus_util.read_key_phrases_dict(
        read_corpus_util.kCorpusDirectory, to_lower=to_lower)
    # Build a real list rather than a lazy map() iterator, which can be
    # consumed only once.
    key_phrases_list = [
        phrase.replace(' ', '_') for phrase in key_phrases_set
    ]
    trie = trie_util.Trie(key_phrases_list)
    # Explicit utf8 so the result does not depend on the platform's default
    # encoding (the corpus contains non-ASCII characters).
    with open(text_dir, 'r', encoding='utf8') as fin, \
            open(save_dir, 'w', encoding='utf8') as fout:
        # Can't use csv_reader because line is too long.
        for line in fin:
            fout.write(
                read_corpus_util.preprocess_corpus(line, trie, to_lower,
                                                   no_punctuations,
                                                   no_special_char))
            fout.write('\n')