def extract_important_tokens(corpus_file, min_count=1):
    """Return the set of tokens seen at least ``min_count`` times in a corpus.

    Args:
        corpus_file: Passed straight to ``SimpleCorpus`` to open the corpus.
        min_count: Minimum number of occurrences a token needs in order to
            be kept. With the default of 1 every token that appears is kept.

    Returns:
        A ``set`` of the tokens whose total count over all texts yielded by
        ``SimpleCorpus.get_texts()`` is ``>= min_count``.
    """
    frequencies = defaultdict(int)
    # Each item yielded by get_texts() is iterated as a sequence of tokens.
    for sentence in SimpleCorpus(corpus_file).get_texts():
        for token in sentence:
            frequencies[token] += 1
    return {token for token, freq in frequencies.items() if freq >= min_count}
def extract_important_tokens(corpus_file, min_count=1):
    """Return the set of tokens seen at least ``min_count`` times in a corpus.

    Args:
        corpus_file: Passed straight to ``SimpleCorpus`` to open the corpus.
        min_count: Minimum number of occurrences a token needs in order to
            be kept. With the default of 1 every token that appears is kept.

    Returns:
        A ``set`` of the tokens whose total count over all texts yielded by
        ``SimpleCorpus.get_texts()`` is ``>= min_count``.
    """
    frequencies = defaultdict(int)
    # Each item yielded by get_texts() is iterated as a sequence of tokens.
    for sentence in SimpleCorpus(corpus_file).get_texts():
        for token in sentence:
            frequencies[token] += 1
    return {token for token, freq in frequencies.items() if freq >= min_count}
class TestCorpusParser(unittest.TestCase):
    """Tests for ``parse_corpus_contexts`` using the ``test_data/corpus.en.1000`` fixture."""

    def setUp(self):
        # Tokens used as the filter in the targeted-parsing assertions.
        self.interesting_tokens = {'the', 'it'}
        # The fixture path is built relative to this test module's directory.
        module_path = os.path.dirname(__file__)
        self.corpus_path = os.path.join(module_path, 'test_data/corpus.en.1000')
        self.corpus = SimpleCorpus(self.corpus_path)

    def test_parse_corpus_contexts(self):
        """Filtered contexts hit an interesting token; unfiltered ones cover every token."""
        contexts = parse_corpus_contexts(self.corpus_path, self.interesting_tokens)
        for context in contexts:
            # Every returned context must have at least one interesting token
            # in its 'target' entry.
            matched = self.interesting_tokens.intersection(context['target'])
            self.assertTrue(
                matched,
                msg='no interesting token in target %r' % (context['target'],),
            )

        # Without a token filter the test expects exactly one context per
        # token in the corpus.
        all_contexts = parse_corpus_contexts(self.corpus_path)
        num_toks = sum(len(sen) for sen in self.corpus.get_texts())
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(num_toks, len(all_contexts))
def get_corpus_file(corpus_file, label):
    """Pair a label with the texts of a corpus.

    Args:
        corpus_file: Passed straight to ``SimpleCorpus`` to open the corpus.
        label: Any value; returned unchanged as the first tuple element.

    Returns:
        A 2-tuple ``(label, texts)`` where ``texts`` is whatever
        ``SimpleCorpus(corpus_file).get_texts()`` returns.
    """
    texts = SimpleCorpus(corpus_file).get_texts()
    return label, texts
def parse_corpus_contexts(corpus_file, interesting_tokens=None, tag=1):
    """Build target contexts from the texts of a corpus file.

    Thin wrapper: opens the corpus with ``SimpleCorpus`` and forwards its
    texts to ``list_of_target_contexts``.

    Args:
        corpus_file: Passed straight to ``SimpleCorpus`` to open the corpus.
        interesting_tokens: Forwarded unchanged to
            ``list_of_target_contexts``; defaults to ``None``.
            NOTE(review): presumably ``None`` means "no filtering" — confirm
            against ``list_of_target_contexts``.
        tag: Forwarded as the ``tag`` keyword argument; defaults to 1.

    Returns:
        Whatever ``list_of_target_contexts`` returns for these arguments.
    """
    texts = SimpleCorpus(corpus_file).get_texts()
    return list_of_target_contexts(texts, interesting_tokens, tag=tag)