import os
from collections import defaultdict

# load_file, get_tokens_sequence, is_non_empty_file and replace_out_of_voc_tokens,
# together with the TEST_DATA_DIR and TEST_CORPUS_NAME constants, are assumed to
# be defined elsewhere in the package.


def _build_offensive_ngrams(offensive_phrases_path):
    # Tokenize every offensive phrase and store it as a tuple, so the resulting
    # set supports fast membership checks against candidate n-grams.
    offensive_phrases = load_file(offensive_phrases_path)
    offensive_ngrams = [
        tuple(get_tokens_sequence(offensive_phrase))
        for offensive_phrase in offensive_phrases
    ]
    return set(offensive_ngrams)
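
# Example (a hedged sketch; the phrase file contents are hypothetical): assuming
# get_tokens_sequence splits 'go away' into ['go', 'away'], a phrase file whose
# only line is 'go away' would yield {('go', 'away')}.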

def _read_testset():
    corpus_path = os.path.join(TEST_DATA_DIR, '{}.txt'.format(TEST_CORPUS_NAME))
    test_lines = load_file(corpus_path)

    # The corpus stores alternating lines: a context followed by one of its
    # responses. A repeated context accumulates all of its responses in a set.
    testset = defaultdict(set)
    for i in xrange(0, len(test_lines) - 1, 2):
        context = test_lines[i].strip()
        response = test_lines[i + 1].strip()
        testset[context].add(response)

    return testset
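
# Example (a hedged sketch; the corpus contents are hypothetical): a test corpus
#
#     how are you ?
#     fine , thanks
#     how are you ?
#     pretty good
#
# would be parsed into {'how are you ?': {'fine , thanks', 'pretty good'}}:
# the repeated context collects both of its reference responses.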

def get_tokenized_test_lines(corpus_name, tokens_voc):
    corpus_path = os.path.join(TEST_DATA_DIR, '{}.txt'.format(corpus_name))
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file is missing or empty: {}'.format(corpus_path))

    test_lines = load_file(corpus_path)

    result = []
    for line in test_lines:
        # Tokenize the raw line, then map tokens missing from tokens_voc to the
        # out-of-vocabulary placeholder.
        tokenized_line = get_tokens_sequence(line)
        tokenized_line = replace_out_of_voc_tokens(tokenized_line, tokens_voc)
        result.append(tokenized_line)

    return result
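
# Example (a hedged sketch; the corpus name, vocabulary and tokenizer behaviour
# are assumptions): given a 'quality' corpus whose only line is 'hello stranger'
# and a tokens_voc that contains 'hello' but not 'stranger',
# get_tokenized_test_lines('quality', tokens_voc) would return something like
# [['hello', '<unk>']], where '<unk>' stands for whatever out-of-vocabulary
# placeholder replace_out_of_voc_tokens substitutes.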