Example #1
def _build_offensive_ngrams(offensive_phrases_path):
    # `load_file` and `get_tokens_sequence` are helpers from the
    # surrounding project: `load_file` presumably returns the file's
    # lines, and `get_tokens_sequence` tokenizes a string.
    offensive_phrases = load_file(offensive_phrases_path)
    # Tokenize each phrase and freeze it as a tuple so the n-grams are
    # hashable and can live in a set for O(1) membership checks.
    offensive_ngrams = [
        tuple(get_tokens_sequence(offensive_phrase))
        for offensive_phrase in offensive_phrases
    ]
    return set(offensive_ngrams)
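A minimal usage sketch, assuming the phrases file holds one offensive phrase per line; the file name and the `contains_offensive_ngram` helper below are hypothetical, not part of the original project:

offensive_ngrams = _build_offensive_ngrams('offensive_phrases.txt')

def contains_offensive_ngram(response_tokens, offensive_ngrams):
    # Flag the response if any contiguous slice of its tokens matches
    # a known offensive phrase of the same length.
    lengths = {len(ngram) for ngram in offensive_ngrams}
    return any(
        tuple(response_tokens[i:i + n]) in offensive_ngrams
        for n in lengths
        for i in range(len(response_tokens) - n + 1)
    )

Storing the phrases as tuples rather than lists is what makes the set lookup possible, since lists are unhashable.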
Example #2
import os
from collections import defaultdict


def _read_testset():
    # TEST_DATA_DIR, TEST_CORPUS_NAME and load_file come from the
    # surrounding project.
    corpus_path = os.path.join(TEST_DATA_DIR, '{}.txt'.format(TEST_CORPUS_NAME))
    test_lines = load_file(corpus_path)

    # The corpus stores context/response pairs on alternating lines;
    # map each context to the set of its reference responses.
    # (range replaces the Python 2-only xrange.)
    testset = defaultdict(set)
    for i in range(0, len(test_lines) - 1, 2):
        context = test_lines[i].strip()
        response = test_lines[i + 1].strip()
        testset[context].add(response)

    return testset
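For illustration, here is the same pairing logic run self-contained on an in-memory corpus (the lines are made up):

from collections import defaultdict

lines = ['how are you?', 'fine, thanks', 'how are you?', 'great!']
testset = defaultdict(set)
for i in range(0, len(lines) - 1, 2):
    testset[lines[i].strip()].add(lines[i + 1].strip())
print(dict(testset))  # {'how are you?': {'fine, thanks', 'great!'}}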
Example #3
import os


def get_tokenized_test_lines(corpus_name, tokens_voc):
    # is_non_empty_file, load_file, get_tokens_sequence and
    # replace_out_of_voc_tokens are helpers from the surrounding
    # project; TEST_DATA_DIR points at the directory with the test
    # corpora.
    corpus_path = os.path.join(TEST_DATA_DIR, '{}.txt'.format(corpus_name))
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))
    test_lines = load_file(corpus_path)

    # Tokenize every line and replace tokens missing from tokens_voc
    # (presumably with an unknown-token marker).
    result = []
    for line in test_lines:
        tokenized_line = get_tokens_sequence(line)
        tokenized_line = replace_out_of_voc_tokens(tokenized_line, tokens_voc)
        result.append(tokenized_line)

    return result
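A hedged usage sketch; the corpus name and vocabulary below are invented, and `replace_out_of_voc_tokens` is assumed to swap out-of-vocabulary tokens for an unknown marker:

tokens_voc = {'hello', 'how', 'are', 'you', '_unk_'}  # hypothetical vocabulary
lines = get_tokenized_test_lines('quality_corpus', tokens_voc)
# Each entry is a token list with OOV words replaced, e.g.:
# ['hello', '_unk_', 'how', 'are', 'you']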