示例#1
0
        weights1 = [self.weights[i] for i in pos_sample_indexes]

        # оставшиеся после усечения негативные примеры
        str_pairs0 = [self.str_pairs[i] for i in neg_sample_indexes]
        relevancy0 = [self.relevancy[i] for i in neg_sample_indexes]
        weights0 = [self.weights[i] for i in neg_sample_indexes]

        self.str_pairs = list(itertools.chain(str_pairs1, str_pairs0))
        self.relevancy = list(itertools.chain(relevancy1, relevancy0))
        self.weights = list(itertools.chain(weights1, weights0))


if __name__ == '__main__':
    # The tokenizer must be loaded before it is used on the questions below.
    tokenizer = Tokenizer()
    tokenizer.load()

    # random_questions is filled by the loop below; random_facts is only
    # created in this part of the script.
    random_questions = CorpusSearcher()
    random_facts = CorpusSearcher()

    # Read the list of random questions from a file prepared in advance
    # (see the C# code at https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing
    # and its output at https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt)
    print('Loading random questions and facts...')
    with codecs.open(questions_path, 'r', 'utf-8') as rdr:
        for line in rdr:
            # Keep short lines only; the length is taken from the raw line,
            # i.e. before surrounding whitespace is stripped.
            if len(line) >= 40:
                continue

            # Lower-case, tokenize, re-join with single spaces and sanitize
            # before normalizing and storing the question.
            lowered_text = line.strip().lower()
            question_tokens = tokenizer.tokenize(lowered_text)
            question = ru_sanitize(u' '.join(question_tokens))
            random_questions.add_phrase(normalize_qline(question))

    # Read the list of random facts so that negative patterns can be generated later
    return False


# Load the premise/question relevancy table: a tab-separated UTF-8 file.
# quoting=3 is the numeric value of csv.QUOTE_NONE, so quote characters in
# the data are read as ordinary text.
relevancy_csv_path = os.path.join(data_folder, 'premise_question_relevancy.csv')
df = pd.read_csv(relevancy_csv_path,
                 encoding='utf-8',
                 delimiter='\t',
                 quoting=3)

# Set of "premise|question" keys built from the rows of the table.
premise_column = df['premise'].values
question_column = df['question'].values
added_pq = {
    premise + '|' + question
    for premise, question in zip(premise_column, question_column)
}

segmenter = Segmenter()
tokenizer = Tokenizer()
tokenizer.load()

random_facts = CorpusSearcher()

# Read the list of random facts so that negative patterns can be generated later
n = 0
corpus_path = os.path.expanduser('~/Corpus/Raw/ru/text_blocks.txt')
print(u'Loading samples from {}'.format(corpus_path))
with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        phrases = segmenter.split(line)
        for phrase in phrases:
            if phrase[-1] == '.':
                phrase = phrase.strip().replace('--', '-')
                if phrase.count('"') == 1:
                    phrase = phrase.replace('"', '')