weights1 = [self.weights[i] for i in pos_sample_indexes] # оставшиеся после усечения негативные примеры str_pairs0 = [self.str_pairs[i] for i in neg_sample_indexes] relevancy0 = [self.relevancy[i] for i in neg_sample_indexes] weights0 = [self.weights[i] for i in neg_sample_indexes] self.str_pairs = list(itertools.chain(str_pairs1, str_pairs0)) self.relevancy = list(itertools.chain(relevancy1, relevancy0)) self.weights = list(itertools.chain(weights1, weights0)) if __name__ == '__main__': tokenizer = Tokenizer() tokenizer.load() random_questions = CorpusSearcher() random_facts = CorpusSearcher() # прочитаем список случайных вопросов из заранее сформированного файла # (см. код на C# https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing # и результаты его работы https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt) print('Loading random questions and facts...') with codecs.open(questions_path, 'r', 'utf-8') as rdr: for line in rdr: if len(line) < 40: question = line.strip() question = ru_sanitize(u' '.join( tokenizer.tokenize(question.lower()))) random_questions.add_phrase(normalize_qline(question)) # Прочитаем список случайных фактов, чтобы потом генерировать отрицательные паттерны
return False df = pd.read_csv(os.path.join(data_folder, 'premise_question_relevancy.csv'), encoding='utf-8', delimiter='\t', quoting=3) added_pq = set( (premise + '|' + question) for premise, question in zip(df['premise'].values, df['question'].values)) segmenter = Segmenter() tokenizer = Tokenizer() tokenizer.load() random_facts = CorpusSearcher() # Прочитаем список случайных фактов, чтобы потом генерировать отрицательные паттерны corpus_path = os.path.expanduser('~/Corpus/Raw/ru/text_blocks.txt') n = 0 print(u'Loading samples from {}'.format(corpus_path)) with codecs.open(corpus_path, 'r', 'utf-8') as rdr: for line in rdr: line = line.strip() phrases = segmenter.split(line) for phrase in phrases: if phrase[-1] == '.': phrase = phrase.strip().replace('--', '-') if phrase.count('"') == 1: phrase = phrase.replace('"', '')