def make_x_train(): # создаём словарь со всеми текстами ручной разметки {индекс:текст} dict_texts = {} all_train_texts = db.select_text('train_corpus') for inst in all_train_texts: dict_texts[inst.index_text] = inst.text # создаём словарь {идекс1.индекс2: категория} dict_rels = {} all_train_relations = db.select_relations('train_relations') for inst in all_train_relations: dict_rels[inst.first_text + '.' + inst.second_text] = inst.relation # создаём вектор каждой пары из предыдущего словаря vectors = [] y_train = [] for pair, rel in dict_rels.items(): text1 = dict_texts[pair.split('.')[0]] text2 = dict_texts[pair.split('.')[1]] vectors.append(mv.compare(text1, text2)) y_train.append(rel) return vectors, y_train
def make_x_test(): collection = {} ids = [] for inst in db.select_text('texts', **{'publication_date': " BETWEEN '2015-01-05' AND '2015-01-05'"}): collection[inst.id] = inst.text ids.append(inst.id) vectors = [] pairs = [] for index, first_id in enumerate(ids[:-1]): for second_id in ids[index+1:]: pairs.append(str(first_id) + '.' + str(second_id)) vectors.append(mv.compare(collection[first_id], collection[second_id])) return vectors, pairs