def _read_data(path, stage, match_type, mode='both'):
    """Read SemEval-2016 Task 3 CQA-QL XML data and pack it into a DataPack.

    :param path: Base data directory (``pathlib.Path``-like, must support
        ``joinpath``).
    :param stage: One of 'train', 'dev'; anything else selects the test file.
    :param match_type: Forwarded unchanged to ``_load_data_by_type``.
    :param mode: Only used when ``stage == 'train'``: 'part1', 'part2', or
        'both' (default) to concatenate the two training parts.
    :return: A ``matchzoo`` DataPack built from the parsed file(s).
    """
    if stage == 'train':
        if mode == 'part1':
            path = path.joinpath(
                'train/SemEval2016-Task3-CQA-QL-train-part1.xml')
            data = _load_data_by_type(path, match_type)
        elif mode == 'part2':
            path = path.joinpath(
                'train/SemEval2016-Task3-CQA-QL-train-part2.xml')
            data = _load_data_by_type(path, match_type)
        else:
            part1 = path.joinpath(
                'train/SemEval2016-Task3-CQA-QL-train-part1.xml')
            p1 = _load_data_by_type(part1, match_type)
            # BUG FIX: the original loaded the part1 file twice here;
            # 'both' must concatenate part1 AND part2.
            part2 = path.joinpath(
                'train/SemEval2016-Task3-CQA-QL-train-part2.xml')
            p2 = _load_data_by_type(part2, match_type)
            data = pd.concat([p1, p2], ignore_index=True)
        return matchzoo.pack(data)
    elif stage == 'dev':
        path = path.joinpath('dev/SemEval2016-Task3-CQA-QL-dev.xml')
        data = _load_data_by_type(path, match_type)
        return matchzoo.pack(data)
    else:
        path = path.joinpath('SemEval2016-Task3-CQA-QL-test.xml')
        data = _load_data_by_type(path, match_type)
        return matchzoo.pack(data)
def load_data(
    path: str = 'train.csv'
) -> typing.Union[mz.DataPack, typing.Tuple[mz.DataPack, list]]:
    """Load a packed CSV into a one-hot-encoded classification DataPack.

    :param path: CSV readable by ``mz.pack`` (first column is the index).
    :return: A DataPack with integer labels one-hot encoded to 2 classes.
    """
    frame = pd.read_csv(path, index_col=0, error_bad_lines=False)
    pack = mz.pack(frame)
    pack.relation['label'] = pack.relation['label'].astype(int)
    return pack.one_hot_encode_label(num_classes=2)
def _read_data(path, task, target_label):
    """Read an SNLI-style TSV (sentence1/sentence2/gold_label) into a DataPack.

    :param path: Path to a tab-separated file.
    :param task: 'ranking', 'classification', or a matchzoo task instance.
    :param target_label: For ranking only: the gold label treated as positive.
    :return: ``matchzoo.pack(df, task)``.
    :raises ValueError: on an invalid ``target_label`` or ``task``.
    """
    table = pd.read_csv(path, sep='\t')
    df = pd.DataFrame({
        'text_left': table['sentence1'],
        'text_right': table['sentence2'],
        'label': table['gold_label']
    })
    # Drop incomplete rows. (FIX: the original ran this identical dropna
    # twice in a row; once is sufficient.)
    df = df.dropna(axis=0, how='any').reset_index(drop=True)
    # '-' marks examples without annotator consensus; discard them.
    filter_id = df[df['label'] == '-'].index.tolist()
    df.drop(filter_id, inplace=True)
    if task == 'ranking' or isinstance(task, matchzoo.tasks.Ranking):
        if target_label not in ['entailment', 'contradiction', 'neutral']:
            raise ValueError(f"{target_label} is not a valid target label."
                             f"Must be one of `entailment`, `contradiction`"
                             f" and `neutral`")
        # Binary relevance: 1 where gold label matches the target.
        df['label'] = (df['label'] == target_label)
    elif task == 'classification' or isinstance(
            task, matchzoo.tasks.Classification):
        # NOTE(review): labels at this point are the gold-label strings,
        # so `[True, False].index(label)` will raise ValueError unless an
        # upstream step converts them to booleans — confirm intended
        # class list against the caller.
        classes = [True, False]
        df['label'] = df['label'].apply(classes.index)
    else:
        raise ValueError(f"{task} is not a valid task."
                         f"Must be one of `Ranking` and `Classification`.")
    return matchzoo.pack(df, task)
def _elmo_read_data(self, path):
    """Build a DataPack pairing query and document content for ELMo input.

    Left/right text is looked up by id in ``self.queries_content`` /
    ``self.docs_content`` (token lists joined with spaces); raw text and
    image-index columns are carried along from the TSV.
    """
    def _join_texts(ids: List[int], content: Dict[int, str]):
        # Each id maps to a token sequence; join it into one string.
        return [" ".join(content[i]) for i in ids]

    table = pd.read_csv(path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    frame = pd.DataFrame({
        'text_left': _join_texts(table["QueryID"], self.queries_content),
        'raw_text_left': table['QueryText'].copy(),
        'images_left': table['QueryImages'].copy().progress_apply(str.split),  # indices
        'text_right': _join_texts(table["DocID"], self.docs_content),
        'raw_text_right': table['DocText'].copy(),
        'images_right': table['DocImages'].copy().progress_apply(str.split),  # indices
        'id_left': table['QueryID'],
        'id_right': table['DocID'],
        'label': table['Label'],
    })
    return matchzoo.pack(frame)
def _read_data(path):
    """Load a tab-separated NLI file (sentence1/sentence2/gold_label)."""
    raw = pd.read_table(path)
    frame = pd.DataFrame({
        'text_left': raw['sentence1'],
        'text_right': raw['sentence2'],
        'label': raw['gold_label'],
    })
    return matchzoo.pack(frame)
def read_data(path):
    """Load a CSV of sentence pairs (sentence1/sentence2/label) into a DataPack."""
    raw = pd.read_csv(path)
    frame = pd.DataFrame({
        'text_left': raw['sentence1'],
        'text_right': raw['sentence2'],
        'label': raw['label'],
    })
    return mz.pack(frame)
def _read_predict(datas, labels=None):
    """Build a prediction DataPack from (left, right) text pairs.

    Each side is split into individual characters joined by spaces.
    Missing labels default to all zeros.
    """
    if not labels:
        labels = [0] * len(datas)
    rows = [
        (label, " ".join(list(left)), " ".join(list(right)))
        for label, (left, right) in zip(labels, datas)
    ]
    frame = pd.DataFrame(data=rows,
                         columns=["label", "text_left", "text_right"])
    return matchzoo.pack(frame)
def _read_data(path):
    """Load a tab-separated NLI file and drop incomplete rows."""
    raw = pd.read_csv(path, sep='\t')
    frame = pd.DataFrame({
        'text_left': raw['sentence1'],
        'text_right': raw['sentence2'],
        'label': raw['gold_label'],
    })
    frame = frame.dropna(axis=0, how='any').reset_index(drop=True)
    return matchzoo.pack(frame)
def _read_data(path):
    """Load a WikiQA-style TSV (Question/Sentence/Label) into a DataPack."""
    raw = pd.read_csv(path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    frame = pd.DataFrame({
        'text_left': raw['Question'],
        'text_right': raw['Sentence'],
        # QuestionID / SentenceID columns exist in the file but are
        # intentionally not packed here.
        'label': raw['Label'],
    })
    return mz.pack(frame)
def _read_data(path, task):
    """Load a TREC-style TSV (query/document with ids) for the given task."""
    raw = pd.read_csv(path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    frame = pd.DataFrame({
        'text_left': raw['querystring'],
        'text_right': raw['documentstring'],
        'id_left': raw['topicid'],
        'id_right': raw['docid'],
        'label': raw['label'],
    })
    return matchzoo.pack(frame, task)
def load_data(data_type, line_nub=-1):
    """Load query/KB-entity match pairs from a pre-tokenised JSON-lines file.

    :param data_type: Split name used to format ``root_dir`` into a path.
    :param line_nub: Maximum number of unique pairs to load; -1 means no
        limit (``len(id_set)`` can never equal -1).
    :return: ``mz.pack`` DataPack with text/id/label columns.
    """
    id_set = set()
    X_left, X_left_id = [], []
    X_right, X_right_id = [], []
    y = []
    # FIX: use a context manager so the file handle is closed.
    with open(root_dir.format(data_type + ".json.jieba.pre.json"),
              "r", encoding="utf-8") as rf:
        json_line_s = rf.readlines()
    query_data_loder = tqdm(json_line_s)
    query_data_loder.set_description("load query data lines")
    reached_limit = False
    for json_line in query_data_loder:
        tdata = json.loads(json_line)
        text_id = tdata["text_id"]
        query_text = all_text_dic[text_id]
        for mention in tdata["mention_data"]:
            if len(id_set) == line_nub:
                # BUG FIX: the original `break` sat at the outer-loop level,
                # so only the first query line was ever processed; propagate
                # the quota so BOTH loops stop, and only then.
                reached_limit = True
                break
            kb_id = mention["kb_id"]
            pid = text_id + "_" + kb_id
            if pid in id_set:
                continue  # skip duplicate (query, entity) pairs
            id_set.add(pid)
            X_left.append(query_text)
            X_right.append(kb_all_text_dic[kb_id])
            X_left_id.append(text_id + mention["mention"])
            X_right_id.append(kb_id)
            y.append(int(mention["label"]))
        if reached_limit:
            break
    df = pd.DataFrame({
        'text_left': X_left,
        'text_right': X_right,
        'id_left': X_left_id,
        'id_right': X_right_id,
        'label': y
    })
    return mz.pack(df)
def _read_data(path):
    """Read a JSON-lines QA-retrieval file into a DataPack.

    ``question``/``qaquestion`` are token lists and are joined with spaces;
    relevance (``qarel``) becomes a float label.
    """
    raw = pd.read_json(path, lines=True)
    frame = pd.DataFrame({
        'text_left': raw['question'].str.join(' '),
        'text_right': raw['qaquestion'].str.join(' '),
        'id_left': raw['qid'],
        'id_right': raw['qaid'],
        'label': raw['qarel'].astype(float),
    })
    print(frame)  # debug echo kept from the original
    return matchzoo.pack(frame)
def _load_data(task, path):
    """Load a packed CSV for either a ranking or a classification task.

    :param task: 'ranking', 'classification', or a ``mz.tasks`` instance.
    :param path: CSV path consumable by ``mz.pack`` (index in column 0).
    :return: A DataPack for ranking; ``(DataPack, [False, True])`` for
        classification.
    :raises ValueError: if ``task`` is not recognised.
    """
    if task == 'ranking':
        task = mz.tasks.Ranking()
    if task == 'classification':
        task = mz.tasks.Classification()
    data_pack = mz.pack(pd.read_csv(path, index_col=0, engine='python'))
    if isinstance(task, mz.tasks.Ranking):
        data_pack.relation['label'] = data_pack.relation['label'].astype(
            'float32')
        return data_pack
    elif isinstance(task, mz.tasks.Classification):
        data_pack.relation['label'] = data_pack.relation['label'].astype(int)
        return data_pack.one_hot_encode_label(num_classes=2), [False, True]
    # BUG FIX: previously an unrecognised task fell off the end and
    # silently returned None; fail loudly instead (matches the sibling
    # toy load_data helpers).
    raise ValueError(f"{task} is not a valid task."
                     f"Must be one of `Ranking` and `Classification`.")
def load_data(
    stage: str = 'train',
    task: str = 'ranking',
    return_classes: bool = False
) -> typing.Union[matchzoo.DataPack, typing.Tuple[matchzoo.DataPack, list]]:
    """
    Load WikiQA data.

    :param stage: One of `train`, `dev`, and `test`.
    :param task: Could be one of `ranking`, `classification` or a
        :class:`matchzoo.engine.BaseTask` instance.
    :param return_classes: `True` to return classes for classification
        task, `False` otherwise.
    :return: A DataPack unless `task` is `classificiation` and
        `return_classes` is `True`: a tuple of `(DataPack, classes)` in
        that case.

    Example:
        >>> import matchzoo as mz
        >>> stages = 'train', 'dev', 'test'
        >>> tasks = 'ranking', 'classification'
        >>> for stage in stages:
        ...     for task in tasks:
        ...         _ = mz.datasets.toy.load_data(stage, task)
    """
    if stage not in ('train', 'dev', 'test'):
        raise ValueError(f"{stage} is not a valid stage."
                         f"Must be one of `train`, `dev`, and `test`.")
    if task == 'ranking':
        task = matchzoo.tasks.Ranking()
    if task == 'classification':
        task = matchzoo.tasks.Classification()
    # Data files live next to this module, one CSV per stage.
    csv_path = Path(__file__).parent.joinpath(f'{stage}.csv')
    data_pack = matchzoo.pack(pd.read_csv(csv_path, index_col=0))
    if isinstance(task, matchzoo.tasks.Ranking):
        data_pack.relation['label'] = \
            data_pack.relation['label'].astype('float32')
        return data_pack
    if isinstance(task, matchzoo.tasks.Classification):
        data_pack.relation['label'] = data_pack.relation['label'].astype(int)
        data_pack = data_pack.one_hot_encode_label(num_classes=2)
        return (data_pack, [False, True]) if return_classes else data_pack
    raise ValueError(f"{task} is not a valid task."
                     f"Must be one of `Ranking` and `Classification`.")
def _read_data(fpath):
    """Read a JSON-lines file of labelled text pairs into a DataPack.

    Blank lines are skipped; runs of whitespace in either text are
    collapsed to single spaces.
    """
    rows = []
    with open(fpath, "r") as rf:
        for raw in rf:
            raw = raw.strip()
            if not raw:
                continue
            record = json.loads(raw)
            rows.append((
                record['label'],
                " ".join(record['text_left'].split()),
                " ".join(record['text_right'].split()),
            ))
    frame = pd.DataFrame(data=rows,
                         columns=["label", "text_left", "text_right"])
    return matchzoo.pack(frame)
def _data_pack(self, X, y=None, stage='train'):
    """Pack features (plus labels for train/dev) into a MatchZoo DataPack.

    Columns are addressed positionally: 0 = left text, 1 = right text,
    2 = label (train/dev only).
    """
    merged = pd.concat([X, y], axis=1, ignore_index=True, sort=False)
    cols = merged.columns
    if stage in ['train', 'dev']:
        frame = pd.DataFrame({
            'text_left': merged[cols[0]],
            'text_right': merged[cols[1]],
            'label': merged[cols[2]].astype(int),
        })
    else:
        frame = pd.DataFrame({
            'text_left': merged[cols[0]],
            'text_right': merged[cols[1]],
        })
    return mz.pack(frame, self.task)
def predict(train_id='test_file'):
    """Score one hard-coded question/sentence pair with a saved DSSM model.

    Loads the preprocessor and model saved under ``train_id`` and prints
    the predicted score as ``{'score': ...}``.
    """
    query = 'how did apollo creed die'
    doc = "Urban legend states that Apollo Creed's name is a wordplay on the Apostles' Creed , a statement of belief used in Christian churches."
    frame = pd.DataFrame(data={'text_left': [query],
                               'text_right': [doc],
                               'label': [0]})
    preprocessor = mz.load_preprocessor(
        ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id
        + '.dssm_preprocessor')
    processed = preprocessor.transform(mz.pack(frame))
    model = mz.load_model(
        ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.dssm_model')
    score = float(model.predict(processed[:10].unpack()[0])[0][0])
    print({'score': score})
def DSSM(model):
    # Flask endpoint: score a text pair with a saved MatchZoo model.
    # ``model`` is the model-type suffix (e.g. 'dssm'); the request body
    # must carry JSON with 'text1', 'text2', and 'train_id'.
    # Returns JSON {'score': <float>}.
    request_data = json.loads(request.data.decode('utf-8'))
    q = request_data['text1']
    d = request_data['text2']
    train_id = request_data['train_id']
    # Dummy label 0 — mz.pack requires a label column even for prediction.
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor_suffix = '.' + model + '_preprocessor'
    preprocessor = mz.load_preprocessor(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + preprocessor_suffix)
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    # Reset the Keras/TF graph so repeated requests don't accumulate state.
    keras.backend.clear_session()
    model_suffix = '.' + model + '_model'
    # NOTE: rebinds the parameter `model` from the type string to the
    # loaded model object.
    model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    # The triple-quoted literal below is dead code kept from a previous
    # revision (a DRMM-specific scoring path); it is never executed.
    '''
    if model != 'drmm':
        model_suffix = '.' + model + '_model'
        model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
        predict_score = float(model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
        ret_dict = { 'score': predict_score }
    else:
        glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
        embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        pred_generator = mz.HistogramDataGenerator(data_pack=predict_pack_processed,
                                                   embedding_matrix=embedding_matrix,
                                                   bin_size=30,
                                                   hist_mode='LCH')
        test_x, test_y = pred_generator[:]
        keras.backend.clear_session()
        model_suffix = '.' + model + '_model'
        model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
        predict_score = float(model.predict(test_x)[0][0])
        ret_dict = { 'score': predict_score }
    '''
    return jsonify(ret_dict)
def _read_data(path, stage):
    """Load a Quora-style duplicate-question TSV.

    Train/dev rows keep question ids and an integer label; test rows keep
    only the two question texts. Malformed/incomplete rows are dropped.
    """
    raw = pd.read_csv(path, sep='\t', error_bad_lines=False, dtype=object)
    raw = raw.dropna(axis=0, how='any').reset_index(drop=True)
    if stage in ['train', 'dev']:
        frame = pd.DataFrame({
            'id_left': raw['qid1'],
            'id_right': raw['qid2'],
            'text_left': raw['question1'],
            'text_right': raw['question2'],
            'label': raw['is_duplicate'].astype(int),
        })
    else:
        frame = pd.DataFrame({
            'text_left': raw['question1'],
            'text_right': raw['question2'],
        })
    return matchzoo.pack(frame)
def predict_similarity_scores(tweets,q,n_top):
    # Score every tweet in `tweets` against query `q` with a saved
    # MatchPyramid model and return `tweets` sorted by score descending.
    # Mutates `tweets` in place (sets 'id_str' as index, adds 'score').
    # NOTE(review): `n_top` is accepted but never used — confirm whether
    # truncating to the top-n results was intended.
    keras.backend.clear_session()
    pre=mz.engine.base_preprocessor.load_preprocessor('./search/Preprocessors/MatchPy_full_fasttext')
    model=mz.engine.base_model.load_model('./search/Models/MatchPy_full_fasttext')
    tokenizer = RegexpTokenizer(r'\w+')  # NOTE(review): unused below
    rows=[]
    for id, tweet in tweets.iterrows():
        rows+=[{
            'id_left': 1,  # single query => constant left id
            'text_left': q,
            'id_right': tweet['id_str'],
            'text_right': tweet['processed_text'],
            'label':1  # dummy label required by mz.pack
        }]
    if len(rows)==0:
        return None
    data=pd.DataFrame.from_dict(rows)
    data_pack = mz.pack(data)
    del data
    data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
    predict_pack_processed=pre.transform(data_pack)
    predict_generator = mz.DPoolDataGenerator(predict_pack_processed,
                                              fixed_length_left=10,
                                              fixed_length_right=40,
                                              batch_size=20)
    logging.info('\n# Predictiong...\n')
    pred_x, pred_y = predict_generator[:]
    predictions=model.predict(pred_x)
    del data_pack
    i=0
    tweets.set_index('id_str',inplace=True)
    tweets['score']=0
    # Map each prediction back to its tweet via the right-side id.
    # NOTE(review): assumes `pred_x` can be framed as exactly the two id
    # columns in the same row order as `predictions` — confirm against
    # DPoolDataGenerator's output format.
    x=pd.DataFrame(pred_x,columns=['id_left','id_right'])
    for index,row in x.iterrows():
        tweets.loc[row['id_right'],'score']=predictions[i][0]
        i+=1
    del x
    tweets=tweets.sort_values(['score'],ascending=[False])
    # eliminate negative scores (original note, translated from French)
    logging.info('\n# Prediction [OK]\n')
    return tweets
def _read_data(path, stage, task):
    """Load a pre-tokenised question-pair CSV for the given task.

    Uses the ``*_cut`` (segmented) text columns; labels are attached only
    for train/dev. Malformed/incomplete rows are dropped.
    """
    raw = pd.read_csv(path, error_bad_lines=False, dtype=object)
    raw = raw.dropna(axis=0, how='any').reset_index(drop=True)
    if stage in ['train', 'dev']:
        frame = pd.DataFrame({
            'text_left': raw['question1_cut'],
            'text_right': raw['question2_cut'],
            'label': raw['label'].astype(int),
        })
    else:
        frame = pd.DataFrame({
            'text_left': raw['question1_cut'],
            'text_right': raw['question2_cut'],
        })
    return matchzoo.pack(frame, task)
def _read_data(path):
    """Read a JSON-lines NLI file (premise/hypothesis/label) into a DataPack.

    :param path: Path to a UTF-8 JSON-lines file with 'premise',
        'hypothesis', and 'label' keys per line.
    :return: ``matchzoo.pack`` DataPack with incomplete rows dropped.
    """
    premises = []
    hypotheses = []
    labels = []
    # FIX: use a context manager so the file handle is closed; the unused
    # idx_list accumulator and the leftover debug print(df) were removed.
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            record = json.loads(line)
            premises.append(record["premise"])
            hypotheses.append(record["hypothesis"])
            labels.append(record["label"])
    df = pd.DataFrame({
        'text_left': premises,
        'text_right': hypotheses,
        'label': labels
    })
    df = df.dropna(axis=0, how='any').reset_index(drop=True)
    return matchzoo.pack(df)
def pairs_to_datapack(pairs, task='classification'):
    """Convert entity-pair tuples into a MatchZoo DataPack.

    Each pair is (pid, left_text, right_text, left_entity, right_entity,
    label); text ids are formed as ``pid + '_' + entity_label``.
    """
    columns = {
        'text_left': [],
        'text_right': [],
        'id_left': [],
        'id_right': [],
        'label': [],
    }
    for pair in pairs:
        columns['text_left'].append(pair[1])
        columns['id_left'].append(pair[0] + '_' + pair[3])
        columns['text_right'].append(pair[2])
        columns['id_right'].append(pair[0] + '_' + pair[4])
        columns['label'].append(pair[5])
    return mz.pack(pd.DataFrame(columns), task)
def load_data(stage: str = 'train', task: str = 'ranking',
              path='/data/CORPUS/tweets/train_data_2016_new.csv'
              ) -> typing.Union[mz.DataPack, tuple]:
    """
    Load data.

    :param stage: One of `train`, `dev`, and `test`.
    :param task: Could be one of `ranking`, `classification` or a
        :class:`mz.engine.BaseTask` instance.
    :param path: CSV with topic/tweet columns (index in column 0).
    :return: A DataPack if `ranking`, a tuple of (DataPack, classes) if
        `classification`.
    """
    if stage not in ('train', 'dev', 'test'):
        raise ValueError(f"{stage} is not a valid stage."
                         f"Must be one of `train`, `dev`, and `test`.")
    if task == 'ranking':
        task = mz.tasks.Ranking()
    if task == 'classification':
        task = mz.tasks.Classification()
    table = pd.read_csv(path, index_col=0)
    # Rename the dataset-specific columns to what MatchZoo expects.
    frame = pd.DataFrame({
        "text_left": table['topic_text'],
        'text_right': table['tweet_text'],
        'id_left': table['topic_id'],
        'id_right': table['tweet_id'],
        'label': table['label']
    }).reset_index()
    data_pack = mz.pack(frame)
    if isinstance(task, mz.tasks.Ranking):
        data_pack.relation['label'] = \
            data_pack.relation['label'].astype('float32')
        return data_pack
    if isinstance(task, mz.tasks.Classification):
        data_pack.relation['label'] = data_pack.relation['label'].astype(int)
        return data_pack.one_hot_encode_label(num_classes=2), [False, True]
    raise ValueError(f"{task} is not a valid task.")
def load_data(stage='train', task='ranking'):
    """
    Load WikiQA data.

    :param stage: One of `train`, `dev`, and `test`.
    :param task: Could be one of `ranking`, `classification` or a
        :class:`matchzoo.engine.BaseTask` instance.
    :return: A DataPack if `ranking`, a tuple of (DataPack, classes) if
        `classification`.

    Example:
        >>> import matchzoo as mz
        >>> stages = 'train', 'dev', 'test'
        >>> tasks = 'ranking', 'classification'
        >>> for stage in stages:
        ...     for task in tasks:
        ...         _ = mz.datasets.toy.load_data(stage, task)
    """
    if stage not in ('train', 'dev', 'test'):
        raise ValueError(f"{stage} is not a valid stage."
                         f"Must be one of `train`, `dev`, and `test`.")
    if task == 'ranking':
        task = matchzoo.tasks.Ranking()
    if task == 'classification':
        task = matchzoo.tasks.Classification()
    # One CSV per stage, stored next to this module.
    csv_path = Path(__file__).parent.joinpath(f'{stage}.csv')
    data_pack = matchzoo.pack(pd.read_csv(csv_path, index_col=0))
    if isinstance(task, matchzoo.tasks.Ranking):
        data_pack.relation['label'] = \
            data_pack.relation['label'].astype('float32')
        return data_pack
    if isinstance(task, matchzoo.tasks.Classification):
        data_pack.relation['label'] = data_pack.relation['label'].astype(int)
        return data_pack.one_hot_encode_label(num_classes=2), [False, True]
    raise ValueError(f"{task} is not a valid task."
                     f"Must be one of `Ranking` and `Classification`.")
def _read_data(path):
    """Read COPA-style JSON lines into a two-rows-per-example DataPack.

    Each premise (plus a cause/effect question) is paired with BOTH
    candidate choices; the gold choice gets label 1, the other 0.
    """
    premise_questions = []
    choices = []
    labels = []
    left_ids = []
    right_ids = []
    for line in open(path, "r", encoding="utf-8"):
        record = json.loads(line)
        if record["question"] == "cause":
            question = "What was the cause of this?"
        else:
            question = "What happened as a result?"
        prompt = record["premise"] + " " + question
        gold = int(record["label"])
        for choice_idx, choice_key in enumerate(("choice1", "choice2")):
            premise_questions.append(prompt)
            choices.append(record[choice_key])
            labels.append(1 if choice_idx == gold else 0)
            left_ids.append(record["idx"])
            right_ids.append(str(record["idx"]) + "_" + str(choice_idx))
    df = pd.DataFrame({
        'text_left': premise_questions,
        'text_right': choices,
        'id_left': left_ids,
        'id_right': right_ids,
        'label': labels
    })
    print(df)  # debug echo kept from the original
    df = df.dropna(axis=0, how='any').reset_index(drop=True)
    return matchzoo.pack(df)
def test_data(topic_number, cord_uids, query, meta, msp):
    """Build a zero-labelled DataPack pairing ``query`` with CORD documents.

    For each cord_uid, the document text is loaded from the path mapped by
    its sha (via ``meta`` and ``msp``) and run through ``text``.
    """
    rows = []
    for cord_uid in cord_uids:
        sha = meta[meta['cord_uid'] == cord_uid]['sha'].values[0]
        with open(msp[sha], 'r') as fh:
            doc_text = text(fh.read())
        rows.append((query, str(topic_number), doc_text, cord_uid, 0))
    df = pd.DataFrame(
        rows,
        columns=['text_left', 'id_left', 'text_right', 'id_right', 'label'])
    return mz.pack(df)
def load_data(data_path):
    """Load a headerless TSV of (id_left, text_left, id_right, text_right,
    label) rows into a MatchZoo DataPack."""
    raw = pd.read_csv(data_path, sep='\t', header=None)
    frame = pd.DataFrame(
        raw.values,
        columns=['id_left', 'text_left', 'id_right', 'text_right', 'label'])
    return mz.pack(frame)
import matchzoo as mz import os print('matchzoo version', mz.__version__) DATA_DIR = '/data/disk2/private/guozhipeng/syq/coliee/Case_Law/format/matchzoo' ranking_task = mz.tasks.Ranking(losses=mz.losses.RankHingeLoss()) ranking_task.metrics = [ mz.metrics.Precision(k=5), mz.metrics.Recall(k=5), mz.metrics.F1(k=5) ] print("`ranking_task` initialized with metrics", ranking_task.metrics) train_pack_raw = mz.pack( pd.read_csv(os.path.join(DATA_DIR, 'train_bm25.csv'), index_col=False, encoding='utf8'), 'ranking') dev_pack_raw = mz.pack( pd.read_csv(os.path.join(DATA_DIR, 'dev_bm25.csv'), index_col=False, encoding='utf8'), 'ranking') test_pack_raw = mz.pack( pd.read_csv(os.path.join(DATA_DIR, 'test_bm25.csv'), index_col=False, encoding='utf8'), 'ranking') print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`') preprocessor = mz.models.DUET.get_default_preprocessor( filter_mode='df', filter_low_freq=2,
import matchzoo as mz
import pandas as pd

print(mz.__version__)
# Pack the raw corpus CSV into a MatchZoo DataPack.
data_pack = mz.pack(pd.read_csv('match-zoo-corpus-top-1w.csv'))
print(data_pack[-10:])
# Ranking expects float labels.
data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
frame = data_pack.frame
task = mz.tasks.Ranking()
# The same pack serves as both train and test input here; the commented
# alternatives would load the toy dataset instead.
train_raw = data_pack  # mz.datasets.toy.load_data(stage='train', task=task)
test_raw = data_pack  # mz.datasets.toy.load_data(stage='test', task=task)
# Load a previously trained model from disk.
model = mz.load_model('step-2-mz-model')
preprocessor = mz.preprocessors.BasicPreprocessor()
preprocessor.fit(train_raw, verbose=0)  ## init preprocessor inner state.
# train_processed = preprocessor.transform(train_raw, verbose=5)
test_processed = preprocessor.transform(test_raw, verbose=0)
# x, y = train_processed.unpack()
test_x, test_y = test_processed.unpack()
# NOTE(review): the preprocessor is re-fit here rather than loaded from the
# training run — confirm its vocabulary matches the saved model's.
results = model.predict(test_x)
print(type(results))
print(len(results))
print(results)
# Echo the first 20 prediction indices for a quick sanity check.
for idx, item in enumerate(results[:20]):
    print('*' * 100)
    print(idx)