def get_test_keras_bert_iterator(data_path, file_name):
    it = get_test_data_iterator(data_path, file_name)
    # `pair_id` avoids shadowing the built-in `id`.
    for source, target, cat_source, cat_target, pair_id in it:
        # BERT input encodings for both sides of the pair.
        data_source = _get_indices(text=source)
        data_target = _get_indices(text=target)
        # Word-segment both texts for the lexical similarity features.
        seg_source = jieba.lcut(source)
        seg_target = jieba.lcut(target)
        bm25 = calculate_bm25_similarity(bm25Model, seg_source, seg_target)
        tf_cosine = calculate_tf_cosine_similarity(seg_source, seg_target)
        tfidf_cosine = calculate_tfidf_cosine_similarity(seg_source, seg_target, bm25Model.idf)
        yield (data_source['input_ids'], data_source['token_type_ids'], data_source['attention_mask'],
               data_target['input_ids'], data_target['token_type_ids'], data_target['attention_mask'],
               bm25, tf_cosine, tfidf_cosine, cat_source, cat_target, pair_id)
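
# Usage sketch, not part of the original module: `model`, its input ordering,
# and the single scalar output are assumptions; adjust to match how the Keras
# model was actually built. It shows how the twelve-tuple yielded above maps
# onto per-sample prediction inputs.
import numpy as np

def predict_test_file(model, data_path, file_name):
    results = {}
    it = get_test_keras_bert_iterator(data_path, file_name)
    for (src_ids, src_seg, src_mask, tgt_ids, tgt_seg, tgt_mask,
         bm25, tf_cos, tfidf_cos, cat_source, cat_target, pair_id) in it:
        # Wrap each field in a batch dimension of 1 so model.predict accepts it.
        inputs = [np.expand_dims(np.asarray(x), 0)
                  for x in (src_ids, src_seg, src_mask,
                            tgt_ids, tgt_seg, tgt_mask,
                            bm25, tf_cos, tfidf_cos, cat_source, cat_target)]
        results[pair_id] = float(model.predict(inputs, verbose=0)[0])
    return results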
def get_keras_bert_iterator(data_path, file_names, tokenizer):
    # Loop forever so the iterator can feed Keras training across epochs.
    while True:
        data_it = get_data_iterator(data_path, file_names)
        for source, target, cat_source, cat_target, labelA, labelB in data_it:
            # BERT input encodings for both sides of the pair.
            # NOTE: _get_indices relies on a module-level tokenizer; the
            # `tokenizer` argument is currently unused.
            data_source = _get_indices(text=source)
            data_target = _get_indices(text=target)
            # Word-segment both texts for the lexical similarity features.
            seg_source = jieba.lcut(source)
            seg_target = jieba.lcut(target)
            bm25 = calculate_bm25_similarity(bm25Model, seg_source, seg_target)
            tf_cosine = calculate_tf_cosine_similarity(seg_source, seg_target)
            tfidf_cosine = calculate_tfidf_cosine_similarity(seg_source, seg_target, bm25Model.idf)
            yield (data_source['input_ids'], data_source['token_type_ids'], data_source['attention_mask'],
                   data_target['input_ids'], data_target['token_type_ids'], data_target['attention_mask'],
                   bm25, tf_cosine, tfidf_cosine, cat_source, cat_target, labelA, labelB)
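
# Batching sketch, not part of the original module: it assumes _get_indices
# returns fixed-length (padded) encodings so each column stacks into a uniform
# array; `batch_size` and the 11-input / 2-label split are illustrative only.
import numpy as np

def get_batched_keras_bert_iterator(data_path, file_names, tokenizer, batch_size=32):
    it = get_keras_bert_iterator(data_path, file_names, tokenizer)
    while True:
        # The underlying iterator is infinite, so next() never raises StopIteration.
        rows = [next(it) for _ in range(batch_size)]
        # Transpose the list of 13-tuples into 13 per-field batch arrays.
        cols = [np.asarray(col) for col in zip(*rows)]
        # First eleven columns are model inputs; the last two are labelA/labelB.
        yield cols[:11], cols[11:]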