def read_file(filename):
    """Read a file and return its non-empty lines, stripped of whitespace."""
    with open_file(filename) as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]
def test(filename):
    """Evaluate the saved model on a labeled blog file.

    Reads tab-separated ``topic<TAB>blog`` lines from *filename*, embeds
    each blog with the restored TensorFlow model, ranks all topic features
    by cosine similarity, and prints top-1 / top-5 accuracy plus elapsed
    time.

    :param filename: path to the evaluation file (one topic/blog pair per line)
    """
    topic_feature_path = 'output/topics_feature.npy'
    # Build and cache the topic feature matrix on disk the first time.
    if not os.path.exists(topic_feature_path):
        topic_to_features(topics_dir, topic_feature_path)
    topics_feature = np.load(topic_feature_path)

    real_topics, blogs = [], []
    blog_file = open_file(filename, 'r')
    try:
        for line in blog_file:
            fields = line.strip().split('\t')
            # Skip malformed lines (fewer than two fields) explicitly
            # instead of swallowing every exception with a bare ``except``.
            if len(fields) < 2:
                continue
            real_topics.append(fields[0])
            blogs.append(fields[1])
    finally:
        # Close the handle even if iteration raises (original leaked it).
        blog_file.close()

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Test...')
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_dir, do_lower_case=True)
    blogs_pad = sentences2id(blogs, tokenizer, config.seq_length)
    batch_data = batch_iter(blogs_pad, 1)  # batch size 1: one blog per step

    start_time = time.time()
    top1_count = 0
    top5_count = 0  # renamed from the typo'd ``top5_count5``
    for i, blog_batch in enumerate(batch_data):
        blog_feature = session.run(model.blog_feature, feed_dict={
            model.blog: blog_batch,
            model.keep_prob: 1.0
        })
        dis = cosin(topics_feature, blog_feature)
        # Indices of the 5 most similar topics, descending similarity.
        dis_idx = np.argsort(-dis)[:5]
        predict_topics = [topics[idx] for idx in dis_idx]
        if real_topics[i] in predict_topics:
            top5_count += 1
        if real_topics[i] == predict_topics[0]:
            top1_count += 1

    acc1 = float(top1_count) / len(real_topics)
    acc5 = float(top5_count) / len(real_topics)
    print("top1-acc:%f\ttop5-acc:%f" % (acc1, acc5))
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def gen_hard_examples(input_file=train_dir, output_file='model_train.data'):
    """Mine hard-negative topics for each training blog.

    For every precomputed blog feature, ranks all topic features by cosine
    similarity and writes one line per blog: the blog's true topic followed
    by its 128 most similar ("hard") topics, tab-separated.

    :param input_file: labeled file whose first tab field is the true topic
    :param output_file: destination for the generated training data
    """
    topics_feature = np.load('data/topics_feature.npy')
    blogs_feature = np.load('data/blogs_feature.npy')

    # Collect the ground-truth topic of every blog, in file order.
    # ``with`` guarantees the handle is closed even if iteration raises
    # (the original leaked both file handles on exceptions).
    real_topics = []
    with open_file(input_file, 'r') as blog_file:
        for line in blog_file:
            real_topics.append(line.split('\t')[0])
    print(len(real_topics))

    print('Hard examples generating...')
    with open_file(output_file, 'w') as out:
        for i, blog_feature in enumerate(blogs_feature):
            dis = cosin(topics_feature, blog_feature)
            # 128 most similar topics, descending similarity.
            dis_idx = np.argsort(-dis)[:128]
            hard_topics = [topics[idx] for idx in dis_idx]
            out.write('%s\t%s\n' % (real_topics[i], '\t'.join(hard_topics)))
    print("done!")
def read_file2(filename):
    """Read a labeled data file.

    Each line is ``label<TAB>content``; returns contents and labels as two
    parallel lists.  Lines that do not split into exactly two fields are
    skipped.

    :param filename: path of the file to read
    :return: ``(contents, labels)`` tuple of lists
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                # ValueError: line lacked exactly one tab-separated
                # label/content pair.  The original bare ``except: pass``
                # also hid unrelated bugs; keep the try body minimal.
                label, content = line.strip().split('\t')
            except ValueError:
                continue
            if content:
                contents.append(native_content(content))
                labels.append(native_content(label))
    return contents, labels
def extra_result_file(test_dir, y):
    """Combine predictions with test data into the final JSON output.

    Reads the test file, maps each predicted class id in *y* back to its
    category name, post-processes the structured medical categories
    (pregnancy counts, gestational age, Apgar score) into per-field dicts,
    groups everything by category, and dumps the result to ``write.json``.

    :param test_dir: path of the labeled test file (label TAB content)
    :param y: iterable of predicted class ids, aligned with the file's lines
    """
    get_data = []
    with open_file(test_dir) as f:
        for line in f:
            label, content = line.strip().split('\t')
            get_data.append(content)

    id_to_cate = id_to_category()
    cate_list = [id_to_cate[str(index)] for index in y]

    # Categories needing numeric extraction, mapped to (extractor, field
    # names).  This table replaces three nearly identical if/elif branches.
    special = {
        "孕产次": (pc.extract_yunzhou_yunci, ("孕次", "产次")),
        "孕周": (pc.extract_yunzhou_yunci, ("周数", "天数")),
        "阿氏评分": (pc.extract_alpha, ("1分钟", "5分钟", "10分钟")),
    }

    # defaultdict(list): an unseen key starts as an empty list automatically.
    zipped = defaultdict(list)
    for key, value in zip(cate_list, get_data):
        if key in special:
            extractor, tags = special[key]
            zipped[key].append(dict(zip(tags, extractor(value))))
        else:
            # Any other category is stored verbatim.
            zipped[key].append(value)

    with open('write.json', 'w', encoding="utf-8") as f:
        json.dump(zipped, f, ensure_ascii=False)