import os
import time
import json
from collections import defaultdict

import numpy as np
import tensorflow as tf

# Note: these snippets also rely on project-level globals not shown here
# (e.g. model, config, topics, save_path, train_dir, vocab_dir, topics_dir,
# pc, tokenization, batch_iter, get_time_dif, topic_to_features).

def read_file(filename):
    """Read a file and return its non-empty, stripped lines."""
    items = []
    with open_file(filename) as f:
        for line in f:
            item = line.strip()
            if item:
                items.append(item)
    return items
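# Hypothetical sketch of the open_file() helper used throughout these
# examples -- its definition is not shown in this excerpt. In similar
# text-classification codebases it simply wraps open() with a fixed UTF-8
# encoding (an assumption, not the confirmed original).
def open_file(filename, mode='r'):
    """Open a file in UTF-8 (assumed behavior of the original helper)."""
    return open(filename, mode, encoding='utf-8', errors='ignore')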
def test(filename):
    topic_feature_path = 'output/topics_feature.npy'
    # Build the topic feature matrix once and cache it on disk.
    if not os.path.exists(topic_feature_path):
        topic_to_features(topics_dir, topic_feature_path)

    topics_feature = np.load(topic_feature_path)

    real_topics, blogs = [], []
    with open_file(filename, 'r') as blog_file:
        for line in blog_file:
            try:
                fields = line.strip().split('\t')
                topic = fields[0]
                blog = fields[1]
                real_topics.append(topic)
                blogs.append(blog)
            except IndexError:
                # Skip malformed lines without a tab-separated topic/blog pair.
                pass

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model
    print('Testing...')

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_dir,
                                           do_lower_case=True)

    blogs_pad = sentences2id(blogs, tokenizer, config.seq_length)
    batch_data = batch_iter(blogs_pad, 1)

    start_time = time.time()
    top1_count = 0
    top5_count = 0
    for i, blog_batch in enumerate(batch_data):
        blog_feature = session.run(model.blog_feature,
                                   feed_dict={
                                       model.blog: blog_batch,
                                       model.keep_prob: 1.0
                                   })

        # Rank all topics by cosine similarity; keep the five closest.
        dis = cosin(topics_feature, blog_feature)
        dis_idx = np.argsort(-dis)[:5]

        predict_topics = [topics[j] for j in dis_idx]
        if real_topics[i] in predict_topics:
            top5_count += 1
        if real_topics[i] == predict_topics[0]:
            top1_count += 1

    acc1 = float(top1_count) / len(real_topics)
    acc5 = float(top5_count) / len(real_topics)
    print("top1-acc:%f\ttop5-acc:%f" % (acc1, acc5))

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def gen_hard_examples(input_file=train_dir, output_file='model_train.data'):
    topics_feature = np.load('data/topics_feature.npy')
    blogs_feature = np.load('data/blogs_feature.npy')

    real_topics = []
    with open_file(input_file, 'r') as blog_file:
        for line in blog_file:
            topic = line.split('\t')[0]
            real_topics.append(topic)
    print(len(real_topics))

    print('Generating hard examples...')
    with open_file(output_file, 'w') as f:
        for i, blog_feature in enumerate(blogs_feature):
            dis = cosin(topics_feature, blog_feature)
            # Keep the 128 most similar topics as hard-negative candidates.
            dis_idx = np.argsort(-dis)[:128]
            hard_topics = [topics[j] for j in dis_idx]

            f.write('%s\t%s\n' % (real_topics[i], '\t'.join(hard_topics)))
    print("done!")
Example #4
def read_file2(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(native_content(content))
                    labels.append(native_content(label))
            except ValueError:
                # Skip lines that do not split into exactly (label, content).
                pass
    return contents, labels
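# Hypothetical sketch of native_content(); it is not shown in this excerpt.
# In similar codebases it smooths over Python 2 bytes vs. Python 3 str, and
# under Python 3 it can be a no-op.
def native_content(content):
    """Return content as a native str (assumed no-op on Python 3)."""
    return content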
def extra_result_file(test_dir, y):
    """
    Aggregate the prediction results into the final JSON output.
    :param test_dir: path to a tab-separated (label, content) test file
    :param y: predicted class index for each line of test_dir
    :return: None (writes write.json)
    """
    get_data = []
    with open_file(test_dir) as f:
        for line in f:
            label, content = line.strip().split('\t')
            get_data.append(content)
    id_to_cate = id_to_category()
    cate_list = [id_to_cate[str(index)] for index in y]

    # defaultdict takes a type as its argument; when a missing key is
    # accessed, an instance of that type is created as the default value.
    zipped = defaultdict(list)
    for (key, value) in zip(cate_list, get_data):
        # Handle gravidity/parity ("孕产次")
        if key == "孕产次":
            nums = pc.extract_yunzhou_yunci(value)
            tags = ("孕次", "产次")
            dictionary = dict(zip(tags, nums))
            zipped[key].append(dictionary)

        # Handle gestational age ("孕周")
        elif key == "孕周":
            nums = pc.extract_yunzhou_yunci(value)
            tags = ("周数", "天数")
            dictionary = dict(zip(tags, nums))
            zipped[key].append(dictionary)

        # Handle Apgar score ("阿氏评分")
        elif key == "阿氏评分":
            nums = pc.extract_alpha(value)
            tags = ("1分钟", "5分钟", "10分钟")
            dictionary = dict(zip(tags, nums))
            zipped[key].append(dictionary)

        # Any other category is stored directly.
        else:
            zipped[key].append(value)

    with open('write.json', 'w', encoding="utf-8") as f:
        json.dump(zipped, f, ensure_ascii=False)
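# Usage sketch (hypothetical call site, not from the source): y holds the
# predicted class index for each line of the test file, e.g. the argmax of
# a classifier's output; extra_result_file() then groups the extracted
# values per category and writes them to write.json.
#
#     y = session.run(model.y_pred_cls, feed_dict=feed_dict)
#     extra_result_file('data/test.txt', y)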