def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path, args):
    """Build listwise ranking features for the TRAINING split and append them
    to ``to_file_path`` in SVMrank format (``label qid:<q> k:v ... # doc-<d>``).

    For each training question it emits one positive row (the first gold
    answer) and ``args.ns_amount`` negative rows, each row's feature vector
    being the output of the ``dropout_con`` layer of the checkpointed
    negative-sampling model.

    NOTE(review): this module defines ``get_train_data`` twice; this first
    definition is shadowed by the later one (which has a different
    signature). Confirm which one callers actually use.

    :param data_type: tag used to locate ``ckpt/nn_weights_<data_type>.h5``
    :param w2v_model: word2vec model consumed by ``sentence2vec``
    :param qa_file: even lines = questions, odd lines = 1-based answer doc ids
    :param doc_file: one document per non-empty line
    :param to_file_path: output file, opened in append mode
    :param args: namespace providing ns_amount, output_length, input_dim,
        output_dim, hidden_dim, learning_rate, drop_rate
    """
    logger.info("preprocessing...")
    ns_amount = args.ns_amount

    # --- Vectorize questions; collect gold answer ids from odd lines. ---
    questions = []
    answers = []
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        line = line.strip().lower()
        if line != "" and i % 2 == 0:
            words = word_tokenize(remove_punc(line))
            input_length = max(len(words), input_length)
            questions.append(words)
        elif line != "" and i % 2 == 1:
            ans = []
            for a in line.strip().split(" "):
                if a != "":
                    # Raw data counts from 1; convert to 0-based doc ids.
                    ans.append(int(a) - 1)
            answers.append(ans)
    question_vecs = [sentence2vec(w2v_model, q_words, input_length)
                     for q_words in questions]
    print("len(question_vecs)", len(question_vecs))

    # --- Vectorize documents. ---
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip().lower()
        if line != "":
            words = word_tokenize(remove_punc(line))
            output_length = max(len(words), output_length)
            docs.append(words)
    # The scanned maximum is overridden by the configured length so the
    # padded doc matrices match what the checkpointed model expects.
    output_length = args.output_length
    doc_vecs = [sentence2vec(w2v_model, d_words, output_length)
                for d_words in docs]
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))

    # --- Frequency of each doc across the gold answers. ---
    doc_count = {}
    for ans in answers:
        for a in ans:
            doc_count[a] = doc_count.get(a, 0) + 1
    # Normalized weight in (0, 1]. ``default=0`` guards the empty case
    # (original raised ZeroDivisionError only if doc_count was non-empty
    # with t_max == 0, which cannot happen; the loop below is a no-op when
    # doc_count is empty).
    t_max = max(doc_count.values(), default=0)
    doc_weight = {k: v / t_max for k, v in doc_count.items()}

    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))
    # Shuffle the question order once up front.
    qa_index = list(range(total))
    random.shuffle(qa_index)

    # Model loading is loop-invariant: hoisted out of the step loop
    # (the original reloaded identical weights on every step).
    logger.info("loading weights: ckpt/nn_weights_%s.h5" % data_type)
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights("ckpt/nn_weights_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)

    step = 0
    batch_size = 200
    # BUG FIX: original condition was ``<=``, which scheduled an empty final
    # batch (and an empty predict() call) whenever train_num % 200 == 0.
    while step * batch_size < train_num:
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        qid_list = []
        label_list = []
        aid_list = []
        logger.info("step: %d" % step)
        end = min(train_num, (step + 1) * batch_size)
        for ss in range(step * batch_size, end):
            i = qa_index[ss]
            logger.info("question: %d" % i)
            # Positive row: the first gold answer of this question.
            qid_list.append(i)
            label_list.append(1)
            q_encoder_input.append(question_vecs[i])
            pos_aid = answers[i][0]
            aid_list.append(pos_aid)
            r_decoder_input.append(doc_vecs[pos_aid])
            weight_data_r.append(doc_weight[pos_aid])
            # ns_amount unrelated answers for the positive row.
            # BUG FIX: the count was hard-coded to 10; any ns_amount != 10
            # made the reshape below fail.
            neg_ids = get_randoms(list(doc_weight.keys()), [pos_aid], ns_amount)
            w_decoder = [doc_vecs[nid] for nid in neg_ids]
            w_weight = [doc_weight[nid] for nid in neg_ids]
            # NOTE(review): reshape (not transpose) reproduces the original,
            # training-time memory layout — do not "fix" to a transpose.
            w_decoder_input.append(
                np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount))
            weight_data_w.append(np.array(w_weight).reshape((1, ns_amount)))
            # Negative rows: each sampled unrelated answer becomes its own row.
            for aaid in neg_ids:
                qid_list.append(i)
                label_list.append(0)
                aid_list.append(aaid)
                q_encoder_input.append(question_vecs[i])
                r_decoder_input.append(doc_vecs[aaid])
                weight_data_r.append(doc_weight[aaid])
                # BUG FIX: the original excluded a stale loop variable here
                # (``aid`` left over from the previous sampling loop);
                # exclude the current candidate instead.
                sub_neg_ids = get_randoms(list(doc_weight.keys()), [aaid], ns_amount)
                sub_decoder = [doc_vecs[nid] for nid in sub_neg_ids]
                sub_weight = [doc_weight[nid] for nid in sub_neg_ids]
                w_decoder_input.append(
                    np.array(sub_decoder).reshape(output_length, args.input_dim, ns_amount))
                weight_data_w.append(np.array(sub_weight).reshape((1, ns_amount)))

        logger.info("predicting...")
        res = new_dnn_model.predict([q_encoder_input, r_decoder_input,
                                     w_decoder_input, weight_data_r,
                                     weight_data_w])
        # Append one SVMrank-formatted line per row.
        with open(to_file_path, "a") as f:
            for row_idx in range(len(res)):
                row = res[row_idx]
                feature_str = ''
                for j in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
                label = label_list[row_idx]
                qid = qid_list[row_idx]
                doc_id = aid_list[row_idx]
                line = "%d qid:%d%s # doc-%d \n" % (label, qid, feature_str, doc_id)
                f.write(line)
        print("saved to:", to_file_path)
        logger.info("step:%d added" % step)
        step += 1
    logger.info("saved to: %s" % to_file_path)
def get_train_data(data_type, w2v_model, ckpt_path, qa_file, doc_file, to_file_path, args, step=0):
    """Build ranking features for the held-out questions (the last 10% of the
    first 1000 QA pairs) and append them to ``to_file_path`` in SVMrank format.

    For each held-out question, every document becomes one candidate row
    (gold answers first, labeled 1; all other docs labeled 0); the row's
    features are the ``dropout_con`` layer output of the checkpointed model.
    Prediction is chunked (1000 candidates at a time) to bound memory.

    NOTE(review): this definition shadows the earlier ``get_train_data``
    (different signature); confirm which one callers actually use.
    ``step`` is accepted for interface compatibility but is unused here.

    :param data_type: dataset tag (kept for signature compatibility/logging)
    :param w2v_model: word2vec model consumed by ``sentence2vec``
    :param ckpt_path: path to the trained model weights (.h5)
    :param qa_file: even lines = questions, odd lines = 1-based answer doc ids
    :param doc_file: one document per non-empty line
    :param to_file_path: output file; if it already exists, the run is skipped
    :param args: namespace providing input_dim, output_dim, hidden_dim,
        learning_rate, drop_rate
    """
    # Resume-friendly: never overwrite an existing feature file.
    if os.path.exists(to_file_path):
        logger.info("file exists: %s" % to_file_path)
        return
    logger.info("preprocessing...")
    ns_amount = 10  # must match the ns_amount the checkpoint was trained with

    # --- Vectorize questions; collect gold answer ids from odd lines. ---
    questions = []
    answers = []
    input_length = 0
    with open(qa_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        # Only the first 1000 QA pairs (2000 lines) are used.
        if i >= 2000:
            break
        line = line.strip().lower()
        if line != "" and i % 2 == 0:
            words = word_tokenize(remove_punc(line))
            input_length = max(len(words), input_length)
            questions.append(words)
        elif line != "" and i % 2 == 1:
            # Raw data counts from 1; convert to 0-based doc ids.
            ans = [int(a) - 1 for a in line.strip().split(" ") if a != ""]
            answers.append(ans)
    question_vecs = [sentence2vec(w2v_model, q_words, input_length)
                     for q_words in questions]
    print("len(question_vecs)", len(question_vecs))

    # --- Vectorize documents. ---
    docs = []
    output_length = 0
    with open(doc_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip().lower()
        if line != "":
            words = word_tokenize(remove_punc(line))
            output_length = max(len(words), output_length)
            docs.append(words)
    # Fixed padded length expected by the checkpointed model.
    output_length = 1000
    doc_vecs = [sentence2vec(w2v_model, d_words, output_length)
                for d_words in docs]
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))

    # --- Answer frequency for every doc (0 for never-answered docs). ---
    doc_count = {doc_idx: 0 for doc_idx in range(len(docs))}
    for ans in answers:
        for a in ans:
            if a in doc_count:
                doc_count[a] += 1
    # Normalize to [0, 1]. ``or 1`` guards ZeroDivisionError in the
    # degenerate all-zero case (no answers at all).
    t_max = max(doc_count.values(), default=0) or 1
    doc_weight = {k: v / t_max for k, v in doc_count.items()}

    logger.info("loading weights...")
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights(ckpt_path)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)

    total = len(question_vecs)
    train_num = int(total * 0.9)
    # Shuffling deliberately disabled in the original so the evaluation
    # order is deterministic.
    qa_index = list(range(total))
    # random.shuffle(qa_index)

    chunk_size = 1000
    for ss in range(train_num, total):
        i = qa_index[ss]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        cur_answers = answers[i]
        # Candidate order: gold answers first, then every remaining doc.
        doc_list_ordered = [a for a in cur_answers]
        for aid in doc_weight.keys():
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)
        label_list = []
        aid_list = []
        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))
        for aid in doc_list_ordered:
            aid_list.append(aid)
            label_list.append(1 if aid in cur_answers else 0)
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # ns_amount negatives sampled outside the gold answers.
            # FIX: use a distinct loop variable (the original re-used ``aid``,
            # shadowing the outer candidate loop's variable).
            neg_ids = get_randoms(list(doc_weight.keys()), cur_answers, ns_amount)
            w_decoder = [doc_vecs[nid] for nid in neg_ids]
            w_weight = [doc_weight[nid] for nid in neg_ids]
            # NOTE(review): reshape (not transpose) reproduces the original,
            # training-time memory layout — do not "fix" to a transpose.
            w_decoder_input.append(
                np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount))
            weight_data_w.append(np.array(w_weight).reshape((1, ns_amount)))
        logger.info("now:%d , predicting question: %d" % (ss, i))
        print("now:%d , predicting question: %d" % (ss, i))
        end = len(q_encoder_input)
        for cur in range(0, end, chunk_size):
            print("cur:%d / %d" % (cur, end))
            a = q_encoder_input[cur:cur + chunk_size]
            b = r_decoder_input[cur:cur + chunk_size]
            c = w_decoder_input[cur:cur + chunk_size]
            d = weight_data_r[cur:cur + chunk_size]
            e = weight_data_w[cur:cur + chunk_size]
            res = new_dnn_model.predict([a, b, c, d, e])
            with open(to_file_path, "a") as f:
                for j in range(len(res)):
                    row = res[j]
                    feature_str = ''
                    for k in range(len(row)):
                        feature_str = feature_str + (" %d:%.9f" % (k + 1, row[k]))
                    # BUG FIX: the original indexed label_list/aid_list with
                    # the within-chunk offset ``j``; every chunk after the
                    # first wrote wrong labels and doc ids. Use the absolute
                    # candidate index ``cur + j``.
                    label = label_list[cur + j]
                    doc_id = aid_list[cur + j]
                    line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str, doc_id)
                    f.write(line)
            print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)