Example No. 1
def retrieve():
    import pickle as pkl
    print('retrieve...', file=sys.stderr)
    #1. read dictionary
    dictionary = Dictionary()
    dictionary.load_from_galago_dump(args.dict_file, args.dict_min_freq)

    #2. make snrm instance & load weight
    device = torch.device('cpu')
    snrm = SNRM(args).to(device)
    snrm.load_state_dict(torch.load(args.model_file))  ## load model
    snrm.eval()  ## set inference mode

    #3. read query data
    q_data = Triplet('query', args, dictionary)

    #4. read index
    inverted_index = InMemoryInvertedIndex(args.conv3_channel)
    inverted_index.load(args.index_file)

    #5. create the query data loader
    db_loader = DataLoader(dataset=q_data,
                           batch_size=1,
                           shuffle=False,
                           num_workers=0)

    #6. retrieve
    with torch.no_grad():
        result = dict()
        for k, (q_id, query) in enumerate(db_loader):
            query_repr = snrm(query.float())

            query_repr = query_repr.numpy()
            retrieval_scores = dict()
            for i in range(len(query_repr[0])):
                if query_repr[0][i] > 0.:
                    doc_rank = 0
                    for (did, weight) in inverted_index.index[i]:
                        #print('did=', did)
                        #print('weight=', weight)
                        docid = did[0]
                        if docid not in retrieval_scores:
                            retrieval_scores[docid] = 0.
                        retrieval_scores[docid] += query_repr[0][i] * weight
                        doc_rank += 1

            if k % 10 == 0:
                print(k, ' queries retrieved\r', file=sys.stderr, end='')
                #break

            qid = q_id[0]
            result[qid] = sorted(retrieval_scores.items(),
                                 key=lambda x: x[1], reverse=True)  # rank highest-scoring docs first
            print('qid=', qid)
            print('result=', result[qid])

    with open(args.retrieve_result_file, 'wb') as f:
        pkl.dump(result, f)
    print('>save result: ', args.retrieve_result_file, file=sys.stderr)
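
The scoring loop in retrieve() is just a sparse dot product: every non-zero dimension of the query representation pulls in the posting list stored under that dimension and adds its contribution to each document's score. A minimal self-contained sketch of that accumulation, using a toy dict in place of InMemoryInvertedIndex (the names and layout below are illustrative assumptions, not the project's classes):

# toy stand-ins: a sparse query representation and an inverted index mapping
# each latent dimension to a posting list of (doc_id, weight) pairs
query_repr = [0.0, 0.5, 0.0, 0.25]
index = {
    1: [('d1', 0.5), ('d3', 0.25)],
    3: [('d2', 0.75)],
}

retrieval_scores = {}
for dim, q_weight in enumerate(query_repr):
    if q_weight > 0.0:
        for doc_id, d_weight in index.get(dim, []):
            # accumulate this dimension's contribution to the dot product
            retrieval_scores[doc_id] = retrieval_scores.get(doc_id, 0.0) + q_weight * d_weight

# highest-scoring documents first, matching the per-query ranking retrieve() builds
ranked = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)
print(ranked)  # [('d1', 0.25), ('d2', 0.1875), ('d3', 0.125)]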
Example No. 2
def build_index():
    print('build index..', file=sys.stderr)
    #1. read dictionary
    dictionary = Dictionary()
    dictionary.load_from_galago_dump(args.dict_file, args.dict_min_freq)

    #2. make snrm instance & load weight
    device = torch.device('cpu')
    snrm = SNRM(args).to(device)
    snrm.load_state_dict(torch.load(args.model_file))  ## load model
    snrm.eval()  ## set inference mode

    #3. read document data
    doc_data = Triplet('doc', args, dictionary)

    #4. make index
    db_loader = DataLoader(dataset=doc_data,
                           batch_size=1,
                           shuffle=False,
                           num_workers=0)

    inverted_index = InMemoryInvertedIndex(
        args.conv3_channel)  ## the last conv channel size is the output representation dimension
    with torch.no_grad():
        for i, (doc_id, doc) in enumerate(db_loader):
            doc_repr = snrm(doc.float())
            inverted_index.add(doc_id.numpy(), doc_repr.numpy())
            if i % 10 == 0:
                print(i, ' documents indexed\r', file=sys.stderr, end='')

    inverted_index.store(args.index_file)
    print('>save index: ', args.index_file, file=sys.stderr)
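
build_index() leaves the posting-list construction to InMemoryInvertedIndex.add. A minimal sketch of the underlying idea, assuming the class keys a posting list by every non-zero dimension of the sparse document representation (ToyInvertedIndex and its layout are illustrative assumptions, not the repository's implementation):

import pickle

class ToyInvertedIndex:
    """Maps each latent dimension to a posting list of (doc_id, weight) pairs."""

    def __init__(self, dims):
        self.index = {d: [] for d in range(dims)}

    def add(self, doc_id, doc_repr):
        # only the non-zero dimensions of the sparse representation are stored
        for dim, weight in enumerate(doc_repr):
            if weight > 0.0:
                self.index[dim].append((doc_id, float(weight)))

    def store(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.index, f)

toy = ToyInvertedIndex(dims=4)
toy.add('d1', [0.0, 0.5, 0.0, 0.3])
toy.add('d2', [0.0, 0.0, 0.0, 0.4])
print(toy.index)  # {0: [], 1: [('d1', 0.5)], 2: [], 3: [('d1', 0.3), ('d2', 0.4)]}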
Example No. 3
            max_doc_len=FLAGS.max_doc_len,
            emb_dim=FLAGS.emb_dim,
            layer_size=layer_size,
            dropout_parameter=FLAGS.dropout_parameter,
            regularization_term=FLAGS.regularization_term,
            learning_rate=FLAGS.learning_rate)

#
step = 0

if not os.path.exists(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name +
                      "-inverted-index"):
    os.mkdir(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name +
             "-inverted-index")

inverted_index = InMemoryInvertedIndex(layer_size[-1])

while not check_gpu_available():
    time.sleep(1)

batch_index_id = 0
with tf.Session(graph=snrm.graph) as session:
    session.run(snrm.init)
    print('Initialized')

    model_index = "68000"  # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001"
    snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path +
                       FLAGS.run_name + model_index)  # restore all variables
    logging.info(
        'Loaded model from {:s}'.format(FLAGS.base_path + FLAGS.model_path +
                                      FLAGS.run_name + model_index))
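
Hard-coding model_index works, but tf.train.latest_checkpoint can locate the newest checkpoint in a directory instead. A hedged alternative that reuses the FLAGS, snrm, and session names from the example above (it assumes the saver wrote its "checkpoint" state file into that directory):

    # prefix of the newest checkpoint, or None if no "checkpoint" state file exists;
    # avoids hard-coding a step suffix such as "68000"
    latest = tf.train.latest_checkpoint(FLAGS.base_path + FLAGS.model_path)
    if latest is not None:
        snrm.saver.restore(session, latest)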
Example No. 4
print("creating SNRM model...")
# The SNRM model.
snrm = SNRM(dictionary=dictionary,
            pre_trained_embedding_file_name=FLAGS.base_path +
            FLAGS.pre_trained_embedding_file_name,
            batch_size=FLAGS.batch_size,
            max_q_len=FLAGS.max_q_len,
            max_doc_len=FLAGS.max_doc_len,
            emb_dim=FLAGS.emb_dim,
            layer_size=layer_size,
            dropout_parameter=FLAGS.dropout_parameter,
            regularization_term=FLAGS.regularization_term,
            learning_rate=FLAGS.learning_rate)

inverted_index = InMemoryInvertedIndex(layer_size[-1])
inverted_index.load(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name +
                    '-inverted-index-20190812.pkl')

with tf.Session(graph=snrm.graph) as session:
    session.run(snrm.init)
    print('Initialized')

    model_index = "9994000"  # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001"
    snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path +
                       FLAGS.run_name + model_index)  # restore all variables
    logging.info(
        'Loaded model from {:s}'.format(FLAGS.base_path + FLAGS.model_path +
                                      FLAGS.run_name + model_index))

    client = pymongo.MongoClient()
Example No. 5
            max_doc_len=FLAGS.max_doc_len,
            emb_dim=FLAGS.emb_dim,
            layer_size=layer_size,
            dropout_parameter=FLAGS.dropout_parameter,
            regularization_term=FLAGS.regularization_term,
            learning_rate=FLAGS.learning_rate)

#
step = 0

if not os.path.exists(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name +
                      "-inverted-index"):
    os.mkdir(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name +
             "-inverted-index")

inverted_index = InMemoryInvertedIndex(layer_size[-1])

# while not check_gpu_available():
#     time.sleep(1)

batch_index_id = 0
index_id = 0
with tf.Session(graph=snrm.graph) as session:
    session.run(snrm.init)
    print('Initialized')

    model_index = "9994000"  # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001"
    snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path +
                       FLAGS.run_name + model_index)  # restore all variables
    logging.info(
        'Loaded model from {:s}'.format(FLAGS.base_path + FLAGS.model_path +