示例#1
0
    def eval_by_qid_list_helper(self, qid_list, pair_generator):
        '''Separate re-rankable queries from those kept as-is and build the batch.

        Queries whose supervised document list is shorter than
        ``self.config.nb_supervised_doc`` are not re-ranked: a ``Result`` is
        built directly from their stored supervised scores. All remaining
        queries are handed to ``pair_generator.generate_list_batch``.

        Args:
          qid_list: iterable of query ids; processed in sorted order.
          pair_generator: object providing
            ``generate_list_batch(qid_list, rerank_topk)``.

        Returns:
          A 4-tuple ``([dd_q, dd_d, score_gate], len_indicator, res_dict,
          qualified_qid_list)`` where ``res_dict`` maps the skipped query ids
          to their ``Result`` and ``qualified_qid_list`` holds the query ids
          that were passed to the generator.
        '''
        relevance_dict = load_pickle(self.config.relevance_dict_path)

        qualified_qid_list = []
        res_dict = OrderedDict()
        for qid in sorted(qid_list):
            relevance = relevance_dict.get(qid)
            supervised_docid_list = relevance.get_supervised_docid_list()

            if len(supervised_docid_list) >= self.config.nb_supervised_doc:
                qualified_qid_list.append(qid)
                continue

            # Too few supervised documents to construct the d2d feature, so
            # the stored ranking is reported unchanged.
            score_list = relevance.get_supervised_score_list()
            res_dict[qid] = Result(qid, supervised_docid_list, score_list,
                                   self.config.runid)
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and emits the same record.
            logging.warning("query {0} not to be rerank".format(qid))

        # generate re-rank inputs for the qualified queries only
        dd_q, dd_d, score_gate, len_indicator = \
            pair_generator.generate_list_batch(qualified_qid_list,
                                               self.config.rerank_topk)

        return [dd_q, dd_d,
                score_gate], len_indicator, res_dict, qualified_qid_list
    def __init__(self,
                 relevance_dict_path,
                 dd_q_feature_path,
                 dd_d_feature_path,
                 sample_perquery_limit,
                 sample_total_limit,
                 query_maxlen=9,
                 doc_topk_term=30,
                 nb_supervised_doc=20,
                 hist_size=30,
                 batch_size=32,
                 shuffle=True):
        '''Set up the generator and load the dd query-side features.

        Args:
          relevance_dict_path: forwarded to the parent constructor.
          dd_q_feature_path: pickle file loaded into ``self.dd_q_gating_dict``.
          dd_d_feature_path: stored as-is on the instance (not loaded here).
          sample_perquery_limit: forwarded to the parent constructor.
          sample_total_limit: forwarded to the parent constructor.
          query_maxlen: stored on the instance.
          doc_topk_term: stored on the instance.
          nb_supervised_doc: stored on the instance.
          hist_size: stored on the instance.
          batch_size: forwarded to the parent constructor.
          shuffle: forwarded to the parent constructor.
        '''
        # The parent takes its arguments positionally in this exact order.
        base = super(NPRFDRMMPairGenerator, self)
        base.__init__(relevance_dict_path, batch_size, shuffle,
                      sample_perquery_limit, sample_total_limit)

        # Plain configuration values.
        self.query_maxlen, self.doc_topk_term = query_maxlen, doc_topk_term
        self.nb_supervised_doc, self.hist_size = nb_supervised_doc, hist_size

        # Query-side features are loaded eagerly; the document-side features
        # are only referenced by path.
        self.dd_q_gating_dict = load_pickle(dd_q_feature_path)
        self.dd_d_feature_path = dd_d_feature_path
示例#3
0
def hist_d2d(relevance_file,
             text_max_len,
             hist_size,
             sim_path,
             hist_path,
             d2d=False,
             processes=14):
    '''Run ``hist_per_query`` for every query in the relevance dict, in parallel.

    Args:
      relevance_file: pickle file holding the relevance dict; its keys are
        the query ids to process.
      text_max_len: forwarded to ``hist_per_query``.
      hist_size: forwarded to ``hist_per_query``.
      sim_path: forwarded to ``hist_per_query``.
      hist_path: forwarded to ``hist_per_query``.
      d2d: forwarded to ``hist_per_query``.
      processes: number of worker processes given to ``poolcontext``.
        Defaults to 14, the value that used to be hard-coded.

    Returns:
      None. Progress is reported through ``logging``.
    '''
    relevance_dict = load_pickle(relevance_file)
    # Snapshot the ids as a list so the pool receives a plain sequence.
    qid_list = list(relevance_dict.keys())

    # Everything except the query id is fixed across tasks.
    worker = partial(hist_per_query, relevance_dict, text_max_len, hist_size,
                     sim_path, hist_path, d2d)
    with poolcontext(processes=processes) as pool:
        pool.map(worker, qid_list)
    logging.info("Finish all!")
示例#4
0
def hist_per_query(relevance_dict, text_max_len, hist_size, sim_path,
                   hist_path, d2d, qid):
    '''Convert the similarity files of one query into histogram ``.npy`` files.

    For each of the first 1000 supervised documents of ``qid`` the matching
    similarity file under ``sim_path/<qid>/`` is read, turned into a
    histogram with ``hist_from_matrix`` and saved under the directory returned
    by ``make_directory(hist_path, str(qid))``. Documents whose histogram file
    already exists are skipped, so an interrupted run can be resumed.

    Args:
      relevance_dict: mapping from query id to a relevance object exposing
        ``get_supervised_docid_list()``.
      text_max_len: forwarded to ``hist_from_matrix``; also the second
        dimension of the stacked array in d2d mode.
      hist_size: forwarded to ``hist_from_matrix``; also the last dimension
        of the stacked array in d2d mode.
      sim_path: root directory of the similarity files.
      hist_path: root directory for the histogram output.
      d2d: if true, each similarity file is a pickle holding a sequence of
        matrices and one stacked float32 array is written per document;
        otherwise each similarity file is a single ``.npy`` matrix.
      qid: the query id to process.

    Returns:
      None.
    '''
    hist_output_dir = make_directory(hist_path, str(qid))
    relevance = relevance_dict.get(qid)

    # Histograms are produced for the first 1000 supervised documents. An
    # earlier variant (top 500 plus judged documents ranked lower) was already
    # disabled; its unused candidate/waitlist computation has been removed.
    useful_docid_list = relevance.get_supervised_docid_list()[:1000]

    sim_dir = os.path.join(sim_path, str(qid))
    sim_extension = 'pickle' if d2d else 'npy'

    for docid in useful_docid_list:
        stem = 'q{0}_d{1}'.format(qid, docid)
        hist_file_name = os.path.join(hist_output_dir, stem + '.npy')
        if os.path.exists(hist_file_name):
            # Already produced by a previous run.
            continue

        sim_file_name = os.path.join(sim_dir,
                                     '{0}.{1}'.format(stem, sim_extension))
        if d2d:
            sim_list = load_pickle(sim_file_name)
            hist_array = np.zeros((len(sim_list), text_max_len, hist_size),
                                  dtype=np.float32)
            for i, sim_mat in enumerate(sim_list):
                # Only the first 20000 columns of each matrix are used.
                hist_array[i] = hist_from_matrix(text_max_len, hist_size,
                                                 sim_mat[:, :20000])
            np.save(hist_file_name, hist_array)
        else:
            sim_mat = np.load(sim_file_name)
            np.save(hist_file_name,
                    hist_from_matrix(text_max_len, hist_size, sim_mat))
    logging.info("Finish for topic {0}".format(qid))
示例#5
0
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file,
                           topk_corpus_file, embedding_file, stop_file,
                           sim_output_path, kernel_output_path, kernel_mu_list,
                           kernel_sigma_list, topk_supervised, d2d, test):
    '''Simultaneously compute similarity matrix and RBF kernel features

    Loads all shared resources once, then calls
    ``sim_mat_and_kernel_per_query`` for every query id in the relevance dict
    and prints the elapsed seconds for each query.

  Args:
    relevance_file: A dumped relevance dict file
    topic_file: a single line format topic file. format: qid term1 term2 ...
    corpus_file: corpus corresponding to docnolist file. format: docno\tdoclen\tterm1 term2
    topk_corpus_file: corpus that contain only the topk terms for each document, format: same as corpus_file
    embedding_file: output file from word2vec toolkit in binary format (loaded with binary=True)
    stop_file: a stopword list file, one word per line
    sim_output_path: forwarded to sim_mat_and_kernel_per_query
    kernel_output_path: forwarded to sim_mat_and_kernel_per_query
    kernel_mu_list: forwarded to sim_mat_and_kernel_per_query
    kernel_sigma_list: forwarded to sim_mat_and_kernel_per_query
    topk_supervised: number of top-n documents for each query
    d2d: True for NPRF, False for simple query-document matching used by e.g. DRMM, K-NRM
    test: control the temporary output. Set false

  Returns:
    None

  '''
    import time

    relevance_dict = load_pickle(relevance_file)
    topic_dict = parse_topic(topic_file)
    corpus = parse_corpus(corpus_file)
    topk_corpus = parse_corpus(topk_corpus_file)

    embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    stoplist = parse_stoplist(stop_file)

    # Every query is processed. A stray unconditional `break` used to end the
    # loop after the first query, leaving the remaining queries without
    # features.
    for qid in relevance_dict.keys():
        start = time.time()
        sim_mat_and_kernel_per_query(relevance_dict, topic_dict, corpus,
                                     topk_corpus, embeddings, stoplist,
                                     sim_output_path, kernel_output_path,
                                     kernel_mu_list, kernel_sigma_list,
                                     topk_supervised, d2d, test, qid)
        # seconds spent on this query
        print(time.time() - start)
示例#6
0
def parse_idf_for_document(relevance_file,
                           df_file,
                           document_file,
                           output_file,
                           rerank_topk=500,
                           doc_topk_term=30,
                           nb_doc=528155):
    '''Get the idf weight for top k terms in each document

    For every query in the relevance dict, the first ``rerank_topk``
    supervised documents are passed to ``parse_idf_per_query``; the per-query
    results are collected in query order and pickled to ``output_file``.

  Args:
    relevance_file: pickle file holding the relevance dict
    df_file: file read by df_map_from_file
    document_file: corpus file read by parse_corpus
    output_file: destination of the pickled qid -> idf mapping
    rerank_topk: how many leading supervised documents to keep per query
    doc_topk_term: forwarded to parse_idf_per_query
    nb_doc: forwarded to parse_idf_per_query

  Returns:
    None

  '''
    queries = load_pickle(relevance_file)
    doc_freqs = df_map_from_file(df_file)
    term_corpus = parse_corpus(document_file)

    per_query_idf = OrderedDict()
    for query_id, rel in queries.items():
        logging.info("query {0}".format(query_id))
        top_docids = rel.get_supervised_docid_list()[:rerank_topk]
        per_query_idf[query_id] = parse_idf_per_query(top_docids, doc_freqs,
                                                      doc_topk_term,
                                                      term_corpus, nb_doc)

    save_pickle(per_query_idf, output_file)
示例#7
0
    # print(n)
    # qid_list = np.ones((5, 30), dtype=np.int)
    # for i in range(30):
    #   l = np.random.permutation(m[5*i: 5*(i+1)])
    #   qid_list[:, i] = l
    #
    # qid_list = np.sort(qid_list)
    # print(qid_list.tolist())

    # Hard-coded, machine-specific directory that the input and output files
    # below are resolved against.
    global_info_path = "/home/lcj/data/desc.disk12/features/global.info"
    # Keyword arguments unpacked into parse_idf_for_query below.
    idf_params = {
        'df_file':
        os.path.join(global_info_path, 'disk12.dfcf.txt'),
        'topic_file':
        os.path.join(global_info_path, 'disk12.desc.porter.morefilter.txt'),
        'output_file':
        os.path.join(global_info_path, 'desc.idf.pickle'),
        'maxlen':
        24,
        'nb_doc':
        741856
    }

    # Run the query idf computation, then load the pickle at 'output_file'
    # (presumably written by parse_idf_for_query -- confirm) and print it as a
    # quick visual check.
    parse_idf_for_query(**idf_params)
    idf = load_pickle(idf_params['output_file'])
    print(idf)

    #
    # get_query_length("/media/klaas/data/collection/clue_final/topic/clueweb.desc.porter.txt")
    # get_query_length("/media/klaas/data/collection/disk12/disk12.desc.porter.morefilter.txt")
示例#8
0
    def train(self, model, pair_generator, fold, output_file, use_nprf=False):
        '''Driver function for training

        Training proceeds in "iterations" of 100 generator steps. After each
        iteration the model is evaluated on the validation and the test
        partition; the test run of every iteration is written to
        ``fold<fold>.iter<iteration>.res`` under ``self.config.result_path``.
        At the end only the file of the best iteration (as chosen by
        ``self._extract_max_metric``) is retained via ``retain_file``.

    Args:
      model: a keras Model
      pair_generator: an instantiated pair generator
      fold: which fold to run. partitions will be automatically rotated.
      output_file: temporary file for validation
      use_nprf: whether to use nprf

    Returns:
      The second element returned by ``self._extract_max_metric`` for the
      collected validation/test metrics.

    '''
        # set tensorflow not to use the full GPU memory
        session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))

        # qid list config
        qid_list = deque(self.config.qid_list)
        # deque.rotate works in place and returns None. It used to be wrapped
        # in a map() call that received that None as its function; the
        # rotation itself is all that is needed.
        qid_list.rotate(fold - 1)
        train_qid_list = qid_list[0] + qid_list[1] + qid_list[2]
        valid_qid_list, test_qid_list = qid_list[3], qid_list[4]
        print(train_qid_list, valid_qid_list, test_qid_list)

        relevance_dict = load_pickle(self.config.relevance_dict_path)
        nb_pair_train = pair_generator.count_pairs_balanced(
            train_qid_list, self.config.pair_sample_size)

        # The evaluation inputs do not depend on the model, so they are built
        # once and reused after every training iteration.
        valid_params = self.eval_by_qid_list_helper(valid_qid_list,
                                                    pair_generator)
        test_params = self.eval_by_qid_list_helper(test_qid_list,
                                                   pair_generator)
        print(valid_params[-1], test_params[-1])

        # Keyword arguments shared by every evaluation call; only
        # 'output_file' differs between validation and test.
        eval_kwargs = {
            'model': model,
            'relevance_dict': relevance_dict,
            'rerank_topk': self.config.rerank_topk,
            'qrels_file': self.config.qrels_file,
            'docnolist_file': self.config.docnolist_file,
            'runid': self.config.runid,
            'output_file': output_file
        }
        if use_nprf:
            eval_kwargs.update({
                'nb_supervised_doc': self.config.nb_supervised_doc,
                'doc_topk_term': self.config.doc_topk_term,
            })

        # One iteration is a fixed number of generator steps; the number of
        # iterations per epoch follows from the number of training pairs.
        steps_per_iteration = 100
        nb_batch = nb_pair_train / self.config.batch_size
        nb_iteration_per_epoch = int(nb_batch / steps_per_iteration)

        batch_logger = NBatchLogger(50)
        batch_losses = []
        # met[0:3] -> validation MAP / P20 / NDCG20 per iteration,
        # met[3:6] -> the same three metrics on the test partition.
        met = [[], [], [], [], [], []]
        iteration = -1
        for i in range(self.config.nb_epoch):
            print("Epoch " + str(i))

            train_generator = pair_generator.generate_pair_batch(
                train_qid_list, self.config.pair_sample_size)
            for j in range(nb_iteration_per_epoch):
                iteration += 1
                history = model.fit_generator(
                    generator=train_generator,
                    steps_per_epoch=steps_per_iteration,
                    epochs=1,
                    shuffle=False,
                    verbose=0,
                    callbacks=[batch_logger],
                )
                batch_losses.append(batch_logger.losses)
                print("[Iter {0}]\tLoss: {1}".format(iteration,
                                                     history.history['loss']))

                valid_met = self.eval_by_qid_list(*valid_params,
                                                  **eval_kwargs)
                print("[Valid]\t\tMAP\tP20\tNDCG20")
                print("\t\t{0}\t{1}\t{2}".format(valid_met[0], valid_met[1],
                                                 valid_met[2]))
                for k in range(3):
                    met[k].append(valid_met[k])

                # The test run of each iteration gets its own result file so
                # that the best one can be retained afterwards.
                test_kwargs = dict(eval_kwargs)
                test_kwargs['output_file'] = os.path.join(
                    self.config.result_path,
                    "fold{0}.iter{1}.res".format(fold, iteration))
                test_met = self.eval_by_qid_list(*test_params, **test_kwargs)
                print("[Test]\t\tMAP\tP20\tNDCG20")
                print("\t\t{0}\t{1}\t{2}".format(test_met[0], test_met[1],
                                                 test_met[2]))
                for k in range(3):
                    met[3 + k].append(test_met[k])

            print("[Attention]\t\tCurrent best iteration {0}\n".format(
                met[0].index(max(met[0]))))
            if iteration > self.config.max_iteration:
                break

        best_iter, eval_met = self._extract_max_metric(met)
        retain_file(self.config.result_path, "fold{0}".format(fold),
                    "fold{0}.iter{1}.res".format(fold, best_iter))
        return eval_met