示例#1
0
def main(argv):
    """
    Run the RBPOpt worker threads for every query of a qrel file, then
    call get_doc_prob on the produced output directory.

    Command line options:
      -j, --jfile <qrelfile>         qrel file passed to Qrel and RBPOpt
      -d, --depth <depth>            depth used to select the fit/output
                                     sub-directories (default 10)
      -c, --collection <collection>  collection directory below
                                     "testcase/" (default "rob04")
      -h, --help                     print the usage line and exit

    :param argv: command line arguments without the program name
    :return: None; exits with status 2 on an invalid command line
    """
    usage = '-j <qrelfile> -c <collection> -d <depth> -h help'
    qrelfile = ""
    depth = 10
    collection = "rob04"
    try:
        # A trailing "=" marks a long option that takes a value; without it
        # getopt rejects "--jfile=<file>" with "must not have an argument".
        opts, _ = getopt.getopt(argv, "j:d:c:h",
                                ["jfile=", "depth=", "collection=", "help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        # getopt always reports the full long option name, so compare
        # against "--depth"/"--collection" rather than abbreviations.
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-j", "--jfile"):
            qrelfile = arg
        elif opt in ("-d", "--depth"):
            depth = int(arg)
        elif opt in ("-c", "--collection"):
            collection = arg

    collection_dir = "testcase/" + collection
    rank_dir = collection_dir + "/doc_rank/"
    fit_dir = (collection_dir + "/background_gain/fit/origin/" + str(depth) +
               "/")
    out_dir = (collection_dir + "/background_gain/opt_score/" + str(depth) +
               "/")

    curr_qrel = Qrel(qrelfile)
    qid = curr_qrel.get_qid()
    p = 0.95
    for q in qid:
        fit_mat = np.loadtxt(fit_dir + str(q) + ".txt",
                             delimiter=" ",
                             dtype=float)
        # One worker per method index 1..4, each fed the matching column of
        # the fitted matrix.  The list is rebuilt per query so that threads
        # already joined for earlier queries are not joined again.
        workers = []
        for i in range(1, 5):
            worker = RBPOpt(p, 1000, q, qrelfile, fit_mat[:, i], rank_dir, i,
                            out_dir)
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    get_doc_prob(qid, out_dir, rank_dir, fit_dir)
示例#2
0
 def __init__(self, qrelname, d, gname, gidx):
     """
     Set up the estimator from a qrel file.

     :param qrelname: qrel name, handed to Qrel
     :param d: considered pooling depth
     :param gname: group name; only stored when gidx is not empty
     :param gidx: group indices; when not empty they are stored as a numpy
                  array and the instance is flagged as "out" (_isout)
     """
     self._d = d
     qrel = Qrel(qrelname)
     self._qrel = qrel
     self._qid = qrel.get_qid()
     has_group = len(gidx) > 0
     self._isout = has_group
     if has_group:
         # the group attributes only exist when a group was supplied
         self._gname = gname
         self._gidx = np.array(gidx)
示例#3
0
def main(argv):
    """
    Run four HybridOpt worker threads per query and pass their results to
    get_doc_prob.

    Command line options:
      -j, --jfile <qrelfile>         qrel file passed to Qrel
      -d, --depth <depth>            depth used to select the fit/output
                                     sub-directories (default 10)
      -c, --collection <collection>  collection directory below
                                     "testcase/" (default "robust")
      -h, --help                     print the usage line and exit

    :param argv: command line arguments without the program name
    :return: None; exits with status 2 on an invalid command line
    """
    # The usage line only lists options the parser really accepts.
    usage = '-j <qrelfile> -c <collection> -d <depth> -h help'
    qrelfile = ""
    depth = 10
    collection = "robust"

    try:
        # A trailing "=" marks a long option that takes a value.
        opts, _ = getopt.getopt(argv, "j:d:hc:",
                                ["jfile=", "depth=", "collection=", "help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-j", "--jfile"):
            qrelfile = arg
        elif opt in ("-d", "--depth"):
            depth = int(arg)
        elif opt in ("-c", "--collection"):
            collection = arg

    collection_dir = "testcase/" + collection
    rank_dir = collection_dir + "/doc_rank/"
    # NOTE(review): fit_dir is computed but not used below; kept so the
    # directory layout stays documented next to its siblings.
    fit_dir = (collection_dir + "/background_gain/fit/origin/" + str(depth) +
               "/")
    out_dir = (collection_dir + "/background_gain/sample_rbp/hybrid/" +
               str(depth) + "/")

    curr_qrel = Qrel(qrelfile)
    for q in curr_qrel.get_qid():
        rel = curr_qrel.get_rel_by_qid(q)
        # Workers are collected per query so that threads already joined
        # for earlier queries are not joined again.
        workers = []
        for i in range(0, 4):
            worker = HybridOpt(0.95, 1000, q, rel, out_dir, rank_dir,
                               (depth, i))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
        # each worker exposes its result as .res once it has finished
        w_param = [worker.res for worker in workers]
        get_doc_prob(out_dir, w_param, q, depth)
示例#4
0
 def __init__(self, p, d, q, qrelname, fitted_vec, rank_dir, method, out_dir,
              is_binary=True):
     """
     Prepare the optimisation thread for one query and one method.

     Reads "<rank_dir><q>-rank.txt" and accumulates, for every run, the
     discounted gain of the documents ranked above depth d.
     :param p: persistence value used as the base of the rank discount
     :param d: considered pooling depth.
     :param q: qid.
     :param qrelname: qrel name
     :param fitted_vec: fitted vector for the method, indexed by rank
     :param rank_dir: dir of rank mat
     :param method: method idx, used in the output file names
     :param out_dir: output dir
     :param is_binary: when True any positive relevance counts as 1
     """
     threading.Thread.__init__(self)
     file_suffix = str(method) + ".txt"
     self._outname = out_dir + "opt-weight-" + file_suffix
     self._rmse = out_dir + "opt-rmse-" + file_suffix
     self._k = d
     self._q = q
     self._qrel = Qrel(qrelname).get_rel_by_qid(q)
     self._p = p
     rank_by_doc, self._runnum = futils.read_csv_to_dict(
         rank_dir + str(q) + "-rank.txt", is_prob=False)
     self._rank_bg = fitted_vec
     self._rbp = np.zeros(self._runnum)
     self._bg_vectors = np.zeros((self._k, self._runnum, self._runnum))
     self._bg_rbp = np.zeros((self._k, self._runnum))
     self._binary = is_binary
     # walk the rank matrix document by document
     for doc_id, ranks in rank_by_doc.iteritems():
         rank_vec = np.array(ranks)  # numpy array so it can index _rank_bg
         judged = doc_id in self._qrel
         gain = 0
         if judged and self._qrel[doc_id] > 0:
             gain = 1 if self._binary else self._qrel[doc_id]
         # skip documents that no run places above the considered depth
         if not (min(rank_vec) < self._k and max(rank_vec) > -1):
             continue
         bg_row = self._rank_bg[rank_vec]
         for run_idx, rank in enumerate(rank_vec):
             if not 0 <= rank < self._k:
                 continue
             discounted = gain * np.power(self._p, rank)
             self._rbp[run_idx] += discounted
             self._bg_rbp[rank, run_idx] = discounted
             if judged:
                 # judged documents carry the fitted vector
                 self._bg_vectors[rank, run_idx, :] = bg_row
示例#5
0
def main(argv):
    """
    Build Qres objects for every run named in a run list and dump either
    their ranks or their rank/relevance information through DumpRank.

    Command line options:
      -r, --runlist <runlist>        file with one run name per line
      -c, --collection <collection>  prefix prepended as-is to every run
                                     name
      -j, --qrelfile <qrelfile>      qrel file passed to Qrel
                                     (--jfile is accepted as an alias)
      -d, --depth <pooling depth>    depth handed to dump_rel (default 100)
      -b, --backgain                 call dump_rel("rank_rel/", depth)
                                     instead of dump_rank("doc_rank/")
      -h, --help                     print the usage line and exit

    :param argv: command line arguments without the program name
    :return: None; exits with status 2 on an invalid command line
    """
    usage = ('-r <runlist> -c <collection> -j <qrelfile> '
             '-d <pooling depth> -b backgain -h help')
    runfile = ""
    qrelfile = ""
    collection = ""
    dump_rel = False
    pool_depth = 100
    try:
        # Every entry needs its own comma (adjacent string literals are
        # silently concatenated) and a trailing "=" when it takes a value.
        opts, _ = getopt.getopt(
            argv, "r:c:j:d:bh",
            ["runlist=", "collection=", "qrelfile=", "jfile=", "depth=",
             "backgain", "help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-j", "--qrelfile", "--jfile"):
            qrelfile = arg
        elif opt in ("-c", "--collection"):
            collection = arg
        elif opt in ("-r", "--runlist"):
            runfile = arg
        elif opt in ("-b", "--backgain"):
            dump_rel = True
        elif opt in ("-d", "--depth"):
            pool_depth = int(arg)

    run_dir = collection  # directory (prefix) of the runs
    qrels = Qrel(qrelfile)
    runlist = []
    # Text mode so the run names are str and can be appended to run_dir.
    with open(runfile, 'r') as fin:
        for rname in fin:
            rname = rname.strip()
            if rname:  # ignore blank lines in the run list
                runlist.append(Qres(run_dir + rname))
    # start to dump rank
    dump_rank = DumpRank(runlist=runlist, qrels=qrels)
    if dump_rel:
        dump_rank.dump_rel("rank_rel/", pool_depth)
    else:
        dump_rank.dump_rank("doc_rank/")
示例#6
0
def sep_stratum(first_stratum, fname):
    """
    Separate the second stratum out of a two-strata sampled qrels file.

    For every query of the first-stratum qrels, each judgment of the
    sampled qrels whose document is not judged in the first stratum is
    written as "<qid> 0 <doc> <rel>" to the file fname + "-2s".
    :param first_stratum: qrel file holding the first stratum
    :param fname: qrel file holding the sampled judgments of both strata
    :return: None
    """
    base_qrel = Qrel(first_stratum)
    sampled_qrel = Qrel(fname)
    second_stratum = []
    for topic in base_qrel.get_qid():
        known = base_qrel.get_rel_by_qid(topic)
        sampled = sampled_qrel.get_rel_by_qid(topic)
        for doc, rel in sampled.iteritems():
            if doc in known:
                continue  # belongs to the first stratum
            second_stratum.append(str(topic) + " 0 " + doc + " " + str(rel))
    with open(fname + "-2s", "w") as fout:
        for entry in second_stratum:
            fout.write(entry + "\n")
示例#7
0
class NaiveEstimator:
    """
    calculate sample coverage.
    Try to estimate total number of
    relevant documents using Chao92

    Every query is read from "<rank_dir><qid>-rank.txt": comma separated
    lines whose first field is the document id and whose remaining fields
    are integer ranks (presumably one column per run, negative meaning
    "not retrieved" - confirm against the code writing these files).
    """
    def __init__(self, qrelname, d, gname, gidx):
        """
        init Chao92 estimator
        :param qrelname: qrel name
        :param d: considered pooling depth
        :param gname: group name, used as output file prefix when gidx is
                      not empty
        :param gidx: rank-matrix column indices of the group; avg_sim_all
                     deletes these columns before computing its statistics
        """
        self._d = d
        self._qrel = Qrel(qrelname)
        self._qid = self._qrel.get_qid()
        self._isout = False
        if len(gidx) > 0:
            self._gname = gname
            self._gidx = np.array(gidx)
            self._isout = True

    def _output_name(self, out_dir, suffix):
        """Return the output path, prefixed by the group name if any."""
        fname = str(self._d) + suffix
        if self._isout:
            fname = self._gname + "-" + fname
        return out_dir + fname

    @staticmethod
    def _binarize(rank_mat, depth):
        """Return a 0/1 matrix: 1 where 0 <= rank < depth, 0 elsewhere."""
        return ((rank_mat >= 0) & (rank_mat < depth)).astype(int)

    @staticmethod
    def _pairwise_jaccard(bin_mat):
        """
        Sum the Jaccard similarity over every unordered pair of columns.
        A pair whose union is empty contributes 0.
        """
        total = 0.0
        run_num = bin_mat.shape[1]
        for a in range(run_num - 1):
            col_a = bin_mat[:, a]
            for b in range(a + 1, run_num):
                col_b = bin_mat[:, b]
                union = np.sum(col_a | col_b)
                if union > 0:
                    # explicit floats: "/" on integers floors on Python 2
                    total += float(np.sum(col_a & col_b)) / float(union)
        return total

    @staticmethod
    def _coverage_stats(bin_mat):
        """
        Return (tot_sample, hat_c, sum_ff) of a 0/1 matrix, where
        tot_sample is the number of ones, hat_c = 1 - f1 / tot_sample with
        f1 the number of rows summing to exactly 1 (0.0 when the matrix
        holds no ones), and sum_ff the sum of f * (f - 1) over row sums f.
        """
        f_stat = np.sum(bin_mat, axis=1)
        tot_sample = float(np.sum(f_stat))
        sum_ff = float(np.sum(f_stat * (f_stat - 1)))
        if tot_sample > 0:
            hat_c = 1.0 - int(np.sum(f_stat == 1)) / tot_sample
        else:
            hat_c = 0.0
        return tot_sample, hat_c, sum_ff

    def naive_estimator(self, rank_dir, out_dir):
        """
        Compute, over the relevant documents of every query, the average
        pairwise Jaccard similarity, the sample coverage and a Chao92 style
        estimate of the number of relevant documents, and save them.

        Output columns: qid, jaccard, hat_c, gamma, est_N, tot_N; the file
        is "<out_dir>[<gname>-]<d>-rsim.txt".  Queries without relevant
        documents in their rank file get an all-zero row.
        :param rank_dir: directory of the "<qid>-rank.txt" files
        :param out_dir: output directory (used as a plain string prefix)
        :return: None
        """
        jaccard_mat = np.zeros((len(self._qid), 6))
        jaccard_mat[:, 0] = self._qid
        for row, qid in enumerate(self._qid):
            curr_rel = self._qrel.get_rel_by_qid(qid)
            rank_file = rank_dir + str(qid) + "-rank.txt"
            doc_idx = []  # row numbers of the relevant documents
            col_num = 0
            # text mode so the fields are str on Python 2 and Python 3
            with open(rank_file, "r") as fin:
                for i, line in enumerate(fin):
                    fields = line.strip().split(",")
                    if col_num == 0:
                        col_num = len(fields)
                    if fields[0] in curr_rel and curr_rel[fields[0]] > 0:
                        doc_idx.append(i)
            run_num = col_num - 1  # first column is the document id
            if not doc_idx or run_num < 1:
                continue  # the row keeps its zeros
            # ndmin=2 keeps a 2-D matrix for single-run / single-row files
            rank_mat = np.loadtxt(rank_file,
                                  usecols=list(range(1, col_num)),
                                  delimiter=",",
                                  dtype=int,
                                  ndmin=2)
            bin_mat = self._binarize(rank_mat[np.array(doc_idx), :], self._d)
            k = len(doc_idx)
            tot_sample, hat_c, sum_ff = self._coverage_stats(bin_mat)
            if hat_c == 0:
                gamma = 0.0
                est_n = k
            else:
                # hat_c != 0 implies tot_sample >= 2, so no zero divisor
                tmp_val = (k / hat_c) / ((tot_sample - 1) * tot_sample)
                gamma_sq = max(sum_ff * tmp_val - 1, 0)
                est_n = (k / hat_c) + \
                    gamma_sq * (tot_sample * (1 - hat_c)) / hat_c
                gamma = np.sqrt(gamma_sq) / run_num
            # Normalise with this query's own run count instead of whatever
            # column count the last processed query happened to leave behind.
            pair_num = (run_num - 1) * run_num * 0.5
            if pair_num > 0:
                jaccard_mat[row, 1] = \
                    self._pairwise_jaccard(bin_mat) / pair_num
            jaccard_mat[row, 2] = hat_c / run_num  # average to per run
            jaccard_mat[row, 3] = gamma
            jaccard_mat[row, 4] = est_n
            jaccard_mat[row, 5] = k
        np.savetxt(fname=self._output_name(out_dir, "-rsim.txt"),
                   X=jaccard_mat,
                   fmt="%.4f",
                   delimiter=",",
                   header="qid, jaccard, hat_c, gamma, est_N, tot_N")

    def avg_sim_all(self, rank_dir, out_dir, k=1000):
        """
        Calculate different similarity measurement over all documents of
        every query and save them.

        Output columns: qid, jaccard, hat_c, gamma; the file is
        "<out_dir>[<gname>-]<d>-asim.txt" and its name is printed.
        :param rank_dir: directory of the "<qid>-rank.txt" files
        :param out_dir: output directory (used as a plain string prefix)
        :param k: considered depth
        :return: None
        """
        jaccard_mat = np.zeros((len(self._qid), 4))
        jaccard_mat[:, 0] = self._qid
        for row, qid in enumerate(self._qid):
            rank_file = rank_dir + str(qid) + "-rank.txt"
            with open(rank_file, "r") as fin:
                col_num = len(fin.readline().strip().split(","))
            rank_mat = np.loadtxt(rank_file,
                                  usecols=list(range(1, col_num)),
                                  delimiter=",",
                                  dtype=int,
                                  ndmin=2)
            if self._isout:
                rank_mat = np.delete(rank_mat, self._gidx, axis=1)
            bin_mat = self._binarize(rank_mat, k)
            run_num = bin_mat.shape[1]
            if run_num < 1:
                continue  # nothing left to compare; the row keeps its zeros
            pair_num = (run_num - 1) * run_num * 0.5
            if pair_num > 0:
                jaccard_mat[row, 1] = \
                    self._pairwise_jaccard(bin_mat) / pair_num
            tot_sample, hat_c, sum_ff = self._coverage_stats(bin_mat)
            gamma = 0.0
            if hat_c != 0:
                # hat_c != 0 implies tot_sample >= 2, so no zero divisor
                tmp_val = (bin_mat.shape[0] / hat_c) / (
                    (tot_sample - 1) * tot_sample)
                gamma = np.sqrt(max(sum_ff * tmp_val - 1, 0)) / run_num
            jaccard_mat[row, 2] = hat_c / run_num  # average to per run
            # NOTE(review): gamma is divided by the run count a second time
            # here (naive_estimator divides it once); kept so the output
            # stays comparable with existing files - confirm the intent.
            jaccard_mat[row, 3] = gamma / run_num
        fname = self._output_name(out_dir, "-asim.txt")
        print(fname)
        np.savetxt(fname=fname,
                   X=jaccard_mat,
                   fmt="%.4f",
                   delimiter=",",
                   header="qid, jaccard, hat_c, gamma")