def get_last_top_document(trec_reader: TrecReader, qid):
    """
    Return the doc id that topped the previous round for ``qid``, unless its
    author also tops the current round.

    :param trec_reader: reader that exposes ``num_epochs()``, iterates over its
        epochs and supports ``trec_reader[epoch][qid]`` -> ranked list of doc ids
    :param qid: query id
    :return: the previous round's top doc id, or None when there are fewer than
        two epochs or when both rounds' top documents share the same third
        ``parse_doc_id`` component (unpacked as ``pid`` elsewhere in this file)
    """
    # With fewer than two epochs there is no "previous" round to compare with;
    # this also keeps the two-value unpacking below from failing on an empty reader.
    if trec_reader.num_epochs() < 2:
        return None

    # The two greatest epochs in sort order are the previous and current rounds.
    last_round, current_round = sorted(trec_reader)[-2:]

    last_top_doc_id = trec_reader[last_round][qid][0]
    current_top_doc_id = trec_reader[current_round][qid][0]

    if parse_doc_id(last_top_doc_id)[2] == parse_doc_id(current_top_doc_id)[2]:
        return None

    return last_top_doc_id
示例#2
0
 def get_pids(self, qid):
     """
     Collect the player ids of the documents listed for ``qid`` in the
     smallest epoch known to this reader.

     :param qid: query id
     :return: list of player ids, in the order the documents are stored
     """
     first_epoch = min(self.__epochs)
     pids = []
     for ranked_doc_id in self.__ranked_list[first_epoch][qid]:
         # the player id is the third component of a parsed doc id
         pids.append(utils.parse_doc_id(ranked_doc_id)[2])
     return pids
def create_initial_trectext_file(trectext_file, output_dir, qid, bots, only_bots):
    """
    Build the trectext file a competition starts from.

    Keeps the documents of ``trectext_file`` whose DOCNO parses to epoch '01'
    and query ``qid`` (and, when ``only_bots`` is set, whose pid is in
    ``bots``), and writes their sentence-tokenized text under fresh doc ids.

    :param trectext_file: source trectext file, parsed in recovering XML mode
    :param output_dir: prefix of the output path (concatenated as-is)
    :param qid: query id to keep
    :param bots: pids of the bots; also joined into the output file name
    :param only_bots: when truthy, keep only documents whose pid is in ``bots``
    :return: path of the created trectext file
    """
    logger = logging.getLogger(sys.argv[0])

    target_path = output_dir + 'documents_{}_{}.trectext'.format(qid, ','.join(bots))
    ensure_dirs(target_path)

    recovering_parser = etree.XMLParser(recover=True)
    root = ET.parse(trectext_file, parser=recovering_parser).getroot()

    first_round_docs = {}
    for doc in root:
        pid = None
        for child in doc:
            tag = child.tag
            if tag == 'TEXT':
                sentences = sent_tokenize(child.text)
                first_round_docs[get_doc_id(1, qid, pid)] = '\n'.join(sentences)
                continue
            if tag != 'DOCNO':
                continue

            epoch, doc_qid, pid = parse_doc_id(child.text)
            keep = epoch == '01' and doc_qid == qid and (not only_bots or pid in bots)
            if not keep:
                # skip the remaining children of this document, including its TEXT
                break
            pid = pid.replace('_', '')

    create_trectext_file(first_round_docs, target_path)
    logger.info('Competition trectext file created')
    return target_path
def get_rankings(trec_file, bot_ids, qid, epoch):
    """
    Split the ranking of one query in one round into bots and students.

    :param trec_file: a trecfile; the third whitespace-separated column of each line is a doc id
    :param bot_ids: the pids of the players who are bots
    :param qid: query id
    :param epoch: current round (zero-padded to two digits before it is compared)
    :return: two dictionaries of the form {pid: location}, one for the bots and the other for the
        students; locations are 0-based and count only the lines matching ``qid`` and ``epoch``
    """
    bots = {}
    students = {}
    epoch = str(epoch).zfill(2)
    rank = 0
    with open(trec_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank lines (e.g. a trailing newline) carry no ranking entry
                continue
            last_epoch, last_qid, pid = parse_doc_id(fields[2])
            if last_epoch != epoch or last_qid != qid:
                continue
            if pid in bot_ids:
                bots[pid] = rank
            else:
                students[pid] = rank
            rank += 1
    return bots, students
def get_ranked_competitors_list(trec_file, current_epoch):
    """
    List the competitors of one round in the order they appear in a trec file.

    :param trec_file: a trecfile; the first column ends with a two-digit epoch and the
        third column is a doc id
    :param current_epoch: the round to extract, as an int
    :return: list with the third ``parse_doc_id`` component of every doc id of that round,
        in file order
    """
    competitors_ranked_list = []
    with open(trec_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank lines (e.g. a trailing newline) carry no ranking entry
                continue
            # the epoch is the last two characters of the first column
            if int(fields[0][-2:]) != current_epoch:
                continue
            competitors_ranked_list.append(parse_doc_id(fields[2])[2])
    return competitors_ranked_list
示例#6
0
    def __read_trec_file(self, trec_file):
        """
        Load a trec file into an ``{epoch: {qid: [doc_id, ...]}}`` mapping.

        Doc ids are appended in file order. Every epoch and qid seen is also
        recorded in ``self.__epochs`` and ``self.__queries``.

        :param trec_file: path of a trec file whose third column is a doc id
        :return: plain dict keyed by epoch, then by qid
        """
        ranked_list = defaultdict(dict)
        with open(trec_file) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # blank lines (e.g. a trailing newline) carry no ranking entry
                    continue
                doc_id = fields[2]
                epoch, qid, _ = utils.parse_doc_id(doc_id)

                self.__epochs.add(epoch)
                self.__queries.add(qid)

                ranked_list[epoch].setdefault(qid, []).append(doc_id)
        return dict(ranked_list)
示例#7
0
 def __read_trec_dir(self, trec_dir):
     """
     Load every trec file of a directory into an ``{epoch: {qid: [doc_id, ...]}}`` mapping.

     Files are read in sorted name order. The query id comes from the file
     name, not from the doc ids; the epoch comes from each doc id. Every epoch
     and qid seen is also recorded in ``self.__epochs`` and ``self.__queries``.

     :param trec_dir: directory that contains the trec files
     :return: defaultdict keyed by epoch, then by qid
     """
     ranked_list = defaultdict(dict)
     for trec_fname in sorted(os.listdir(trec_dir)):
         # the query id is the last two '_'-separated parts of the file name
         qid = '_'.join(trec_fname.split('_')[-2:])
         self.__queries.add(qid)
         with open(f'{trec_dir}/{trec_fname}', 'r') as f:
             for line in f:
                 fields = line.split()
                 if not fields:
                     # blank lines (e.g. a trailing newline) carry no ranking entry
                     continue
                 doc_id = fields[2]
                 epoch, _, _ = utils.parse_doc_id(doc_id)
                 self.__epochs.add(epoch)
                 ranked_list[epoch].setdefault(qid, []).append(doc_id)
     return ranked_list
def create_initial_trec_file(output_dir, qid, bots, only_bots, **kwargs):
    """
    Create the first-round trec file of a competition for one query.

    The ranking is copied from ``kwargs['trec_file']`` when that key is given,
    otherwise it is rebuilt from ``kwargs['positions_file']``.

    :param output_dir: prefix of the output path (concatenated as-is)
    :param qid: query id to keep
    :param bots: pids of the bots; also joined into the output file name
    :param only_bots: when truthy, keep only entries whose pid is in ``bots``
    :param kwargs: ``trec_file`` or ``positions_file`` (one is required), and optionally
        ``pid_list``, the names reported when competitors are missing (defaults to ``bots``)
    :return: path of the created trec file
    :raises ValueError: if nothing was written and ``only_bots`` is falsy, or if
        ``only_bots`` is truthy and the number of written lines differs from ``len(bots)``
    """
    logger = logging.getLogger(sys.argv[0])

    new_trec_file = output_dir + 'trec_file_{}_{}'.format(qid, ','.join(bots))

    lines_written = 0
    ensure_dirs(new_trec_file)
    if 'trec_file' in kwargs:
        qrid = get_qrid(qid, 1)
        with open(kwargs['trec_file'], 'r') as trec_file, open(new_trec_file, 'w') as new_file:
            for line in trec_file:
                fields = line.split()
                # skip blank lines and lines that belong to another query/round
                if not fields or fields[0] != qrid:
                    continue
                # the pid is the last '-'-separated part of the doc id column
                pid = fields[2].split('-')[-1]
                if not only_bots or pid in bots:
                    new_file.write(line)
                    lines_written += 1

    else:
        ranked_list = []
        with open(kwargs['positions_file'], 'r') as pos_file:
            for line in pos_file:
                fields = line.split()
                if not fields:
                    continue
                epoch, last_qid, pid = parse_doc_id(fields[2])
                if epoch != '01' or last_qid != qid or (only_bots and pid not in bots):
                    continue
                pid = pid.replace('_', '')
                position = int(fields[3])
                # score is 3 - position, so smaller positions get larger scores
                ranked_list.append([get_qrid(qid, 1), get_doc_id(1, qid, pid), 3 - position])
        # highest score first
        ranked_list.sort(key=lambda entry: entry[2], reverse=True)
        with open(new_trec_file, 'w') as new_file:
            for entry_qrid, entry_doc_id, score in ranked_list:
                new_file.write(f'{entry_qrid} Q0 {entry_doc_id} 0 {score} positions\n')
                lines_written += 1

    if lines_written == 0 and not only_bots:
        raise ValueError(f'query {qid} not in dataset')

    if only_bots and lines_written != len(bots):
        # 'pid_list' is optional: fall back to the bots so that a missing key
        # cannot turn this ValueError into a KeyError
        reported = kwargs.get('pid_list', bots)
        raise ValueError('Competitors {} not in dataset'.format(', '.join(reported)))

    logger.info('Competition trec file created')
    return new_trec_file
示例#9
0
    def __read_positions_file(self, positions_file):
        """
        Load a positions file into an ``{epoch: {qid: [doc_id, ...]}}`` mapping.

        Each non-blank line must have exactly four whitespace-separated columns,
        the third being a doc id and the fourth an integer rank. Every per-query
        list has as many slots as the largest rank in the file; a document is
        stored at index ``rank - 1`` after ``utils.fix_format`` is applied, and
        unfilled slots stay None. Every epoch and qid seen is also recorded in
        ``self.__epochs`` and ``self.__queries``.

        :param positions_file: path of the positions file
        :return: defaultdict keyed by epoch, then by qid
        """
        # Read the file once and drop blank lines (e.g. a trailing newline).
        with open(positions_file, 'r') as f:
            rows = [fields for fields in map(str.split, f) if fields]

        # The list length is the largest rank in the file, never below 0.
        max_rank = max([0] + [int(fields[-1]) for fields in rows])

        ranked_list = defaultdict(dict)
        for _, _, doc_id, rank in rows:
            slot = int(rank) - 1
            epoch, qid, _ = utils.parse_doc_id(doc_id)

            self.__epochs.add(epoch)
            self.__queries.add(qid)

            if qid not in ranked_list[epoch]:
                ranked_list[epoch][qid] = [None] * max_rank
            ranked_list[epoch][qid][slot] = utils.fix_format(doc_id)
        return ranked_list
示例#10
0
 def get_top_player(self, qid, epoch=None):
     """
     Return the player id of the first-ranked document for ``qid``.

     :param qid: query id
     :param epoch: epoch to look at; when None, the greatest known epoch is used
     :return: the third component of the parsed top doc id
     """
     chosen_epoch = max(self.__epochs) if epoch is None else epoch
     ranking = self[chosen_epoch][qid]
     return utils.parse_doc_id(ranking[0])[2]