def get_last_top_document(trec_reader: TrecReader, qid):
    """
    Return the previous round's top document for a query if its author lost first place.

    :param trec_reader: reader that is iterable (its two largest items are used as
        round keys), indexable as ``trec_reader[round][qid]`` and exposes ``num_epochs()``
    :param qid: query id
    :return: the doc id ranked first in the second-to-last round, or None when the
        reader holds exactly one epoch or when the same player (third component of
        ``parse_doc_id``) is ranked first in both of the last two rounds
    """
    if trec_reader.num_epochs() == 1:
        # Only one round exists, so there is no previous winner to compare with.
        return None

    previous_round, latest_round = sorted(trec_reader)[-2:]
    previous_winner_doc = trec_reader[previous_round][qid][0]
    latest_winner_doc = trec_reader[latest_round][qid][0]

    previous_winner = parse_doc_id(previous_winner_doc)[2]
    latest_winner = parse_doc_id(latest_winner_doc)[2]
    return None if previous_winner == latest_winner else previous_winner_doc
def get_pids(self, qid):
    """
    List the player ids of a query's documents in the earliest stored epoch.

    :param qid: query id
    :return: list with the third component of ``utils.parse_doc_id`` for every doc id
        stored for ``qid`` under the smallest epoch, in stored order
    """
    first_epoch = min(self.__epochs)
    pids = []
    for doc_id in self.__ranked_list[first_epoch][qid]:
        pids.append(utils.parse_doc_id(doc_id)[2])
    return pids
def create_initial_trectext_file(trectext_file, output_dir, qid, bots, only_bots):
    """
    Create the first-round trectext file of a single query and return its path.

    :param trectext_file: source trectext file, parsed as XML with a recovering parser
    :param output_dir: prefix that is concatenated directly with the new file name
    :param qid: query id whose documents are kept
    :param bots: pids of the bots; also joined with ',' into the new file name
    :param only_bots: when truthy, documents whose pid is not in ``bots`` are dropped
    :return: path of the newly created trectext file
    """
    logger = logging.getLogger(sys.argv[0])

    bot_names = ','.join(bots)
    new_trectext_file = output_dir + 'documents_{}_{}.trectext'.format(qid, bot_names)
    ensure_dirs(new_trectext_file)

    xml_parser = etree.XMLParser(recover=True)
    root = ET.parse(trectext_file, parser=xml_parser).getroot()

    texts_by_doc_id = {}
    for document in root:
        # NOTE: the pid is only known once DOCNO has been seen; a TEXT child that
        # comes before DOCNO is keyed with pid None.
        player = None
        for field in document:
            tag = field.tag
            if tag == 'DOCNO':
                round_no, doc_qid, player = parse_doc_id(field.text)
                belongs = round_no == '01' and doc_qid == qid
                if not belongs or (only_bots and player not in bots):
                    # Leaving the inner loop skips the remaining children of this document.
                    break
                player = player.replace('_', '')
            elif tag == 'TEXT':
                sentences = sent_tokenize(field.text)
                texts_by_doc_id[get_doc_id(1, qid, player)] = '\n'.join(sentences)

    create_trectext_file(texts_by_doc_id, new_trectext_file)
    logger.info('Competition trectext file created')
    return new_trectext_file
def get_rankings(trec_file, bot_ids, qid, epoch):
    """
    Split the ranking of one query in one round into bot and student positions.

    :param trec_file: path of a trec file; the third whitespace-separated field of
        every line is treated as a doc id
    :param bot_ids: the pids of the players who are bots
    :param qid: query id
    :param epoch: current round; converted to a string and zero-padded to two characters
    :return: two dictionaries of the form {pid: location}, one for the bots and the
        other for the students; locations count matching lines from 0 in file order
    """
    wanted_epoch = str(epoch).zfill(2)
    bots, students = {}, {}
    position = 0
    with open(trec_file, 'r') as handle:
        for row in handle:
            row_epoch, row_qid, pid = parse_doc_id(row.split()[2])
            if row_epoch == wanted_epoch and row_qid == qid:
                target = bots if pid in bot_ids else students
                target[pid] = position
                # Only lines of the requested query and round advance the position.
                position += 1
    return bots, students
def get_ranked_competitors_list(trec_file, current_epoch):
    """
    Collect the pids of one round's documents in the order they appear in a trec file.

    :param trec_file: path of a trec file
    :param current_epoch: round to keep, compared as an int against the last two
        characters of the first field of every line
    :return: list with the third component of ``parse_doc_id`` for each kept line
    """
    ranked_pids = []
    with open(trec_file, 'r') as handle:
        for row in handle:
            fields = row.split()
            if int(fields[0][-2:]) == current_epoch:
                ranked_pids.append(parse_doc_id(fields[2])[2])
    return ranked_pids
def __read_trec_file(self, trec_file):
    """
    Load a single trec file into an {epoch: {qid: [doc_id, ...]}} mapping.

    Every epoch and query id parsed from a doc id is also recorded in the
    instance's epoch and query sets.

    :param trec_file: path of a trec file; the third whitespace-separated field of
        every line is treated as a doc id
    :return: plain dict keyed by epoch, then by qid, holding doc ids in file order
    """
    by_epoch = defaultdict(dict)
    with open(trec_file) as handle:
        for row in handle:
            doc_id = row.split()[2]
            epoch, qid, _ = utils.parse_doc_id(doc_id)
            self.__epochs.add(epoch)
            self.__queries.add(qid)
            by_epoch[epoch].setdefault(qid, []).append(doc_id)
    return dict(by_epoch)
def __read_trec_dir(self, trec_dir):
    """
    Load every file of a directory into an {epoch: {qid: [doc_id, ...]}} mapping.

    The query id of each file is taken from its name (the last two '_'-separated
    parts, re-joined with '_'), not from the doc ids it contains. Query ids and
    epochs are also recorded in the instance's query and epoch sets.

    :param trec_dir: directory whose entries are read in sorted name order
    :return: defaultdict keyed by epoch, then by qid, holding doc ids in file order;
        it is returned as-is, so looking up an unknown epoch creates an empty entry
    """
    by_epoch = defaultdict(dict)
    for file_name in sorted(os.listdir(trec_dir)):
        qid = '_'.join(file_name.split('_')[-2:])
        self.__queries.add(qid)
        with open(f'{trec_dir}/{file_name}', 'r') as handle:
            for row in handle:
                doc_id = row.split()[2]
                epoch, _, _ = utils.parse_doc_id(doc_id)
                self.__epochs.add(epoch)
                by_epoch[epoch].setdefault(qid, []).append(doc_id)
    return by_epoch
def create_initial_trec_file(output_dir, qid, bots, only_bots, **kwargs):
    """
    Create the first-round trec file of a single query and return its path.

    The ranking comes from one of two sources, selected by keyword argument:

    * ``trec_file``: lines whose first field equals ``get_qrid(qid, 1)`` are copied
      verbatim. The pid of a line is the text after the last '-' of its third field.
    * ``positions_file`` (used when ``trec_file`` is absent): the third field of every
      line is a doc id and the fourth an integer position. Lines of epoch '01' and the
      given query are rewritten as ``<qrid> Q0 <doc_id> 0 <3 - position> positions``
      and written in descending order of that score.

    :param output_dir: prefix that is concatenated directly with the new file name
    :param qid: query id
    :param bots: pids of the bots; also joined with ',' into the new file name
    :param only_bots: when truthy, only lines whose pid is in ``bots`` are written
    :param kwargs: ``trec_file`` or ``positions_file`` as described above, and
        optionally ``pid_list``, the competitor names reported in the error message
        (defaults to ``bots`` when omitted)
    :return: path of the newly created trec file
    :raises ValueError: if nothing was written and ``only_bots`` is falsy, or if
        ``only_bots`` is truthy and the number of written lines differs from ``len(bots)``
    """
    logger = logging.getLogger(sys.argv[0])
    new_trec_file = output_dir + 'trec_file_{}_{}'.format(qid, ','.join(bots))
    lines_written = 0
    ensure_dirs(new_trec_file)

    if 'trec_file' in kwargs:
        qrid = get_qrid(qid, 1)
        with open(kwargs['trec_file'], 'r') as trec_file, \
                open(new_trec_file, 'w') as new_file:
            for line in trec_file:
                fields = line.split()
                if fields[0] != qrid:
                    continue
                pid = fields[2].split('-')[-1]
                if not only_bots or pid in bots:
                    new_file.write(line)
                    lines_written += 1
    else:
        ranked_list = []
        with open(kwargs['positions_file'], 'r') as pos_file:
            for line in pos_file:
                fields = line.split()
                epoch, last_qid, pid = parse_doc_id(fields[2])
                if epoch != '01' or last_qid != qid or (only_bots and pid not in bots):
                    continue
                pid = pid.replace('_', '')
                position = int(fields[3])
                # A smaller position yields a larger score, so the descending sort
                # below lists the best-placed document first.
                ranked_list.append([get_qrid(qid, 1), get_doc_id(1, qid, pid), 3 - position])
        ranked_list.sort(key=lambda entry: entry[2], reverse=True)
        with open(new_trec_file, 'w') as new_file:
            for qrid, doc_id, score in ranked_list:
                new_file.write(f'{qrid} Q0 {doc_id} 0 {score} positions\n')
                lines_written += 1

    if lines_written == 0 and not only_bots:
        raise ValueError(f'query {qid} not in dataset')
    if only_bots and lines_written != len(bots):
        # 'pid_list' is optional: indexing kwargs directly would replace this
        # ValueError with a KeyError whenever the caller did not pass it.
        competitors = kwargs.get('pid_list', bots)
        raise ValueError('Competitors {} not in dataset'.format(', '.join(competitors)))

    logger.info('Competition trec file created')
    return new_trec_file
def __read_positions_file(self, positions_file):
    """
    Load a positions file into an {epoch: {qid: [doc_id, ...]}} mapping.

    The file is read twice: first to find the largest rank (the last field of every
    line), then to place each document at index ``rank - 1`` of a list pre-filled
    with None. Epochs and query ids are also recorded in the instance's sets.

    :param positions_file: path of a file whose lines have exactly four
        whitespace-separated fields, the third a doc id and the fourth an integer rank
    :return: defaultdict keyed by epoch, then by qid; every list has as many slots as
        the largest rank in the file and stores ``utils.fix_format(doc_id)`` values
    """
    with open(positions_file, 'r') as handle:
        seen_ranks = [int(row.split()[-1]) for row in handle]
    # Including 0 keeps the slot count non-negative, also for an empty file.
    max_rank = max([0, *seen_ranks])

    by_epoch = defaultdict(dict)
    with open(positions_file, 'r') as handle:
        for row in handle:
            _, _, doc_id, raw_rank = row.split()
            slot = int(raw_rank) - 1
            epoch, qid, _ = utils.parse_doc_id(doc_id)
            self.__epochs.add(epoch)
            self.__queries.add(qid)
            per_query = by_epoch[epoch]
            if qid not in per_query:
                per_query[qid] = [None] * max_rank
            per_query[qid][slot] = utils.fix_format(doc_id)
    return by_epoch
def get_top_player(self, qid, epoch=None):
    """
    Return the pid of the player whose document is ranked first for a query.

    :param qid: query id
    :param epoch: round to look at; the largest stored epoch is used when None
    :return: third component of ``utils.parse_doc_id`` for the first doc id of
        ``self[epoch][qid]``
    """
    chosen_epoch = max(self.__epochs) if epoch is None else epoch
    winning_doc_id = self[chosen_epoch][qid][0]
    return utils.parse_doc_id(winning_doc_id)[2]