def create_initial_trectext_file(trectext_file, output_dir, qid, bots, only_bots):
    """
    Build the starting trectext file of a competition from an existing trectext file.

    Only documents whose parsed id has epoch '01' and query id ``qid`` are kept; when
    ``only_bots`` is truthy the document's player id must also appear in ``bots``.
    The text of every kept document is re-joined with one sentence per line.

    :param trectext_file: path of the source trectext (XML-like) file
    :param output_dir: prefix prepended to the new file name
    :param qid: query id of the competition
    :param bots: ids of the bot players (also used in the output file name)
    :param only_bots: keep only documents that belong to ``bots``
    :return: path of the trectext file that was written
    """
    log = logging.getLogger(sys.argv[0])
    target_path = output_dir + 'documents_{}_{}.trectext'.format(qid, ','.join(bots))
    ensure_dirs(target_path)

    # recover=True lets the parser continue past malformed markup
    lenient_parser = etree.XMLParser(recover=True)
    root = ET.parse(trectext_file, parser=lenient_parser).getroot()

    initial_docs = {}
    for doc_node in root:
        pid = None
        for field in doc_node:
            if field.tag == 'TEXT':
                initial_docs[get_doc_id(1, qid, pid)] = '\n'.join(sent_tokenize(field.text))
                continue
            if field.tag != 'DOCNO':
                continue

            epoch, doc_qid, pid = parse_doc_id(field.text)
            belongs = epoch == '01' and doc_qid == qid and (not only_bots or pid in bots)
            if not belongs:
                # skip the remaining fields of this document
                break
            pid = pid.replace('_', '')

    create_trectext_file(initial_docs, target_path)
    log.info('Competition trectext file created')
    return target_path
def log_error(error_dir, command, error):
    """
    Write ``error`` to a file inside ``error_dir`` that is named after the failed command.

    Whitespace-separated arguments of ``command`` that mention 'word2vec_dump' or
    'output_dir' are dropped from the file name; the remaining ones are joined by spaces.

    :param error_dir: prefix (directory) of the error file
    :param command: description of the run that failed
    :param error: the error to record; stored via ``str(error)``
    """
    ensure_dirs(error_dir)
    hidden_markers = ('word2vec_dump', 'output_dir')
    kept_arguments = [
        token for token in command.split()
        if not any(marker in token for marker in hidden_markers)
    ]
    error_path = error_dir + ' '.join(kept_arguments)
    with open(error_path, 'w') as handle:
        handle.write(str(error))
def create_ws(raw_ds, ws_fname, ref_index):
    """
    Write a working-set file that lists every pair of the raw dataset.

    Each output line has the form
    ``{qid}{epoch without leading zeros}{ref_index + 1} Q0 {pair name} 0 {rank} pairs_seo``
    where rank is the 1-based position of the pair within its query entry.

    :param raw_ds: mapping from qrid to an iterable of pairs
    :param ws_fname: path of the working-set file to write
    :param ref_index: zero-based reference index; written as ``ref_index + 1``
    """
    ensure_dirs(ws_fname)
    with open(ws_fname, 'w') as out:
        for qrid, pairs in ((key, raw_ds[key]) for key in raw_ds):
            epoch, qid = parse_qrid(qrid)
            query_key = f'{qid}{epoch.lstrip("0")}{ref_index + 1}'
            for rank, pair in enumerate(pairs, start=1):
                pair_name = generate_pair_name(pair)
                columns = [query_key, 'Q0', pair_name, '0', str(rank), 'pairs_seo']
                out.write(' '.join(columns) + '\n')
def run_all_competitions(mode, tr_method, validation_method, run_name, source='goren',
                         positions_file_paper='./data/paper_data/documents.positions',
                         trec_file_raifer='data/trec_file_original_sorted.txt',
                         embedding_model_file='/lv_local/home/hadarsi/work_files/word2vec_model/word2vec_model'):
    """
    Run all possible query and bot combinations for the given arguments.

    The word embedding model is loaded once, pickled into the output directory so it can be
    handed to ``run_all_queries`` by path, and the pickle is deleted when the run ends.

    :param mode: 'Kof5' where the leading digit K is the number of bots; a mode that starts with
                 'rerun' is not implemented and makes the function return immediately
    :param tr_method: top refinement method; the original documentation lists 'vanilla',
                      'acceleration', 'past_top', 'highest_rated_inferiors', 'past_targets'
                      and 'everything'
    :param validation_method: validation method, forwarded unchanged
    :param run_name: optional suffix for the run's folder name; an empty value adds nothing
    :param source: 'goren' (uses ``positions_file_paper``) or 'raifer' (uses ``trec_file_raifer``)
    :param positions_file_paper: positions file used when ``source`` is 'goren'
    :param trec_file_raifer: trec file used when ``source`` is 'raifer'
    :param embedding_model_file: path handed to ``load_word_embedding_model``
    :raises ValueError: if ``source`` is neither 'goren' nor 'raifer'
    """
    if mode.startswith('rerun'):
        print('Implement this rerunning thing')
        return

    # Validate the cheap inputs first, so that a bad argument does not cost a full model load
    # and does not leave a stray pickle behind.
    assert mode.endswith('of5'), f'Unsupported mode {mode}'
    num_of_bots = int(mode[0])
    if source == 'goren':
        kwargs = {'mode': source, 'positions_file': positions_file_paper}
    elif source == 'raifer':
        kwargs = {'mode': source, 'trec_file': trec_file_raifer}
    else:
        raise ValueError(f'Illegal source given {source}')

    name = tr_method + '_' + validation_method
    # Truthiness test instead of `is not ''`: identity comparison with a literal is unreliable.
    if run_name:
        name += '_' + run_name

    folder_name = '_'.join((mode, datetime.now().strftime('%m_%d'), name))
    results_dir = f'results/{folder_name}/'
    output_dir = f'output/{folder_name}/'
    print(f'Running mode {mode} with top refinement method {tr_method} and validation method {validation_method}')

    word2vec_pkl = output_dir + 'word_embedding_model.pkl'
    ensure_dirs(output_dir)
    word_embedding_model = load_word_embedding_model(embedding_model_file)
    with open(word2vec_pkl, 'wb') as f:
        pickle.dump(word_embedding_model, f)

    try:
        run_all_queries(output_dir, results_dir, num_of_bots, tr_method, validation_method, word2vec_pkl, **kwargs)
        print(f'\t\tFinished running {name} with mode {mode} and TRM {tr_method} and val method {validation_method}')
    finally:
        # The pickle is only a temporary hand-off file; remove it even when the run fails.
        os.remove(word2vec_pkl)
def feature_creation(qrid, ranked_lists, doc_texts, ref_index, copy_docs, doc_tfidf_vectors_dir,
                     sentence_tfidf_vectors_dir, raw_dataset_file, query_text, output_feature_files_dir,
                     output_final_features_file, workingset_file, word_embed_model):
    """
    Create the feature files of a raw dataset and merge them into one final features file.

    Steps: read the raw dataset, write its working-set file, create the per-feature files,
    then run ``scripts/generateSentences.pl`` and move the ``features`` file it leaves in the
    working directory to ``output_final_features_file``.

    :param qrid: query-round id, forwarded to ``create_features``
    :param ranked_lists: forwarded to ``create_features``
    :param doc_texts: forwarded to ``create_features``
    :param ref_index: reference index, used for the working set and the features
    :param copy_docs: forwarded to ``create_features``
    :param doc_tfidf_vectors_dir: forwarded to ``create_features``
    :param sentence_tfidf_vectors_dir: forwarded to ``create_features``
    :param raw_dataset_file: path read by ``read_raw_ds``
    :param query_text: forwarded to ``create_features``
    :param output_feature_files_dir: directory of the per-feature files
    :param output_final_features_file: destination of the merged features file
    :param workingset_file: path of the working-set file to create
    :param word_embed_model: forwarded to ``create_features``
    """
    ensure_dirs(output_feature_files_dir, output_final_features_file)
    raw_ds = read_raw_ds(raw_dataset_file)
    create_ws(raw_ds, workingset_file, ref_index)
    create_features(qrid, ranked_lists, doc_texts, ref_index, copy_docs, doc_tfidf_vectors_dir,
                    sentence_tfidf_vectors_dir, query_text, output_feature_files_dir, raw_ds, word_embed_model)

    # The merge step goes through a fixed 'features' path in the working directory, so it is
    # serialized with the shared lock (presumably against concurrent competitions - confirm).
    constants.lock.acquire()
    try:
        command = f"perl scripts/generateSentences.pl {output_feature_files_dir} {workingset_file}"
        run_and_print(command, 'generateSentences.pl')
        command = "mv features " + output_final_features_file
        run_and_print(command, 'move')
    finally:
        # Always release: a failing command must not leave the lock held forever.
        constants.lock.release()
def run_all_queries(output_dir, results_dir, num_of_bots, tr_method, validation_method, word2vec_pickle,
                    print_interval=5, total_players=5, **kwargs):
    """
    Run a competition for every query and bot combination returned by ``get_bots``.

    Standard output is silenced while a competition runs. A competition that raises is
    reported on stdout and recorded with ``log_error``; the remaining competitions still run.
    After each competition the result directories that exist under ``output_dir`` are copied
    into ``results_dir``.

    :param output_dir: working directory of the competitions (expected to end with '/')
    :param results_dir: directory that receives the copied result directories
    :param num_of_bots: number of bots, forwarded to ``get_bots``
    :param tr_method: top refinement method, forwarded to ``competition_setup``
    :param validation_method: validation method, forwarded to ``competition_setup``
    :param word2vec_pickle: path of the pickled word embedding model
    :param print_interval: a progress line is printed for the first run and every such run
    :param total_players: forwarded to ``get_bots``
    :param kwargs: forwarded to ``get_bots``; must contain 'mode', which is then passed on to
                   ``competition_setup``
    """
    from contextlib import redirect_stdout  # local import keeps this block self-contained

    error_dir = output_dir + 'errors/'
    bots_list = get_bots(num_of_bots, total_players, **kwargs)
    mode = kwargs.pop('mode')

    iteration = 1
    for qid in bots_list:
        for bots in bots_list[qid]:
            run_description = f'output_dir={output_dir} qid={qid} bots={",".join(bots)} trm={tr_method} ' \
                              f'val={validation_method}'
            if iteration == 1 or iteration % print_interval == 0:
                print(f'{iteration}. {run_description}')

            try:
                # The context managers restore sys.stdout and close the devnull handle on both
                # the success and the failure path.
                with open(os.devnull, 'w') as devnull, redirect_stdout(devnull):
                    # competition_setup reads the pickle path from the 'word2vec_dump' keyword;
                    # under any other name it reloads the embedding model from scratch.
                    competition_setup(mode=mode, output_dir=output_dir, qid=qid, bots=bots,
                                      word2vec_dump=word2vec_pickle, top_refinement=tr_method,
                                      validation_method=validation_method, mute=True)
            except Exception as e:
                # Deliberately broad: one failing competition must not stop the whole sweep.
                print(f'#### Error occured in competition {qid} {", ".join(bots)}: \n{str(e)}\n')
                log_error(error_dir, run_description, e)

            ensure_dirs(results_dir)
            # 'trectext_files' matches the directory name competition_setup writes to.
            for directory in ['trec_files', 'trectext_files', 'errors', 'replacements']:
                if os.path.exists(f'{output_dir}/{directory}'):
                    command = f'cp -r {output_dir}/{directory} {results_dir}'
                    run_bash_command(command)
            iteration += 1
def create_initial_trec_file(output_dir, qid, bots, only_bots, **kwargs):
    """
    Create the starting trec file of a competition.

    With a ``trec_file`` keyword, the lines of that file whose first column equals the
    first-round qrid of ``qid`` are copied. Otherwise the ``positions_file`` keyword is read:
    documents of epoch '01' and query ``qid`` are written with score ``3 - position``, highest
    score first. When ``only_bots`` is truthy only documents of players in ``bots`` are kept.

    :param output_dir: prefix prepended to the new file name
    :param qid: query id of the competition
    :param bots: ids of the bot players (also used in the output file name)
    :param only_bots: keep only documents that belong to ``bots``
    :param kwargs: ``trec_file`` or ``positions_file``; an optional ``pid_list`` overrides the
                   player list shown in the error message
    :return: path of the trec file that was written
    :raises ValueError: if nothing was written for the query, or if ``only_bots`` is set and the
                        number of written lines differs from ``len(bots)``
    """
    logger = logging.getLogger(sys.argv[0])
    new_trec_file = output_dir + 'trec_file_{}_{}'.format(qid, ','.join(bots))
    lines_written = 0
    ensure_dirs(new_trec_file)
    first_round_qrid = get_qrid(qid, 1)

    if 'trec_file' in kwargs:
        with open(kwargs['trec_file'], 'r') as trec_file, open(new_trec_file, 'w') as new_file:
            for line in trec_file:
                fields = line.split()
                # blank lines carry no record
                if not fields or fields[0] != first_round_qrid:
                    continue
                pid = fields[2].split('-')[-1]
                if not only_bots or pid in bots:
                    new_file.write(line)
                    lines_written += 1
    else:
        ranked_list = []
        with open(kwargs['positions_file'], 'r') as pos_file:
            for line in pos_file:
                fields = line.split()
                if not fields:
                    continue
                epoch, last_qid, pid = parse_doc_id(fields[2])
                if epoch != '01' or last_qid != qid or (only_bots and pid not in bots):
                    continue
                pid = pid.replace('_', '')
                position = int(fields[3])
                # a smaller position yields a larger score, so it sorts first below
                ranked_list.append((first_round_qrid, get_doc_id(1, qid, pid), 3 - position))

        ranked_list.sort(key=lambda entry: entry[2], reverse=True)
        with open(new_trec_file, 'w') as new_file:
            for entry_qrid, doc_id, score in ranked_list:
                new_file.write(f'{entry_qrid} Q0 {doc_id} 0 {score} positions\n')
                lines_written += 1

    if lines_written == 0 and not only_bots:
        raise ValueError(f'query {qid} not in dataset')
    if only_bots and lines_written != len(bots):
        # Fall back to `bots`: indexing a missing 'pid_list' raised KeyError and hid this error.
        raise ValueError('Competitors {} not in dataset'.format(', '.join(kwargs.get('pid_list', bots))))

    logger.info('Competition trec file created')
    return new_trec_file
def record_doc_similarity(doc_texts, current_epoch, similarity_file, word_embedding_model, document_tfidf_dir):
    """
    Append the similarity between the two documents of the current epoch to a tab-separated file.

    The epoch of a document is the second '-'-separated part of its id. Exactly two documents
    of ``current_epoch`` must exist. A header row is written when ``current_epoch`` is 1, and
    the round column holds ``current_epoch - 1``.

    :param doc_texts: mapping from document id to document text
    :param current_epoch: epoch (int) whose documents are compared
    :param similarity_file: path of the file to append to
    :param word_embedding_model: passed to ``embedding_similarity`` after the two texts
    :param document_tfidf_dir: prefix of the per-document tfidf files
    """
    log = logging.getLogger(sys.argv[0])
    ensure_dirs(similarity_file)

    current_ids = [doc_id for doc_id in doc_texts if int(doc_id.split('-')[1]) == current_epoch]
    current_texts = [doc_texts[doc_id] for doc_id in current_ids]
    assert len(current_ids) == 2

    tfidf_paths = [document_tfidf_dir + doc_id for doc_id in current_ids]
    tfidf_sim = tfidf_similarity(*tfidf_paths)
    embedding_sim = embedding_similarity(*current_texts, word_embedding_model)

    is_first_epoch = current_epoch == 1
    with open(similarity_file, 'a') as out:
        if is_first_epoch:
            out.write('Round\ttfidf\tembedding\n')
        out.write(f'{current_epoch - 1}\t{round(tfidf_sim, 3)}\t{round(embedding_sim, 3)}\n')
    log.info('Recorded document similarity')
def run_reranking(qrid, trec_file, base_index, new_index, swig_path, scripts_dir, stopwords_file,
                  queries_text_file, jar_path, rank_model, output_dir, reranking_ws_name='reranking_ws',
                  new_feature_file_name='new_feature_file', feature_dir_name='feature_dir/',
                  new_trec_file_name='trec_file', score_file_name='score_file'):
    """
    Re-rank the documents of ``qrid`` with the ranking model and return the ordered trec file.

    Pipeline: build a re-ranking working set from ``trec_file``, create a features file for it,
    score the features with the ranking model, write the scores as a trec file and order it.
    All intermediate files are placed under ``output_dir`` using the ``*_name`` arguments.

    :param qrid: query-round id to re-rank
    :param trec_file: trec file holding the current ranked lists
    :param base_index: base index passed to the feature creation
    :param new_index: new index passed to the feature creation
    :param swig_path: passed to the feature creation
    :param scripts_dir: passed to the feature creation
    :param stopwords_file: passed to the feature creation
    :param queries_text_file: passed to the feature creation
    :param jar_path: passed to ``run_model`` together with ``rank_model``
    :param rank_model: ranking model passed to ``run_model``
    :param output_dir: prefix of every file this function creates
    :return: the value returned by ``order_trec_file`` for the re-ranked trec file
    """
    log = logging.getLogger(sys.argv[0])
    ensure_dirs(output_dir)

    # every artefact of the run is addressed relative to output_dir
    reranked_path = output_dir + new_trec_file_name
    feature_path = output_dir + new_feature_file_name
    feature_dir = output_dir + feature_dir_name
    ws_path = output_dir + reranking_ws_name
    scores_path = output_dir + score_file_name

    ranked_lists = TrecReader(trec_file=trec_file, raw=True)
    create_reranking_ws(qrid, ranked_lists, ws_path)

    log.info("creating features")
    features_file = create_features_file_diff(feature_dir, base_index, new_index, feature_path, ws_path,
                                              scripts_dir, swig_path, stopwords_file, queries_text_file)

    log.info("creating docname index")
    docname_by_index = create_index_to_doc_name_dict(features_file)
    log.info("docname index creation is completed")
    query_by_index = create_index_to_query_dict(features_file)
    log.info("features creation completed")

    log.info("running ranking model on features file")
    run_model(features_file, jar_path, scores_path, rank_model)
    log.info("ranking completed")

    log.info("retrieving scores")
    scores = retrieve_scores(docname_by_index, query_by_index, scores_path)
    log.info("scores retrieval completed")

    log.info("creating trec_eval file")
    create_trec_eval_file(scores, reranked_path)
    log.info("trec file creation is completed")

    log.info("ordering trec file")
    ordered_trec_file = order_trec_file(reranked_path)
    log.info("ranking procedure completed")
    return ordered_trec_file
def competition_setup(mode, qid: str, bots: list, top_refinement, validation_method, output_dir='output/tmp/',
                      mute=False, **kwargs):
    """
    Prepare all paths, rankers and models of a competition and run it.

    :param mode: '2of2' runs ``run_2_bot_competition``; 'raifer' (7 rounds, trec file input) and
                 'goren' (3 rounds, positions file input) run ``run_general_competition``
    :param qid: query id of the competition
    :param bots: ids of the bot players; exactly two are required in mode '2of2'
    :param top_refinement: top refinement method, used by the general competition only
    :param validation_method: validation method, used by the general competition only
    :param output_dir: prefix of every file and directory the competition creates
    :param mute: when truthy the root logger level is set above CRITICAL, silencing all logging
    :param kwargs: optional settings:
                   ``top_ranker_args`` - arguments of a separate top ranker (defaults to the
                   pair ranker);
                   ``word2vec_dump`` (alias ``word2vec_pickle``) - path of a pickled word
                   embedding model that is loaded instead of the embedding model file
    :raises ValueError: if ``mode`` is not supported, or if some bot is not a competitor of the
                        query
    """
    # Reject a bad mode before any directory, ranker or model is created or loaded.
    if mode not in ('2of2', 'raifer', 'goren'):
        raise ValueError('Illegal mode given')

    embedding_model_file = '/lv_local/home/hadarsi/work_files/word2vec_model/word2vec_model'
    alternation_classifier_pickle = 'predictors/alteration_classifier.pkl'
    clueweb_index = '/lv_local/home/hadarsi/work_files/clueweb_index/'
    swig_path = '/lv_local/home/hadarsi/indri-5.6/swig/obj/java/'
    coherency_qrels_file = 'data/coherency_aggregated_labels.txt'
    queries_text_file = 'data/working_comp_queries_expanded.txt'
    trectext_file_paper = 'data/paper_data/documents.trectext'
    unranked_features_file = 'data/features_bot_sorted.txt'
    positions_file = 'data/paper_data/documents.positions'
    trec_file = 'data/trec_file_original_sorted.txt'
    trectext_file_raifer = 'data/documents.trectext'
    aggregated_data_dir = 'data/learning_dataset/'
    rank_model = 'rank_models/model_lambdatamart'
    queries_xml_file = 'data/queries_seo_exp.xml'
    indri_path = '/lv_local/home/hadarsi/indri/'
    seo_qrels_file = 'data/qrels_seo_bot.txt'
    stopwords_file = 'data/stopwords_list'
    ranklib_jar = 'scripts/RankLib.jar'
    svm_rank_scripts_dir = 'scripts/'
    svm_models_dir = 'rank_models/'
    scripts_dir = 'scripts/'
    pair_ranker_args = ('harmonic', 1)

    ensure_dirs(output_dir)
    document_workingset_file = output_dir + 'document_ws.txt'
    rep_val_dir = output_dir + 'replacement_evaluation/'
    final_features_dir = output_dir + 'final_features/'
    doc_tfidf_dir = output_dir + 'document_tfidf/'
    trectext_dir = output_dir + 'trectext_files/'
    predictions_dir = output_dir + 'predictions/'
    reranking_dir = output_dir + 'reranking/'
    raw_ds_dir = output_dir + 'raw_datasets/'
    trec_dir = output_dir + 'trec_files/'
    # NOTE: an index name that also contained the qid and the bots was used here before;
    # the index path now depends on output_dir only.
    competition_index = output_dir + 'index'

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    # a level above CRITICAL suppresses every log record
    logging.root.setLevel(level=(logging.CRITICAL + 1) if mute else logging.INFO)
    logger.info("Running {}".format(' '.join(sys.argv)))

    # the rankers are trained on demand, the first time their model file is missing
    pair_ranker = svm_models_dir + get_model_name(pair_ranker_args)
    if not os.path.exists(pair_ranker):
        create_pair_ranker(pair_ranker, pair_ranker_args, aggregated_data_dir, seo_qrels_file,
                           coherency_qrels_file, unranked_features_file, svm_rank_scripts_dir)

    if 'top_ranker_args' in kwargs:
        top_ranker_args = kwargs.pop('top_ranker_args')
        top_ranker = svm_models_dir + get_model_name(top_ranker_args)
        if not os.path.exists(top_ranker):
            create_pair_ranker(top_ranker, top_ranker_args, aggregated_data_dir, seo_qrels_file,
                               coherency_qrels_file, unranked_features_file, svm_rank_scripts_dir)
    else:
        top_ranker = pair_ranker

    # Load the word2vec model. 'word2vec_pickle' is accepted as an alias because that is the
    # keyword run_all_queries uses; ignoring it forced a reload from the embedding file.
    word2vec_dump = kwargs.pop('word2vec_dump', None)
    word2vec_pickle = kwargs.pop('word2vec_pickle', None)
    model_pickle = word2vec_dump or word2vec_pickle
    if model_pickle:
        # NOTE: pickle must only be used on files produced by this project.
        with open(model_pickle, 'rb') as model_file:
            word_embedding_model = pickle.load(model_file)
        logger.info('Loaded word Embedding Model from pickle')
    else:
        word_embedding_model = load_word_embedding_model(embedding_model_file)
        logger.info('Loaded word Embedding Model from file')

    with open(alternation_classifier_pickle, 'rb') as classifier_file:
        alternation_classifier = pickle.load(classifier_file)

    if mode == '2of2':
        trectext_file = trectext_file_raifer
        assert len(bots) == 2

        replacements_file = output_dir + 'replacements/replacements_{}_{}'.format(qid, ','.join(bots))
        similarity_file = output_dir + 'similarity_results/similarity_{}_{}.txt'.format(qid, ','.join(bots))
        # both files are appended to, so leftovers of an earlier run are removed first
        for stale_file in [replacements_file, similarity_file]:
            if os.path.exists(stale_file):
                os.remove(stale_file)

        run_2_bot_competition(qid, bots, trectext_file, trec_file, output_dir, clueweb_index, competition_index,
                              document_workingset_file, doc_tfidf_dir, reranking_dir, trec_dir, trectext_dir,
                              raw_ds_dir, predictions_dir, final_features_dir, swig_path, indri_path,
                              replacements_file, similarity_file, svm_rank_scripts_dir, 10, scripts_dir,
                              stopwords_file, queries_text_file, queries_xml_file, ranklib_jar, rank_model,
                              pair_ranker, top_ranker, word_embedding_model)
    else:
        replacements_file = output_dir + 'replacements/replacements_' + '_'.join([qid, ','.join(bots)])
        if os.path.exists(replacements_file):
            os.remove(replacements_file)

        competitors = get_competitors(qid=qid, trec_file=(trec_file if mode == 'raifer' else positions_file))
        if mode == 'raifer':
            trectext_file = trectext_file_raifer
            kwargs = dict(trec_file=trec_file)
            rounds = 7
        else:
            # mode == 'goren'; every other value was rejected at the top
            trectext_file = trectext_file_paper
            kwargs = dict(positions_file=positions_file)
            rounds = 3

        if not all(bot in competitors for bot in bots):
            raise ValueError(f'Not all given bots are competitors in the query \n'
                             f'bots: {bots} \ncompetitors: {competitors}')

        run_general_competition(qid, competitors, bots, rounds, top_refinement, validation_method, trectext_file,
                                output_dir, document_workingset_file, indri_path, swig_path, doc_tfidf_dir,
                                reranking_dir, trec_dir, trectext_dir, raw_ds_dir, predictions_dir,
                                final_features_dir, clueweb_index, competition_index, replacements_file,
                                svm_rank_scripts_dir, scripts_dir, stopwords_file, queries_text_file,
                                queries_xml_file, ranklib_jar, rank_model, pair_ranker, top_ranker,
                                word_embedding_model, alternation_classifier, rep_val_dir, **kwargs)
def record_replacement(replacements_file, epoch, in_doc_id, out_doc_id, out_index, in_index, features):
    """
    Append one tab-separated replacement record to ``replacements_file``.

    Column order: epoch, in_doc_id, out_doc_id, out_index, in_index, comma-joined features.

    :param replacements_file: path of the file to append to
    :param epoch: epoch of the replacement
    :param in_doc_id: first document id column
    :param out_doc_id: second document id column
    :param out_index: first index column
    :param in_index: second index column
    :param features: iterable of strings, stored as one comma-separated column
    """
    ensure_dirs(replacements_file)
    with open(replacements_file, 'a') as log_file:
        columns = (epoch, in_doc_id, out_doc_id, out_index, in_index, ','.join(features))
        log_file.write('\t'.join(map(str, columns)) + '\n')
def generate_document_tfidf_files(workingset_file, output_dir, new_index, base_index=clueweb_index):
    """
    Run the ``PrepareTFIDFVectorsWSDiff`` Java program for the given working set.

    The program is started with ``swig_path`` as the Java library path and
    ``indri_utils_path`` as the class path (both module-level settings).

    :param workingset_file: working-set file argument of the program
    :param output_dir: output directory argument of the program; created if needed
    :param new_index: second index argument of the program
    :param base_index: first index argument of the program, defaults to ``clueweb_index``
    """
    ensure_dirs(output_dir)
    arguments = [
        'java',
        f'-Djava.library.path={swig_path}',
        '-cp',
        f'{indri_utils_path}',
        'PrepareTFIDFVectorsWSDiff',
        f'{base_index}',
        f'{new_index}',
        f'{workingset_file}',
        f'{output_dir}',
    ]
    run_and_print(' '.join(arguments), command_name='Document tfidf Creation')
def generate_predictions(model_path, svm_rank_scripts_dir, predictions_dir, feature_file):
    """
    Classify a feature file with ``svm_rank_classify`` and return the predictions file path.

    The predictions file is named after the feature file, with '_predictions' inserted
    between its base name and its extension, and is placed under ``predictions_dir``.

    :param model_path: model file handed to ``svm_rank_classify``
    :param svm_rank_scripts_dir: prefix of the ``svm_rank_classify`` executable
    :param predictions_dir: prefix of the predictions file
    :param feature_file: feature file to classify
    :return: path of the predictions file
    """
    stem, extension = splitext(basename(feature_file))
    predictions_file = predictions_dir + stem + '_predictions' + extension
    ensure_dirs(predictions_file)

    classifier = f'{svm_rank_scripts_dir}svm_rank_classify'
    run_and_print(f'{classifier} {feature_file} {model_path} {predictions_file}', 'pair classify')
    return predictions_file
def create_model(svm_rank_scripts_dir, model_path, learning_data, svm_rank_c=0.01):
    """
    Train a model with ``svm_rank_learn`` and store it at ``model_path``.

    :param svm_rank_scripts_dir: prefix of the ``svm_rank_learn`` executable
    :param model_path: destination of the learned model
    :param learning_data: training data file handed to ``svm_rank_learn``
    :param svm_rank_c: value of the ``-c`` option of ``svm_rank_learn``
    """
    ensure_dirs(model_path)
    learner = f'{svm_rank_scripts_dir}svm_rank_learn'
    run_and_print(f'{learner} -c {svm_rank_c} {learning_data} {model_path}', 'pair ranker learn')