def create_initial_trectext_file(trectext_file, output_dir, qid, bots, only_bots):
    logger = logging.getLogger(sys.argv[0])

    new_trectext_file = output_dir + 'documents_{}_{}.trectext'.format(qid, ','.join(bots))
    ensure_dirs(new_trectext_file)

    # Use lxml's recovering parser so malformed markup in the trectext file does not abort parsing
    parser = etree.XMLParser(recover=True)
    tree = ET.parse(trectext_file, parser=parser)
    root = tree.getroot()
    docs = {}
    for doc in root:
        pid = None
        for att in doc:
            if att.tag == 'DOCNO':
                doc_id = att.text
                epoch, last_qid, pid = parse_doc_id(doc_id)
                if epoch != '01' or last_qid != qid or (only_bots and pid not in bots):
                    break
                pid = pid.replace('_', '')
            elif att.tag == 'TEXT':
                docs[get_doc_id(1, qid, pid)] = '\n'.join(sent_tokenize(att.text))

    create_trectext_file(docs, new_trectext_file)
    logger.info('Competition trectext file created')
    return new_trectext_file
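# Hedged usage sketch: the call below is purely illustrative (paths, query id and
# bot player ids are hypothetical) and assumes this module's helpers are importable;
# it builds the round-one trectext file for a single query restricted to its bots.
def _example_create_initial_trectext():
    return create_initial_trectext_file(trectext_file='data/documents.trectext',  # hypothetical input path
                                        output_dir='output/example/',             # hypothetical output directory
                                        qid='195',                                # hypothetical query id
                                        bots=['51', '52'],                        # hypothetical bot player ids
                                        only_bots=True)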
def log_error(error_dir, command, error):
    ensure_dirs(error_dir)
    command = ' '.join([
        argument for argument in command.split()
        if not ('word2vec_dump' in argument or 'output_dir' in argument)
    ])
    with open(error_dir + command, 'w') as f:
        f.write(str(error))
def create_ws(raw_ds, ws_fname, ref_index):
    ensure_dirs(ws_fname)
    with open(ws_fname, 'w') as ws:
        for qrid in raw_ds:
            epoch, qid = parse_qrid(qrid)
            query_write = f'{qid}{epoch.lstrip("0")}{ref_index + 1}'
            for i, pair in enumerate(raw_ds[qrid]):
                name = generate_pair_name(pair)
                ws.write(query_write + " Q0 " + name + " 0 " + str(i + 1) + " pairs_seo\n")
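# For reference, each working-set line written above has the shape
#   "<qid><epoch stripped of leading zeros><ref_index + 1> Q0 <pair name> 0 <1-based rank> pairs_seo"
# where the pair name comes from generate_pair_name (not shown in this snippet).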
def run_all_competitions(
    mode,
    tr_method,
    validation_method,
    run_name,
    source='goren',
    positions_file_paper='./data/paper_data/documents.positions',
    trec_file_raifer='data/trec_file_original_sorted.txt',
    embedding_model_file='/lv_local/home/hadarsi/work_files/word2vec_model/word2vec_model'
):
    """
    A function which runs all possible queries and bot combinations for the given arguments
    top_refinement: one of the following options: 'vanilla', 'acceleration', 'past_top', 'highest_rated_inferiors',
                              'past_targets', 'everything'
    """
    if mode.startswith('rerun'):
        print('Implement this rerunning thing')
        return

    name = tr_method + '_' + validation_method
    if run_name != '':
        name += '_' + run_name

    folder_name = '_'.join((mode, datetime.now().strftime('%m_%d'), name))
    results_dir = f'results/{folder_name}/'
    output_dir = f'output/{folder_name}/'

    print(
        f'Running mode {mode} with top refinement method {tr_method} and validation method {validation_method}'
    )

    word2vec_pkl = output_dir + 'word_embedding_model.pkl'
    ensure_dirs(output_dir)
    word_embedding_model = load_word_embedding_model(embedding_model_file)
    with open(word2vec_pkl, 'wb') as f:
        pickle.dump(word_embedding_model, f)

    assert mode.endswith('of5')
    num_of_bots = int(mode[0])
    if source == 'goren':
        kwargs = {'mode': source, 'positions_file': positions_file_paper}
    elif source == 'raifer':
        kwargs = {'mode': source, 'trec_file': trec_file_raifer}
    else:
        raise ValueError(f'Illegal source given: {source}')

    run_all_queries(output_dir, results_dir, num_of_bots, tr_method,
                    validation_method, word2vec_pkl, **kwargs)
    print(
        f'\t\tFinished running {name} with mode {mode} and TRM {tr_method} and val method {validation_method}'
    )
    os.remove(word2vec_pkl)
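# Hedged usage sketch: the argument values are illustrative only (the set of valid
# validation-method names is not defined in this snippet); per the code above, mode
# must start with the number of bots and end with 'of5'.
def _example_run_all_competitions():
    run_all_competitions(mode='1of5',
                         tr_method='vanilla',  # one of the top-refinement options from the docstring
                         validation_method='some_validation_method',  # hypothetical name
                         run_name='demo',
                         source='raifer')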
def feature_creation(qrid, ranked_lists, doc_texts, ref_index, copy_docs, doc_tfidf_vectors_dir,
                     sentence_tfidf_vectors_dir, raw_dataset_file, query_text, output_feature_files_dir,
                     output_final_features_file, workingset_file, word_embed_model):
    ensure_dirs(output_feature_files_dir, output_final_features_file)
    raw_ds = read_raw_ds(raw_dataset_file)
    create_ws(raw_ds, workingset_file, ref_index)
    create_features(qrid, ranked_lists, doc_texts, ref_index, copy_docs, doc_tfidf_vectors_dir,
                    sentence_tfidf_vectors_dir, query_text, output_feature_files_dir, raw_ds, word_embed_model)

    constants.lock.acquire()
    command = f"perl scripts/generateSentences.pl {output_feature_files_dir} {workingset_file}"
    run_and_print(command, 'generateSentences.pl')
    command = "mv features " + output_final_features_file
    run_and_print(command, 'move')
    constants.lock.release()
def run_all_queries(output_dir,
                    results_dir,
                    num_of_bots,
                    tr_method,
                    validation_method,
                    word2vec_pickle,
                    print_interval=5,
                    total_players=5,
                    **kwargs):
    error_dir = output_dir + 'errors/'
    bots_list = get_bots(num_of_bots, total_players, **kwargs)
    mode = kwargs.pop('mode')

    iteration = 1
    for qid in bots_list:
        for bots in bots_list[qid]:
            run_description = f'output_dir={output_dir} qid={qid} bots={",".join(bots)} trm={tr_method} ' \
                              f'val={validation_method}'
            if iteration == 1 or iteration % print_interval == 0:
                print(f'{iteration}. {run_description}')

            # Silence the competition's console output by redirecting stdout to devnull
            stdout = sys.stdout
            devnull = open(os.devnull, 'w')
            sys.stdout = devnull
            try:
                competition_setup(mode=mode,
                                  output_dir=output_dir,
                                  qid=qid,
                                  bots=bots,
                                  word2vec_pickle=word2vec_pickle,
                                  top_refinement=tr_method,
                                  validation_method=validation_method,
                                  mute=True)
            except Exception as e:
                sys.stdout = stdout
                print(f'#### Error occurred in competition {qid} {", ".join(bots)}: \n{e}\n')
                log_error(error_dir, run_description, e)
            finally:
                sys.stdout = stdout
                devnull.close()

            ensure_dirs(results_dir)
            for directory in [
                    'trec_files', 'trectext_files', 'errors', 'replacements'
            ]:
                if os.path.exists(f'{output_dir}/{directory}'):
                    command = f'cp -r {output_dir}/{directory} {results_dir}'
                    run_bash_command(command)
            iteration += 1
def create_initial_trec_file(output_dir, qid, bots, only_bots, **kwargs):
    logger = logging.getLogger(sys.argv[0])

    new_trec_file = output_dir + 'trec_file_{}_{}'.format(qid, ','.join(bots))

    lines_written = 0
    ensure_dirs(new_trec_file)
    if 'trec_file' in kwargs:
        qrid = get_qrid(qid, 1)
        with open(kwargs['trec_file'], 'r') as trec_file:
            with open(new_trec_file, 'w') as new_file:
                for line in trec_file:
                    last_qrid = line.split()[0]
                    if last_qrid != qrid:
                        continue
                    pid = line.split()[2].split('-')[-1]
                    if not only_bots or pid in bots:
                        new_file.write(line)
                        lines_written += 1

    else:
        ranked_list = []
        with open(kwargs['positions_file'], 'r') as pos_file:
            for line in pos_file:
                doc_id = line.split()[2]
                epoch, last_qid, pid = parse_doc_id(doc_id)
                if epoch != '01' or last_qid != qid or (only_bots and pid not in bots):
                    continue
                if '_' in pid:
                    pid = pid.replace('_', '')
                position = int(line.split()[3])
                ranked_list.append([get_qrid(qid, 1), get_doc_id(1, qid, pid), 3 - position])
        ranked_list.sort(key=lambda x: x[2], reverse=True)
        with open(new_trec_file, 'w') as new_file:
            for entry_qrid, doc_id, score in ranked_list:
                new_file.write(f'{entry_qrid} Q0 {doc_id} 0 {score} positions\n')
                lines_written += 1

    if lines_written == 0 and not only_bots:
        raise ValueError(f'query {qid} not in dataset')

    if only_bots and lines_written != len(bots):
        raise ValueError('Competitors {} not in dataset'.format(', '.join(bots)))

    logger.info('Competition trec file created')
    return new_trec_file
def record_doc_similarity(doc_texts, current_epoch, similarity_file, word_embedding_model, document_tfidf_dir):
    logger = logging.getLogger(sys.argv[0])
    ensure_dirs(similarity_file)

    recent_documents = []
    recent_texts = []
    for document in doc_texts:
        epoch = int(document.split('-')[1])
        if epoch == current_epoch:
            recent_documents.append(document)
            recent_texts.append(doc_texts[document])
    assert len(recent_documents) == 2

    tfidf_sim = tfidf_similarity(*[document_tfidf_dir + doc for doc in recent_documents])
    embedding_sim = embedding_similarity(*recent_texts, word_embedding_model)
    with open(similarity_file, 'a') as f:
        if current_epoch == 1:
            f.write('Round\ttfidf\tembedding\n')
        f.write(f'{current_epoch - 1}\t{round(tfidf_sim, 3)}\t{round(embedding_sim, 3)}\n')
    logger.info('Recorded document similarity')
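# For reference, a hypothetical similarity_file after two rounds would hold
# tab-separated rows such as:
#   Round   tfidf   embedding
#   0       0.412   0.873
#   1       0.398   0.881
# (values made up; the header row is only written when current_epoch == 1).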
def run_reranking(qrid, trec_file, base_index, new_index, swig_path, scripts_dir, stopwords_file, queries_text_file,
                  jar_path, rank_model, output_dir, reranking_ws_name='reranking_ws',
                  new_feature_file_name='new_feature_file', feature_dir_name='feature_dir/',
                  new_trec_file_name='trec_file', score_file_name='score_file'):
    logger = logging.getLogger(sys.argv[0])
    ensure_dirs(output_dir)

    reranked_trec_file = output_dir + new_trec_file_name
    feature_file = output_dir + new_feature_file_name
    full_feature_dir = output_dir + feature_dir_name
    reranking_ws = output_dir + reranking_ws_name
    score_file = output_dir + score_file_name

    raw_ranked_lists = TrecReader(trec_file=trec_file, raw=True)
    create_reranking_ws(qrid, raw_ranked_lists, reranking_ws)
    logger.info("creating features")
    features_file = create_features_file_diff(full_feature_dir, base_index, new_index, feature_file, reranking_ws,
                                              scripts_dir, swig_path, stopwords_file, queries_text_file)
    logger.info("creating docname index")
    docname_index = create_index_to_doc_name_dict(features_file)
    logger.info("docname index creation is completed")
    query_index = create_index_to_query_dict(features_file)
    logger.info("features creation completed")
    logger.info("running ranking model on features file")
    run_model(features_file, jar_path, score_file, rank_model)
    logger.info("ranking completed")
    logger.info("retrieving scores")
    scores = retrieve_scores(docname_index, query_index, score_file)
    logger.info("scores retrieval completed")
    logger.info("creating trec_eval file")
    create_trec_eval_file(scores, reranked_trec_file)
    logger.info("trec file creation is completed")
    logger.info("ordering trec file")
    final = order_trec_file(reranked_trec_file)
    logger.info("ranking procedure completed")
    return final
def competition_setup(mode, qid: str, bots: list, top_refinement, validation_method, output_dir='output/tmp/',
                      mute=False, **kwargs):
    embedding_model_file = '/lv_local/home/hadarsi/work_files/word2vec_model/word2vec_model'
    alternation_classifier_pickle = 'predictors/alteration_classifier.pkl'
    clueweb_index = '/lv_local/home/hadarsi/work_files/clueweb_index/'
    swig_path = '/lv_local/home/hadarsi/indri-5.6/swig/obj/java/'
    coherency_qrels_file = 'data/coherency_aggregated_labels.txt'
    queries_text_file = 'data/working_comp_queries_expanded.txt'
    trectext_file_paper = 'data/paper_data/documents.trectext'
    unranked_features_file = 'data/features_bot_sorted.txt'
    positions_file = 'data/paper_data/documents.positions'
    trec_file = 'data/trec_file_original_sorted.txt'
    trectext_file_raifer = 'data/documents.trectext'
    aggregated_data_dir = 'data/learning_dataset/'
    rank_model = 'rank_models/model_lambdatamart'
    queries_xml_file = 'data/queries_seo_exp.xml'
    indri_path = '/lv_local/home/hadarsi/indri/'
    seo_qrels_file = 'data/qrels_seo_bot.txt'
    stopwords_file = 'data/stopwords_list'
    ranklib_jar = 'scripts/RankLib.jar'
    svm_rank_scripts_dir = 'scripts/'
    svm_models_dir = 'rank_models/'
    scripts_dir = 'scripts/'

    pair_ranker_args = ('harmonic', 1)

    ensure_dirs(output_dir)
    document_workingset_file = output_dir + 'document_ws.txt'
    rep_val_dir = output_dir + 'replacement_evaluation/'
    final_features_dir = output_dir + 'final_features/'
    doc_tfidf_dir = output_dir + 'document_tfidf/'
    trectext_dir = output_dir + 'trectext_files/'
    predictions_dir = output_dir + 'predictions/'
    reranking_dir = output_dir + 'reranking/'
    raw_ds_dir = output_dir + 'raw_datasets/'
    trec_dir = output_dir + 'trec_files/'
    # competition_index = output_dir + 'index_' + qid + '_' + ','.join(bots)
    competition_index = output_dir + 'index'

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.CRITICAL + 1 if mute else logging.INFO)
    logger.info("Running {}".format(' '.join(sys.argv)))

    pair_ranker = svm_models_dir + get_model_name(pair_ranker_args)
    if not os.path.exists(pair_ranker):
        create_pair_ranker(pair_ranker, pair_ranker_args, aggregated_data_dir,
                           seo_qrels_file, coherency_qrels_file, unranked_features_file,
                           svm_rank_scripts_dir)

    if 'top_ranker_args' in kwargs:
        top_ranker_args = kwargs.pop('top_ranker_args')
        top_ranker = svm_models_dir + get_model_name(top_ranker_args)
        if not os.path.exists(top_ranker):
            create_pair_ranker(top_ranker, top_ranker_args, aggregated_data_dir,
                               seo_qrels_file, coherency_qrels_file, unranked_features_file,
                               svm_rank_scripts_dir)
    else:
        top_ranker = pair_ranker

    # load word2vec model
    if 'word2vec_dump' in kwargs:
        word2vec_dump = kwargs.pop('word2vec_dump')
        word_embedding_model = pickle.load(open(word2vec_dump, 'rb'))
        logger.info('Loaded word Embedding Model from pickle')
    else:
        word_embedding_model = load_word_embedding_model(embedding_model_file)
        logger.info('Loaded word Embedding Model from file')

    alternation_classifier = pickle.load(open(alternation_classifier_pickle, 'rb'))

    if mode == '2of2':
        trectext_file = trectext_file_raifer
        assert len(bots) == 2
        replacements_file = output_dir + 'replacements/replacements_{}_{}'.format(qid, ','.join(bots))
        similarity_file = output_dir + 'similarity_results/similarity_{}_{}.txt'.format(qid, ','.join(bots))
        for file in [replacements_file, similarity_file]:
            if os.path.exists(file):
                os.remove(file)

        run_2_bot_competition(qid, bots, trectext_file, trec_file, output_dir, clueweb_index,
                              competition_index, document_workingset_file, doc_tfidf_dir, reranking_dir, trec_dir,
                              trectext_dir, raw_ds_dir, predictions_dir, final_features_dir, swig_path,
                              indri_path, replacements_file, similarity_file, svm_rank_scripts_dir,
                              10, scripts_dir, stopwords_file, queries_text_file, queries_xml_file,
                              ranklib_jar, rank_model, pair_ranker, top_ranker, word_embedding_model)

    else:
        replacements_file = output_dir + 'replacements/replacements_' + '_'.join([qid, ','.join(bots)])
        if os.path.exists(replacements_file):
            os.remove(replacements_file)
        competitors = get_competitors(qid=qid, trec_file=(trec_file if mode == 'raifer' else positions_file))

        if mode == 'raifer':
            trectext_file = trectext_file_raifer
            kwargs = dict(trec_file=trec_file)
            rounds = 7
        elif mode == 'goren':
            trectext_file = trectext_file_paper
            kwargs = dict(positions_file=positions_file)
            rounds = 3
        else:
            raise ValueError('Illegal mode given')

        if not all([bot in competitors for bot in bots]):
            raise ValueError(f'Not all given bots are competitors in the query \n'
                             f'bots: {bots} \ncompetitors: {competitors}')

        run_general_competition(qid, competitors, bots, rounds, top_refinement, validation_method, trectext_file,
                                output_dir, document_workingset_file, indri_path, swig_path, doc_tfidf_dir,
                                reranking_dir, trec_dir, trectext_dir, raw_ds_dir, predictions_dir, final_features_dir,
                                clueweb_index, competition_index, replacements_file, svm_rank_scripts_dir, scripts_dir,
                                stopwords_file, queries_text_file, queries_xml_file, ranklib_jar, rank_model,
                                pair_ranker, top_ranker, word_embedding_model, alternation_classifier, rep_val_dir,
                                **kwargs)
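# Hedged usage sketch: a minimal 'raifer'-mode invocation with hypothetical query and
# player ids; it assumes the hard-coded data, index and model paths above are present
# and that 'some_validation_method' stands in for a real validation-method name.
def _example_competition_setup():
    competition_setup(mode='raifer',
                      qid='195',     # hypothetical query id
                      bots=['51'],   # hypothetical bot player id
                      top_refinement='vanilla',
                      validation_method='some_validation_method',  # hypothetical name
                      output_dir='output/example/',
                      mute=True)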
def record_replacement(replacements_file, epoch, in_doc_id, out_doc_id, out_index, in_index, features):
    ensure_dirs(replacements_file)
    with open(replacements_file, 'a') as f:
        items = [str(item) for item in [epoch, in_doc_id, out_doc_id, out_index, in_index, ','.join(features)]]
        f.write('\t'.join(items) + '\n')
def generate_document_tfidf_files(workingset_file, output_dir, new_index, base_index=clueweb_index):
    # Relies on the module-level swig_path, indri_utils_path and clueweb_index constants
    ensure_dirs(output_dir)
    command = f'java -Djava.library.path={swig_path} -cp {indri_utils_path} PrepareTFIDFVectorsWSDiff ' \
              f'{base_index} {new_index} {workingset_file} {output_dir}'
    run_and_print(command, command_name='Document tfidf Creation')
def generate_predictions(model_path, svm_rank_scripts_dir, predictions_dir, feature_file):
    predictions_file = predictions_dir + '_predictions'.join(splitext(basename(feature_file)))
    ensure_dirs(predictions_file)
    command = f'{svm_rank_scripts_dir}svm_rank_classify {feature_file} {model_path} {predictions_file}'
    run_and_print(command, 'pair classify')
    return predictions_file
def create_model(svm_rank_scripts_dir, model_path, learning_data, svm_rank_c=0.01):
    ensure_dirs(model_path)
    command = f'{svm_rank_scripts_dir}svm_rank_learn -c {svm_rank_c} {learning_data} {model_path}'
    run_and_print(command, 'pair ranker learn')
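# Hedged end-to-end sketch for the two SVMrank wrappers above: every path below is
# hypothetical, and it assumes the svm_rank_learn / svm_rank_classify binaries sit in
# the scripts directory exactly as the commands built above expect.
def _example_svm_rank_round_trip():
    model_path = 'rank_models/example_model'  # hypothetical model location
    create_model('scripts/', model_path, 'data/example_learning_data.dat', svm_rank_c=0.01)
    return generate_predictions(model_path, 'scripts/', 'output/example_predictions/',
                                'data/example_features.dat')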