Exemplo n.º 1
0
def validate_answers_with_corpus(corpus, qa_pairs, output_directory):
    """
    Verify that all the answers in the Q&A pairs are present in the corpus.

    If they are all present in the corpus, this does nothing. If any are missing, it creates two new files:
    answers.in-corpus.csv and answers.not-in-corpus.csv.

    :param corpus: corpus downloaded from xmgr
    :type corpus: pandas.DataFrame
    :param qa_pairs: Q&A pairs extracted from the usage logs
    :type qa_pairs: pandas.DataFrame
    :param output_directory: directory in which to create files
    :type output_directory: str
    """
    missing_answers = ~qa_pairs[ANSWER].isin(corpus[ANSWER])
    # Series.any() is the idiomatic pandas membership test for a boolean mask.
    if missing_answers.any():
        ensure_directory_exists(output_directory)
        missing_answer_qa_pairs = qa_pairs[missing_answers]
        # Count unique answers so the percentage is not skewed by repeated Q&A pairs.
        n = len(qa_pairs.drop_duplicates(ANSWER))
        m = len(missing_answer_qa_pairs.drop_duplicates(ANSWER))
        print("%d usage log answers of %d (%0.3f%%) not in the corpus" % (m, n, 100.0 * m / n))
        answers_in_corpus_csv = os.path.join(output_directory, "answers.in-corpus.csv")
        answers_not_in_corpus_csv = os.path.join(output_directory, "answers.not-in-corpus.csv")
        print("Writing Q&A pairs to %s and %s" % (answers_not_in_corpus_csv, answers_in_corpus_csv))
        to_csv(answers_in_corpus_csv, QAPairFileType.output_format(qa_pairs[~missing_answers]))
        to_csv(answers_not_in_corpus_csv, QAPairFileType.output_format(missing_answer_qa_pairs))
    else:
        # Fixed message: previously read "answers are the corpus" (missing "in").
        print("All usage log answers are in the corpus.")
Exemplo n.º 2
0
def validate_truth_with_corpus(corpus, truth, output_directory):
    """
    Verify that all the answer IDs in the truth appear in the corpus.

    When every truth answer is present this only prints a confirmation.
    Otherwise the truth is partitioned into two CSV files in the output
    directory: truth.in-corpus.csv and truth.not-in-corpus.csv.

    :param corpus: corpus downloaded from xmgr
    :type corpus: pandas.DataFrame
    :param truth: truth downloaded from xmgr
    :type truth: pandas.DataFrame
    :param output_directory: directory in which to create files
    :type output_directory: str
    """
    missing = missing_truth_in_corpus(corpus, truth)
    if not any(missing):
        print("All truth answer ids are in the corpus.")
        return
    ensure_directory_exists(output_directory)
    absent_truth = truth[missing]
    total = len(truth)
    absent = len(absent_truth)
    print("%d truth answers of %d (%0.3f%%) not in the corpus" % (absent, total, 100.0 * absent / total))
    present_csv = os.path.join(output_directory, "truth.in-corpus.csv")
    absent_csv = os.path.join(output_directory, "truth.not-in-corpus.csv")
    print("Writing truth to %s and %s" % (present_csv, absent_csv))
    to_csv(present_csv, TruthFileType.output_format(truth[~missing]))
    to_csv(absent_csv, TruthFileType.output_format(absent_truth))
Exemplo n.º 3
0
def validate_answers_with_corpus(corpus, qa_pairs, output_directory):
    """
    Verify that all the answers in the Q&A pairs are present in the corpus.

    If they are all present in the corpus, this does nothing. If any are missing, it creates two new files:
    answers.in-corpus.csv and answers.not-in-corpus.csv.

    :param corpus: corpus downloaded from xmgr
    :type corpus: pandas.DataFrame
    :param qa_pairs: Q&A pairs extracted from the usage logs
    :type qa_pairs: pandas.DataFrame
    :param output_directory: directory in which to create files
    :type output_directory: str
    """
    missing_answers = ~qa_pairs[ANSWER].isin(corpus[ANSWER])
    # Series.any() is the idiomatic pandas membership test for a boolean mask.
    if missing_answers.any():
        ensure_directory_exists(output_directory)
        missing_answer_qa_pairs = qa_pairs[missing_answers]
        # Count unique answers so the percentage is not skewed by repeated Q&A pairs.
        n = len(qa_pairs.drop_duplicates(ANSWER))
        m = len(missing_answer_qa_pairs.drop_duplicates(ANSWER))
        print("%d usage log answers of %d (%0.3f%%) not in the corpus" %
              (m, n, 100.0 * m / n))
        answers_in_corpus_csv = os.path.join(output_directory,
                                             "answers.in-corpus.csv")
        answers_not_in_corpus_csv = os.path.join(output_directory,
                                                 "answers.not-in-corpus.csv")
        print("Writing Q&A pairs to %s and %s" %
              (answers_not_in_corpus_csv, answers_in_corpus_csv))
        to_csv(answers_in_corpus_csv,
               QAPairFileType.output_format(qa_pairs[~missing_answers]))
        to_csv(answers_not_in_corpus_csv,
               QAPairFileType.output_format(missing_answer_qa_pairs))
    else:
        # Fixed message: previously read "answers are the corpus" (missing "in").
        print("All usage log answers are in the corpus.")
Exemplo n.º 4
0
def validate_truth_with_corpus(corpus, truth, output_directory):
    """
    Verify that all the answer IDs in the truth appear in the corpus.

    When every truth answer is present this only prints a confirmation.
    Otherwise the truth is partitioned into two CSV files in the output
    directory: truth.in-corpus.csv and truth.not-in-corpus.csv.

    :param corpus: corpus downloaded from xmgr
    :type corpus: pandas.DataFrame
    :param truth: truth downloaded from xmgr
    :type truth: pandas.DataFrame
    :param output_directory: directory in which to create files
    :type output_directory: str
    """
    missing = missing_truth_in_corpus(corpus, truth)
    if not any(missing):
        print("All truth answer ids are in the corpus.")
        return
    ensure_directory_exists(output_directory)
    absent_truth = truth[missing]
    total = len(truth)
    absent = len(absent_truth)
    print("%d truth answers of %d (%0.3f%%) not in the corpus" %
          (absent, total, 100.0 * absent / total))
    present_csv = os.path.join(output_directory, "truth.in-corpus.csv")
    absent_csv = os.path.join(output_directory, "truth.not-in-corpus.csv")
    print("Writing truth to %s and %s" % (present_csv, absent_csv))
    to_csv(present_csv, TruthFileType.output_format(truth[~missing]))
    to_csv(absent_csv, TruthFileType.output_format(absent_truth))
Exemplo n.º 5
0
def download_truth_from_xmgr(xmgr, output_directory):
    """
    Download truth from an XMGR project.

    Truth is a mapping of sets of questions to answer documents. Truth is used to train the WEA model and may be used
    to train an NLC model.

    This function creates two files in the output directory: a raw truth.json that contains all the information
    downloaded from XMGR and a filtered truth.csv file.

    :param xmgr: connection to an XMGR project REST API
    :type xmgr: XmgrProject
    :param output_directory: directory in which to create truth.json and truth.csv
    :type output_directory: str
    """
    ensure_directory_exists(output_directory)
    truth_json = os.path.join(output_directory, "truth.json")
    truth_csv = os.path.join(output_directory, "truth.csv")
    # Both files already exist: nothing to do.
    if os.path.isfile(truth_json) and os.path.isfile(truth_csv):
        logger.info("Truth already downloaded")
        return
    if not os.path.isfile(truth_json):
        logger.info("Get questions from %s" % xmgr)
        # Idiomatic inequality: was `not question["state"] == "REJECTED"`.
        mapped_questions = [
            question for question in xmgr.get_questions()
            if question["state"] != "REJECTED"
        ]
        with open(truth_json, "w") as f:
            json.dump(mapped_questions, f, indent=2)
    else:
        # Reuse the raw download from a previous (partial) run.
        with open(truth_json) as f:
            mapped_questions = json.load(f)
    logger.info("Build truth from questions")
    truth = get_truth_from_mapped_questions(mapped_questions)
    to_csv(truth_csv, TruthFileType.output_format(truth))
Exemplo n.º 6
0
def download_truth_from_xmgr(xmgr, output_directory):
    """
    Download truth from an XMGR project.

    Truth is a mapping of sets of questions to answer documents. Truth is used to train the WEA model and may be used
    to train an NLC model.

    This function creates two files in the output directory: a raw truth.json that contains all the information
    downloaded from XMGR and a filtered truth.csv file.

    :param xmgr: connection to an XMGR project REST API
    :type xmgr: XmgrProject
    :param output_directory: directory in which to create truth.json and truth.csv
    :type output_directory: str
    """
    ensure_directory_exists(output_directory)
    truth_json = os.path.join(output_directory, "truth.json")
    truth_csv = os.path.join(output_directory, "truth.csv")
    # Both files already exist: nothing to do.
    if os.path.isfile(truth_json) and os.path.isfile(truth_csv):
        logger.info("Truth already downloaded")
        return
    if not os.path.isfile(truth_json):
        logger.info("Get questions from %s" % xmgr)
        # Idiomatic inequality: was `not question["state"] == "REJECTED"`.
        mapped_questions = [question for question in xmgr.get_questions() if question["state"] != "REJECTED"]
        with open(truth_json, "w") as f:
            json.dump(mapped_questions, f, indent=2)
    else:
        # Reuse the raw download from a previous (partial) run.
        with open(truth_json) as f:
            mapped_questions = json.load(f)
    logger.info("Build truth from questions")
    truth = get_truth_from_mapped_questions(mapped_questions)
    to_csv(truth_csv, TruthFileType.output_format(truth))
Exemplo n.º 7
0
def plot_handler(args):
    """
    Generate curve data for the requested plot type, write each curve to a
    CSV file in the output directory, and optionally draw the plot.
    """
    curves = generate_curves(args.type, args.collated)
    ensure_directory_exists(args.output)
    # Persist each curve as <type>.<label>.csv in the output directory.
    for label, curve in curves.items():
        to_csv(os.path.join(args.output, "%s.%s.csv" % (args.type, label)), curve)
    # Drawing is opt-in via the --draw flag.
    if args.draw:
        plot_curves(curves, args.type)
Exemplo n.º 8
0
def download_corpus_from_xmgr(xmgr, output_directory, checkpoint_frequency, max_docs):
    """
    Download the corpus from an XMGR project

    A corpus is a mapping of answer text to answer Ids. It also contains answer titles and the names of the documents
    from which the answers were extracted.

    This can take a long time to complete, so intermediate results are saved in the directory. If you restart an
    incomplete download it will pick up where it left off.

    :param xmgr: connection to an XMGR project REST API
    :type xmgr: XmgrProject
    :param output_directory: directory into which write the corpus.csv file
    :type output_directory: str
    :param checkpoint_frequency: how often to write intermediate results to a checkpoint file
    :type checkpoint_frequency: int
    :param max_docs: maximum number of corpus documents to download, if None, download them all
    :type max_docs: int
    """
    document_ids_csv = os.path.join(output_directory, "document_ids.csv")
    corpus_csv = os.path.join(output_directory, "corpus.csv")
    # document_ids.csv is deleted at the end of a successful run, so its absence
    # alongside an existing corpus.csv means the download already completed.
    if os.path.isfile(corpus_csv) and not os.path.isfile(document_ids_csv):
        logger.info("Corpus already downloaded")
        return
    logger.info("Download corpus from %s" % xmgr)
    # Deduplicate and sort ids so restarted runs see a stable ordering.
    document_ids = sorted(set(document["id"] for document in xmgr.get_documents()))
    # NOTE(review): slicing with max_docs=None yields the full list, matching the docstring.
    document_ids = document_ids[:max_docs]
    n = len(document_ids)
    # Checkpoint of which documents have been processed (and their PAU counts).
    downloaded_document_ids = DataFrameCheckpoint(document_ids_csv, [DOCUMENT_ID, "Paus"], checkpoint_frequency)
    corpus = DataFrameCheckpoint(corpus_csv, CorpusFileType.columns)
    try:
        if downloaded_document_ids.recovered:
            logger.info("Recovered %d documents from previous run" % len(downloaded_document_ids.recovered))
        # Only fetch documents not already covered by a previous run.
        document_ids = sorted(set(document_ids) - downloaded_document_ids.recovered)
        m = len(document_ids)
        start = len(downloaded_document_ids.recovered) + 1
        if m:
            for i, document_id in enumerate(document_ids, start):
                # Flush and log progress periodically, plus at the first and last document.
                if i % checkpoint_frequency == 0 or i == start or i == m:
                    corpus.flush()
                    logger.info(percent_complete_message("Get PAUs from document", i, n))
                paus = xmgr.get_paus_from_document(document_id)
                # The document id and number of PAUs are both integers. Cast them to strings, otherwise pandas will
                # write them as floats.
                for pau in paus:
                    corpus.write(pau["id"], pau["responseMarkup"], pau["title"], pau["sourceName"], str(document_id))
                downloaded_document_ids.write(str(document_id), str(len(paus)))
    finally:
        # Always close the checkpoints so partial progress survives a crash.
        downloaded_document_ids.close()
        corpus.close()
    # Rewrite the corpus with duplicate answer ids removed and columns normalized.
    corpus = from_csv(corpus_csv).drop_duplicates(ANSWER_ID)
    to_csv(corpus_csv, CorpusFileType.output_format(corpus))
    docs = len(from_csv(document_ids_csv))
    # Removing the checkpoint marks the download as complete (see guard above).
    os.remove(document_ids_csv)
    logger.info("%d documents and %d PAUs in corpus" % (docs, len(corpus)))
Exemplo n.º 9
0
def train_nlc(url, username, password, truth, name):
    """
    Train a Natural Language Classifier model from truth data.

    :param url: URL of the NLC service instance
    :param username: NLC service username
    :param password: NLC service password
    :param truth: truth with question and answer-id columns
    :type truth: pandas.DataFrame
    :param name: name to give the trained classifier
    :return: the new classifier's id
    """
    logger.info("Train model %s with %d instances" % (name, len(truth)))
    # Work on a copy so the newline substitution below does not mutate the
    # caller's DataFrame in place.
    truth = truth.copy()
    with tempfile.TemporaryFile() as training_file:
        # NLC cannot handle newlines.
        truth[QUESTION] = truth[QUESTION].str.replace("\n", " ")
        to_csv(training_file, truth[[QUESTION, ANSWER_ID]], header=False, index=False)
        training_file.seek(0)
        nlc = NaturalLanguageClassifier(url=url, username=username, password=password)
        r = nlc.create(training_data=training_file, name=name)
        logger.info(pretty_print_json(r))
    return r["classifier_id"]
Exemplo n.º 10
0
def train_nlc(url, username, password, truth, name):
    """
    Train a Natural Language Classifier model from truth data.

    :param url: URL of the NLC service instance
    :param username: NLC service username
    :param password: NLC service password
    :param truth: truth with question and answer-id columns
    :type truth: pandas.DataFrame
    :param name: name to give the trained classifier
    :return: the new classifier's id
    """
    logger.info("Train model %s with %d instances" % (name, len(truth)))
    # Work on a copy so the newline substitution below does not mutate the
    # caller's DataFrame in place.
    truth = truth.copy()
    with tempfile.TemporaryFile() as training_file:
        # NLC cannot handle newlines.
        truth[QUESTION] = truth[QUESTION].str.replace("\n", " ")
        to_csv(training_file, truth[[QUESTION, ANSWER_ID]], header=False, index=False)
        training_file.seek(0)
        nlc = NaturalLanguageClassifier(url=url, username=username, password=password)
        r = nlc.create(training_data=training_file, name=name)
        logger.info(pretty_print_json(r))
    return r["classifier_id"]
Exemplo n.º 11
0
def nlc_router_train(url, username, password, oracle_out, path, all_correct):
    """
    NLC Training on the oracle experiment output to determine which system(NLC or Solr) should
    answer particular question.

    1. Splitting up the oracle experiment output data into 8 equal training records and testing records. This is to
    ensure 8-fold cross validation of the data-set. All training and Testing files will be stored
    at the "path"

     2. Perform NLC training on the all 8 training set simultaneously and returns list of classifier
     ids as json file in the working directory

    :param url: URL of NLC instance
    :param username: NLC Username
    :param password: NLC password
    :param oracle_out: file created by oracle experiment
    :param path: directory path to save intermediate results
    :param all_correct: optional boolean parameter to train with only correct QA pairs
    :return: list of classifier ids by NLC training
    """
    ensure_directory_exists(path)

    sys_name = oracle_out[SYSTEM][0]
    # NLC cannot handle newlines in question text.
    oracle_out[QUESTION] = oracle_out[QUESTION].str.replace("\n", " ")
    kfold_split(oracle_out, path, NLC_ROUTER_FOLDS, True)
    classifier_list = []
    # Renamed from `list`, which shadowed the builtin.
    classifier_catalog = []

    for fold in range(NLC_ROUTER_FOLDS):
        train = pandas.read_csv(os.path.join(path, "Train{0}.csv".format(str(fold))))
        if all_correct:
            logger.info("Training only on CORRECT examples.")
            # Ignore records from training which are not correct or out of purview.
            train = train[train[CORRECT]]
            train = train[train[IN_PURVIEW]]
        train = train[[QUESTION, ANSWERING_SYSTEM]]
        logger.info("Training set size = {0}".format(str(len(train))))
        with tempfile.TemporaryFile() as training_file:
            to_csv(training_file, train[[QUESTION, ANSWERING_SYSTEM]], header=False, index=False)
            training_file.seek(0)
            nlc = NaturalLanguageClassifier(url=url, username=username, password=password)
            classifier_id = nlc.create(training_data=training_file,
                                       name="{0}_fold_{1}".format(str(sys_name), str(fold)))
            # Keep plain strings: the Python 2-era .encode("utf-8") calls produced
            # bytes, which json.dump cannot serialize under Python 3.
            classifier_list.append(classifier_id["classifier_id"])
            classifier_catalog.append({classifier_id["name"]: classifier_id["classifier_id"]})
            logger.info(pretty_print_json(classifier_id))
            pretty_print_json(classifier_id)

    # json.dump writes text, so the file must be opened in text mode ("w"), not "wb".
    with open(os.path.join(path, 'classifier.json'), 'w') as f:
        json.dump(classifier_catalog, f)
    return classifier_list
Exemplo n.º 12
0
def analyze_truth_handler(parser, args):
    """
    Print summary statistics for the truth and optionally write a histogram of
    questions per answer, annotated with answer text when a corpus is supplied.
    """
    # A corpus without a histogram file is a usage error.
    if args.histogram is None and args.corpus is not None:
        parser.print_usage()
        parser.error("The corpus is only used when drawing a histogram.")
    pairs, questions, answers, question_histogram = truth_statistics(args.truth)
    print("%d training pairs, %d unique questions, %d unique answers, average %0.3f questions per answer" %
          (pairs, questions, answers, questions / float(answers)))
    if args.histogram:
        if args.corpus is not None:
            # Join answer text onto the histogram by answer id.
            answer_text = args.corpus[[ANSWER_ID, ANSWER]].set_index(ANSWER_ID)
            question_histogram = pandas.merge(question_histogram, answer_text,
                                              left_index=True, right_index=True)
            question_histogram = question_histogram[[ANSWER, QUESTION]]
        ranked = question_histogram.sort_values(QUESTION, ascending=False)
        to_csv(args.histogram, ranked.rename(columns={QUESTION: "Questions"}))
Exemplo n.º 13
0
def in_purview_disagreement_evaluate(systems_data, output_file):
    """
    Interactively re-judge questions whose in-purview annotations disagree
    across systems, updating the purview and correctness columns in place.

    :param systems_data: collated answers from multiple systems
    :type systems_data: pandas.DataFrame
    :param output_file: CSV file to which intermediate results are written
    :return: the updated systems data
    :rtype: pandas.DataFrame
    """
    purview_disagreement = in_purview_disagreement(systems_data)
    questions_to_judge = purview_disagreement[QUESTION].unique()
    for question in questions_to_judge:

        purview_judgment = _get_in_purview_judgment(question)

        current_question_rows = systems_data[systems_data[QUESTION] == question]
        for row in current_question_rows.iterrows():
            index = row[0]
            original_judgment = row[1][IN_PURVIEW]

            if purview_judgment != original_judgment:
                # .loc replaces the .ix indexer, which was deprecated in pandas
                # 0.20 and removed in 1.0.
                if purview_judgment:
                    systems_data.loc[index, IN_PURVIEW] = True
                    systems_data.loc[index, CORRECT] = _judge_answer(row)
                else:
                    # Out-of-purview answers cannot be correct.
                    systems_data.loc[index, IN_PURVIEW] = False
                    systems_data.loc[index, CORRECT] = False
            # Checkpoint after every row so manual judgments survive an
            # interrupted interactive session.
            to_csv(output_file, systems_data, index=False)
    return systems_data
Exemplo n.º 14
0
def download_corpus_from_xmgr(xmgr, output_directory, checkpoint_frequency,
                              max_docs):
    """
    Download the corpus from an XMGR project

    A corpus is a mapping of answer text to answer Ids. It also contains answer titles and the names of the documents
    from which the answers were extracted.

    This can take a long time to complete, so intermediate results are saved in the directory. If you restart an
    incomplete download it will pick up where it left off.

    :param xmgr: connection to an XMGR project REST API
    :type xmgr: XmgrProject
    :param output_directory: directory into which write the corpus.csv file
    :type output_directory: str
    :param checkpoint_frequency: how often to write intermediate results to a checkpoint file
    :type checkpoint_frequency: int
    :param max_docs: maximum number of corpus documents to download, if None, download them all
    :type max_docs: int
    """
    document_ids_csv = os.path.join(output_directory, "document_ids.csv")
    corpus_csv = os.path.join(output_directory, "corpus.csv")
    # document_ids.csv is deleted at the end of a successful run, so its absence
    # alongside an existing corpus.csv means the download already completed.
    if os.path.isfile(corpus_csv) and not os.path.isfile(document_ids_csv):
        logger.info("Corpus already downloaded")
        return
    logger.info("Download corpus from %s" % xmgr)
    # Deduplicate and sort ids so restarted runs see a stable ordering.
    document_ids = sorted(
        set(document["id"] for document in xmgr.get_documents()))
    # NOTE(review): slicing with max_docs=None yields the full list, matching the docstring.
    document_ids = document_ids[:max_docs]
    n = len(document_ids)
    # Checkpoint of which documents have been processed (and their PAU counts).
    downloaded_document_ids = DataFrameCheckpoint(document_ids_csv,
                                                  [DOCUMENT_ID, "Paus"],
                                                  checkpoint_frequency)
    corpus = DataFrameCheckpoint(corpus_csv, CorpusFileType.columns)
    try:
        if downloaded_document_ids.recovered:
            logger.info("Recovered %d documents from previous run" %
                        len(downloaded_document_ids.recovered))
        # Only fetch documents not already covered by a previous run.
        document_ids = sorted(
            set(document_ids) - downloaded_document_ids.recovered)
        m = len(document_ids)
        start = len(downloaded_document_ids.recovered) + 1
        if m:
            for i, document_id in enumerate(document_ids, start):
                # Flush and log progress periodically, plus at the first and last document.
                if i % checkpoint_frequency == 0 or i == start or i == m:
                    corpus.flush()
                    logger.info(
                        percent_complete_message("Get PAUs from document", i,
                                                 n))
                paus = xmgr.get_paus_from_document(document_id)
                # The document id and number of PAUs are both integers. Cast them to strings, otherwise pandas will
                # write them as floats.
                for pau in paus:
                    corpus.write(pau["id"], pau["responseMarkup"],
                                 pau["title"], pau["sourceName"],
                                 str(document_id))
                downloaded_document_ids.write(str(document_id), str(len(paus)))
    finally:
        # Always close the checkpoints so partial progress survives a crash.
        downloaded_document_ids.close()
        corpus.close()
    # Rewrite the corpus with duplicate answer ids removed and columns normalized.
    corpus = from_csv(corpus_csv).drop_duplicates(ANSWER_ID)
    to_csv(corpus_csv, CorpusFileType.output_format(corpus))
    docs = len(from_csv(document_ids_csv))
    # Removing the checkpoint marks the download as complete (see guard above).
    os.remove(document_ids_csv)
    logger.info("%d documents and %d PAUs in corpus" % (docs, len(corpus)))
Exemplo n.º 15
0
def trec_handler(args):
    """
    Build a corpus from a directory of TREC documents, write it to corpus.csv,
    and clean up the temporary checkpoint file.
    """
    checkpoint = os.path.join(args.output_directory, "corpus.trec.temp.csv")
    corpus = corpus_from_trec(checkpoint, args.directory, args.checkpoint_frequency, args.max_docs)
    to_csv(os.path.join(args.output_directory, "corpus.csv"), CorpusFileType.output_format(corpus))
    document_count = len(corpus[DOCUMENT_ID].drop_duplicates())
    logger.info("%d documents and %d PAUs in corpus" % (document_count, len(corpus)))
    # The checkpoint is only needed to resume an interrupted build.
    os.remove(checkpoint)
Exemplo n.º 16
0
def wea_handler(args):
    """Extract WEA answers from the usage log and write them to the output CSV file."""
    answers = get_answers_from_usage_log(args.questions, args.qa_pairs)
    to_csv(args.output, answers)
Exemplo n.º 17
0
def analyze_corpus_handler(args):
    """Print corpus summary statistics and optionally write a token-count histogram CSV."""
    answers, tokens, histogram = corpus_statistics(args.corpus)
    print("%d answers, %d tokens, average %0.3f tokens per answer" % (answers, tokens, tokens / float(answers)))
    if args.histogram:
        counts = pandas.DataFrame(list(histogram.items()), columns=("Tokens", "Count"))
        to_csv(args.histogram, counts.set_index("Tokens").sort_index())