Example #1
def interpret_annotation_assist(annotation_assist, judgment_threshold):
    """
    Convert the file produced by the Annotation Assist tool into a set of judgments that can be used by Themis.

    Convert the in purview column from an integer value to a boolean. Convert the annotation score column to a boolean
    correct column by applying a threshold. An answer can only be correct if the question is in purview. Drop any Q&A
    pairs that have multiple annotations.

    :param annotation_assist: Annotation Assist judgments
    :type annotation_assist: pandas.DataFrame
    :param judgment_threshold: threshold above which an answer is deemed correct
    :type judgment_threshold: float
    :return: Annotation Assist judgments with a boolean Correct column
    :rtype: pandas.DataFrame
    """
    qa_duplicates = annotation_assist[[QUESTION, ANSWER]].duplicated()
    if any(qa_duplicates):
        n = sum(qa_duplicates)
        logger.warning(
            "Dropping %d Q&A pairs with multiple annotations (%0.3f%%)" %
            (n, 100.0 * n / len(annotation_assist)))
        annotation_assist.drop_duplicates((QUESTION, ANSWER),
                                          keep=False,
                                          inplace=True)
    annotation_assist[IN_PURVIEW] = annotation_assist[IN_PURVIEW].astype(
        "bool")
    annotation_assist[CORRECT] = \
        annotation_assist[IN_PURVIEW] & (annotation_assist[ANNOTATION_SCORE] >= judgment_threshold)
    logger.info("Processed %d judgments" % len(annotation_assist))
    return annotation_assist.drop(ANNOTATION_SCORE, axis="columns")
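The QUESTION, ANSWER, IN_PURVIEW, CORRECT and ANNOTATION_SCORE names above appear to be module-level column constants in themis. Below is a minimal, self-contained sketch of the same thresholding logic, assuming plain string values for those constants and an invented toy DataFrame:

import pandas

# Assumed stand-ins for the themis column constants (names are illustrative only).
QUESTION, ANSWER = "Question", "Answer"
IN_PURVIEW, ANNOTATION_SCORE, CORRECT = "In Purview", "Annotation Score", "Correct"

annotation_assist = pandas.DataFrame({
    QUESTION: ["how do I reset my password?", "what is the airspeed of a swallow?"],
    ANSWER: ["Use the reset link on the login page.", "An African or a European swallow?"],
    IN_PURVIEW: [1, 0],              # integer flags, as produced by Annotation Assist
    ANNOTATION_SCORE: [75.0, 90.0],
})

judgment_threshold = 50.0
annotation_assist[IN_PURVIEW] = annotation_assist[IN_PURVIEW].astype("bool")
# An answer can only be correct if its question is in purview.
annotation_assist[CORRECT] = (annotation_assist[IN_PURVIEW]
                              & (annotation_assist[ANNOTATION_SCORE] >= judgment_threshold))
print(annotation_assist.drop(ANNOTATION_SCORE, axis="columns"))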
Example #2
def interpret_annotation_assist(annotation_assist, judgment_threshold):
    """
    Convert the file produced by the Annotation Assist tool into a set of judgments that can be used by Themis.

    Convert the in purview column from an integer value to a boolean. Convert the annotation score column to a boolean
    correct column by applying a threshold. An answer can only be correct if the question is in purview. Drop any Q&A
    pairs that have multiple annotations.

    :param annotation_assist: Annotation Assist judgments
    :type annotation_assist: pandas.DataFrame
    :param judgment_threshold: threshold above which an answer is deemed correct
    :type judgment_threshold: float
    :return: Annotation Assist judgments with a boolean Correct column
    :rtype: pandas.DataFrame
    """
    qa_duplicates = annotation_assist[[QUESTION, ANSWER]].duplicated()
    if any(qa_duplicates):
        n = sum(qa_duplicates)
        logger.warning(
            "Dropping %d Q&A pairs with multiple annotations (%0.3f%%)" % (n, 100.0 * n / len(annotation_assist)))
        annotation_assist.drop_duplicates((QUESTION, ANSWER), keep=False, inplace=True)
    annotation_assist[IN_PURVIEW] = annotation_assist[IN_PURVIEW].astype("bool")
    annotation_assist[CORRECT] = \
        annotation_assist[IN_PURVIEW] & (annotation_assist[ANNOTATION_SCORE] >= judgment_threshold)
    logger.info("Processed %d judgments" % len(annotation_assist))
    return annotation_assist.drop(ANNOTATION_SCORE, axis="columns")
Example #3
def drop_missing(systems_data):
    if systems_data.isnull().values.any():
        n = len(systems_data)
        systems_data = systems_data.dropna()
        m = n - len(systems_data)
        logger.warning("Dropping %d of %d question/answer pairs missing information (%0.3f%%)" % (m, n, 100.0 * m / n))
    return systems_data
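A rough, self-contained illustration of the dropna-and-warn pattern above, with invented column names and data:

import pandas

systems_data = pandas.DataFrame({"Question": ["q1", "q2", "q3"],
                                 "Answer": ["a1", None, "a3"]})
n = len(systems_data)
systems_data = systems_data.dropna()
m = n - len(systems_data)
print("Dropping %d of %d question/answer pairs missing information (%0.3f%%)" % (m, n, 100.0 * m / n))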
Example #4
def judge_sample_handler(args):
    questions = pandas.concat(args.judgments)[[QUESTION]].drop_duplicates()
    sample = pandas.merge(questions, args.frequency, on=QUESTION, how="left")
    n = len(sample)
    logger.info("%d judged questions" % n)
    m = sum(sample[FREQUENCY].isnull())
    if m:
        logger.warning("Missing frequencies for %d questions (%0.3f%%)" % (m, 100.0 * m / n))
    print_csv(QuestionFrequencyFileType.output_format(sample))
Example #5
def __call__(self, filename):
    collated = super(self.__class__, self).__call__(filename)
    m = sum(collated[collated[IN_PURVIEW] == False][CORRECT])
    if m:
        n = len(collated)
        logger.warning(
            "%d out of %d question/answer pairs in %s are marked as out of purview but correct (%0.3f%%)"
            % (m, n, filename, 100.0 * m / n))
    return collated
Example #6
def precision(judgments, t):
    s = judgments[judgments[CONFIDENCE] >= t]
    correct = sum(s[s[CORRECT]][FREQUENCY])
    in_purview = sum(s[s[IN_PURVIEW]][FREQUENCY])
    try:
        return correct / float(in_purview)
    except ZeroDivisionError:
        logger.warning("No in-purview questions at threshold level %0.3f" % t)
        return None
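A worked example of the frequency-weighted precision above, with invented column constants and numbers; each row's contribution is weighted by how often the question was asked:

import pandas

CONFIDENCE, IN_PURVIEW, CORRECT, FREQUENCY = "Confidence", "In Purview", "Correct", "Frequency"

judgments = pandas.DataFrame({
    CONFIDENCE: [0.9, 0.7, 0.4],
    IN_PURVIEW: [True, True, True],
    CORRECT: [True, False, True],
    FREQUENCY: [10, 5, 1],
})

t = 0.5
s = judgments[judgments[CONFIDENCE] >= t]      # rows attempted at this threshold
correct = sum(s[s[CORRECT]][FREQUENCY])        # 10
in_purview = sum(s[s[IN_PURVIEW]][FREQUENCY])  # 15
print(correct / float(in_purview))             # 0.666...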
Example #7
def drop_missing(systems_data):
    if systems_data.isnull().values.any():
        n = len(systems_data)
        systems_data = systems_data.dropna()
        m = n - len(systems_data)
        if m:
            logger.warning("Dropping %d of %d question/answer pairs missing information (%0.3f%%)" %
                           (m, n, 100.0 * m / n))
    return systems_data
Example #8
def precision(judgments, t):
    s = judgments[judgments[CONFIDENCE] >= t]
    correct = sum(s[s[CORRECT]][FREQUENCY])
    in_purview = sum(s[s[IN_PURVIEW]][FREQUENCY])
    try:
        return correct / float(in_purview)
    except ZeroDivisionError:
        logger.warning("No in-purview questions at threshold level %0.3f" % t)
        return None
Example #9
def questions_attempted(judgments, t):
    s = judgments[judgments[CONFIDENCE] >= t]
    in_purview_attempted = sum(s[s[IN_PURVIEW]][FREQUENCY])
    total_in_purview = sum(judgments[judgments[IN_PURVIEW]][FREQUENCY])
    try:
        return in_purview_attempted / float(total_in_purview)
    except ZeroDivisionError:
        logger.warning("No in-purview questions attempted at threshold level %0.3f" % t)
        return None
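The same kind of toy data illustrates questions_attempted; note that the denominator sums in-purview frequency over all judgments, not just those above the threshold:

import pandas

CONFIDENCE, IN_PURVIEW, FREQUENCY = "Confidence", "In Purview", "Frequency"

judgments = pandas.DataFrame({
    CONFIDENCE: [0.9, 0.7, 0.4],
    IN_PURVIEW: [True, True, True],
    FREQUENCY: [10, 5, 1],
})

t = 0.5
s = judgments[judgments[CONFIDENCE] >= t]
in_purview_attempted = sum(s[s[IN_PURVIEW]][FREQUENCY])              # 15
total_in_purview = sum(judgments[judgments[IN_PURVIEW]][FREQUENCY])  # 16
print(in_purview_attempted / float(total_in_purview))                # 0.9375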
Example #10
def __call__(self, filename):
    collated = super(self.__class__, self).__call__(filename)
    m = sum(collated[collated[IN_PURVIEW] == False][CORRECT])
    if m:
        n = len(collated)
        logger.warning(
            "%d out of %d question/answer pairs in %s are marked as out of purview but correct (%0.3f%%)"
            % (m, n, filename, 100.0 * m / n))
    return collated
Example #11
def questions_attempted(judgments, t):
    s = judgments[judgments[CONFIDENCE] >= t]
    in_purview_attempted = sum(s[s[IN_PURVIEW]][FREQUENCY])
    total_in_purview = sum(judgments[judgments[IN_PURVIEW]][FREQUENCY])
    try:
        return in_purview_attempted / float(total_in_purview)
    except ZeroDivisionError:
        logger.warning(
            "No in-purview questions attempted at threshold level %0.3f" % t)
        return None
Example #12
File: trec.py Project: stefanvds/themis
def corpus_from_trec(checkpoint_filename, trec_directory, checkpoint_frequency, max_docs):
    trec_filenames = sorted(glob.glob(os.path.join(trec_directory, "*.xml")))[:max_docs]
    checkpoint = get_items("TREC files",
                           trec_filenames,
                           TrecFileCheckpoint(checkpoint_filename, checkpoint_frequency),
                           parse_trec_file,
                           checkpoint_frequency)
    if checkpoint.invalid:
        n = len(trec_filenames)
        logger.warning("%d of %d TREC files are invalid (%0.3f%%)" %
                       (checkpoint.invalid, n, 100.0 * checkpoint.invalid / n))
    # I'm not sure why I'm getting duplicates after a restart.
    return from_csv(checkpoint_filename).drop_duplicates().drop(TrecFileCheckpoint.TREC_FILENAME, axis="columns")
Example #13
def get_pau_mapping(question):
    if "predefinedAnswerUnit" in question:
        return question["predefinedAnswerUnit"]
    elif "mappedQuestion" in question:
        question_id = question["mappedQuestion"]["id"]
        try:
            mapped_question = questions[question_id]
        except KeyError:
            logger.warning("Question %s mapped to non-existent question %s" % (question["id"], question_id))
            return None
        return get_pau_mapping(mapped_question)
    else:
        return None
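A hypothetical sketch of the data this recursion expects: questions is assumed to be a dict of question records keyed by id, where a record either carries its own predefinedAnswerUnit or maps to another question that does.

questions = {
    "q1": {"id": "q1", "predefinedAnswerUnit": "pau-123"},
    "q2": {"id": "q2", "mappedQuestion": {"id": "q1"}},
    "q3": {"id": "q3", "mappedQuestion": {"id": "missing"}},
}

def get_pau_mapping(question):
    if "predefinedAnswerUnit" in question:
        return question["predefinedAnswerUnit"]
    elif "mappedQuestion" in question:
        try:
            mapped_question = questions[question["mappedQuestion"]["id"]]
        except KeyError:
            return None  # the code above logs a warning here
        return get_pau_mapping(mapped_question)
    return None

print(get_pau_mapping(questions["q2"]))  # "pau-123", resolved through q1
print(get_pau_mapping(questions["q3"]))  # None: the mapped question does not exist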
Example #14
def __call__(self, filename):
    if os.path.isfile(filename):
        collated = super(self.__class__, self).__call__(filename)
        m = sum(collated[collated[IN_PURVIEW] == False][CORRECT])
        if m:
            n = len(collated)
            logger.warning(
                "%d out of %d question/answer pairs in %s are marked as out of purview but correct (%0.3f%%)"
                % (m, n, filename, 100.0 * m / n))
        return collated
    else:
        logger.info("{0} does not exist".format(filename))
        return None
Example #15
File: xmgr.py Project: stefanvds/themis
def get_pau_mapping(question):
    if "predefinedAnswerUnit" in question:
        return question["predefinedAnswerUnit"]
    elif "mappedQuestion" in question:
        question_id = question["mappedQuestion"]["id"]
        try:
            mapped_question = questions[question_id]
        except KeyError:
            logger.warning(
                "Question %s mapped to non-existent question %s" %
                (question["id"], question_id))
            return None
        return get_pau_mapping(mapped_question)
    else:
        return None
Example #16
def corpus_statistics(corpus):
    """
    Generate statistics for the corpus.

    :param corpus: corpus generated by 'xmgr corpus' command
    :type corpus: pandas.DataFrame
    :return: answers in corpus, tokens in the corpus, histogram of answer length in tokens
    :rtype: (int, int, dict(int, int))
    """
    answers = len(corpus)
    token_frequency = FreqDist([len(word_tokenize(BeautifulSoup(answer, "lxml").text)) for answer in corpus[ANSWER]])
    histogram = {}
    for frequency, count in token_frequency.items():
        histogram[frequency] = histogram.get(frequency, 0) + count
    tokens = sum(length * count for length, count in token_frequency.items())
    n = sum(corpus.duplicated(ANSWER_ID))
    if n:
        logger.warning("%d duplicated answer IDs (%0.3f%%)" % (n, 100.0 * n / answers))
    return answers, tokens, histogram
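As an aside on the token count above, summing the FreqDist keys alone would add up the distinct answer lengths rather than the corpus size; weighting each length by its count gives the total. A small illustration with an invented corpus (a whitespace split stands in for word_tokenize so the sketch needs no NLTK tokenizer data):

from nltk import FreqDist

answers = ["reset your password here", "call support", "call support"]
token_frequency = FreqDist([len(a.split()) for a in answers])

histogram = dict(token_frequency)  # answer length in tokens -> number of answers
tokens = sum(length * count for length, count in token_frequency.items())
print(histogram)  # {4: 1, 2: 2}
print(tokens)     # 8 tokens in total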
Example #17
def corpus_statistics(corpus):
    """
    Generate statistics for the corpus.

    :param corpus: corpus generated by 'xmgr corpus' command
    :type corpus: pandas.DataFrame
    :return: answers in corpus, tokens in the corpus, histogram of answer length in tokens
    :rtype: (int, int, dict(int, int))
    """
    answers = len(corpus)
    token_frequency = FreqDist([len(word_tokenize(BeautifulSoup(answer, "lxml").text)) for answer in corpus[ANSWER]])
    histogram = {}
    for frequency, count in token_frequency.items():
        histogram[frequency] = histogram.get(frequency, 0) + count
    tokens = sum(length * count for length, count in token_frequency.items())
    n = sum(corpus.duplicated(ANSWER_ID))
    if n:
        logger.warning("%d duplicated answer IDs (%0.3f%%)" % (n, 100.0 * n / answers))
    return answers, tokens, histogram
Example #18
def in_purview_disagreement(systems_data):
    """
    Return collated data where in-purview judgments are not unanimous for a question.

    These questions' purview should be rejudged to make them consistent.

    :param systems_data: collated results for all systems
    :type systems_data: pandas.DataFrame
    :return: subset of collated data where the purview judgments are not unanimous for a question
    :rtype: pandas.DataFrame
    """
    question_groups = systems_data[[QUESTION, IN_PURVIEW]].groupby(QUESTION)
    index = question_groups.filter(lambda qg: len(qg[IN_PURVIEW].unique()) == 2).index
    purview_disagreement = systems_data.loc[index]
    m = len(purview_disagreement[QUESTION].drop_duplicates())
    if m:
        n = len(systems_data[QUESTION].drop_duplicates())
        logger.warning("%d out of %d questions have non-unanimous in-purview judgments (%0.3f%%)"
                       % (m, n, 100.0 * m / n))
    return purview_disagreement
Example #19
def in_purview_disagreement(systems_data):
    """
    Return collated data where in-purview judgments are not unanimous for a question.

    These questions' purview should be rejudged to make them consistent.

    :param systems_data: collated results for all systems
    :type systems_data: pandas.DataFrame
    :return: subset of collated data where the purview judgments are not unanimous for a question
    :rtype: pandas.DataFrame
    """
    question_groups = systems_data[[QUESTION, IN_PURVIEW]].groupby(QUESTION)
    index = question_groups.filter(lambda qg: len(qg[IN_PURVIEW].unique()) > 1).index
    purview_disagreement = systems_data.loc[index]
    m = len(purview_disagreement[QUESTION].drop_duplicates())
    if m:
        n = len(systems_data[QUESTION].drop_duplicates())
        logger.warning("%d out of %d questions have non-unanimous in-purview judgments (%0.3f%%)"
                       % (m, n, 100.0 * m / n))
    return purview_disagreement
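The two variants above differ only in the filter condition (len(...) == 2 versus > 1), which agree whenever the in-purview column holds clean booleans. A small demonstration with invented column names:

import pandas

QUESTION, IN_PURVIEW = "Question", "In Purview"

systems_data = pandas.DataFrame({
    QUESTION: ["q1", "q1", "q2", "q2"],
    IN_PURVIEW: [True, False, True, True],
})

question_groups = systems_data[[QUESTION, IN_PURVIEW]].groupby(QUESTION)
index = question_groups.filter(lambda qg: len(qg[IN_PURVIEW].unique()) > 1).index
print(systems_data.loc[index])  # only the two q1 rows, whose judgments disagree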
Example #20
def collate_handler(parser, args):
    labeled_qa_pairs = answer_labels(parser, args)
    judgments = pandas.concat(args.judgments)
    all_systems = []
    for label, qa_pairs in labeled_qa_pairs:
        # Only consider the questions listed in the frequency file.
        qa_pairs = qa_pairs[qa_pairs[QUESTION].isin(args.frequency[QUESTION])]
        collated = add_judgments_and_frequencies_to_qa_pairs(qa_pairs, judgments, args.frequency, args.remove_newlines)
        collated[SYSTEM] = label
        all_systems.append(collated)
    collated = pandas.concat(all_systems)
    logger.info("%d question/answer pairs" % len(collated))
    n = len(collated)
    for column, s in [(ANSWER, "answers"), (IN_PURVIEW, "in purview judgments"), (CORRECT, "correctness judgments")]:
        m = sum(collated[column].isnull())
        if m:
            logger.warning("%d question/answer pairs out of %d missing %s (%0.3f%%)" % (m, n, s, 100.0 * m / n))
    # This will print a warning if any in-purview judgments are not unanimous for a given question.
    in_purview_disagreement(collated)
    print_csv(CollatedFileType.output_format(collated))
Example #21
def get_answers_from_usage_log(questions, qa_pairs_from_logs):
    """
    Get answers returned by WEA to questions by looking them up in the usage log.

    Each question in the Q&A pairs must have a unique answer.

    :param questions: questions to look up in the usage logs
    :type questions: pandas.DataFrame
    :param qa_pairs_from_logs: question/answer pairs extracted from user logs
    :type qa_pairs_from_logs: pandas.DataFrame
    :return: Question, Answer, and Confidence
    :rtype: pandas.DataFrame
    """
    answers = pandas.merge(questions, qa_pairs_from_logs, on=QUESTION, how="left")
    missing_answers = answers[answers[ANSWER].isnull()]
    if len(missing_answers):
        logger.warning("%d questions without answers" % len(missing_answers))
    logger.info("Answered %d questions" % len(answers))
    answers = answers[[QUESTION, ANSWER, CONFIDENCE]].sort_values([QUESTION, CONFIDENCE], ascending=[True, False])
    return answers.set_index(QUESTION)
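A compact sketch of the left merge and sort above, again with invented column constants; the left join keeps every requested question and leaves the answer null where the usage log has no match:

import pandas

QUESTION, ANSWER, CONFIDENCE = "Question", "Answer", "Confidence"

questions = pandas.DataFrame({QUESTION: ["q1", "q2"]})
qa_pairs_from_logs = pandas.DataFrame({QUESTION: ["q1"], ANSWER: ["a1"], CONFIDENCE: [0.8]})

answers = pandas.merge(questions, qa_pairs_from_logs, on=QUESTION, how="left")
print(len(answers[answers[ANSWER].isnull()]))  # 1 question (q2) without an answer
answers = answers[[QUESTION, ANSWER, CONFIDENCE]].sort_values(
    [QUESTION, CONFIDENCE], ascending=[True, False])
print(answers.set_index(QUESTION))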
Example #22
def extract_question_answer_pairs_from_usage_logs(usage_log):
    """
    Extract questions and answers from usage logs, adding question frequency information.

    We are assuming here that a given question always elicits the same answer. Print a warning if this is not the case
    and drop answers to make the answers unique. It is arbitrary which answer is dropped.

    :param usage_log: QuestionsData.csv usage log
    :type usage_log: pandas.DataFrame
    :return: Q&A pairs with question frequency information
    :rtype: pandas.DataFrame
    """
    frequency = question_frequency(usage_log)
    qa_pairs = usage_log.drop_duplicates(subset=(QUESTION, ANSWER))
    m = sum(qa_pairs.duplicated(QUESTION))
    if m:
        n = len(frequency)
        logger.warning("%d questions of %d have multiple answers (%0.3f%%), only keeping one answer per question" %
                       (m, n, 100.0 * m / n))
        qa_pairs = qa_pairs.drop_duplicates(QUESTION)
    qa_pairs = pandas.merge(qa_pairs, frequency, on=QUESTION)
    logger.info("%d question/answer pairs" % len(qa_pairs))
    return qa_pairs
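A hypothetical illustration of the two-stage de-duplication above: identical question/answer rows collapse first, and a question that still appears with more than one distinct answer is then reduced to a single, arbitrary row:

import pandas

QUESTION, ANSWER = "Question", "Answer"

usage_log = pandas.DataFrame({
    QUESTION: ["q1", "q1", "q1"],
    ANSWER: ["a1", "a1", "a2"],
})

qa_pairs = usage_log.drop_duplicates(subset=[QUESTION, ANSWER])  # q1/a1 and q1/a2 remain
print(sum(qa_pairs.duplicated(QUESTION)))                        # 1 question has multiple answers
qa_pairs = qa_pairs.drop_duplicates(QUESTION)                    # keep one answer per question
print(qa_pairs)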
Example #23
def extract_question_answer_pairs_from_usage_logs(usage_log):
    """
    Extract questions and answers from usage logs, adding question frequency information.

    We are assuming here that a given question always elicits the same answer. Print a warning if this is not the case
    and drop answers to make the answers unique. It is arbitrary which answer is dropped.

    :param usage_log: QuestionsData.csv usage log
    :type usage_log: pandas.DataFrame
    :return: Q&A pairs with question frequency information
    :rtype: pandas.DataFrame
    """
    frequency = question_frequency(usage_log)
    qa_pairs = usage_log.drop_duplicates(subset=(QUESTION, ANSWER))
    m = sum(qa_pairs.duplicated(QUESTION))
    if m:
        n = len(frequency)
        logger.warning(
            "%d questions of %d have multiple answers (%0.3f%%), only keeping one answer per question"
            % (m, n, 100.0 * m / n))
        qa_pairs = qa_pairs.drop_duplicates(QUESTION)
    qa_pairs = pandas.merge(qa_pairs, frequency, on=QUESTION)
    logger.info("%d question/answer pairs" % len(qa_pairs))
    return qa_pairs