import os
import time

import numpy as np
import pandas as pd

# NOTE: project-local helpers (Document, InvertedIndex, get_ontology,
# get_documents, get_keywords, get_answers_for_doc,
# generate_keyword_candidates, build_feature_matrix,
# add_gt_answers_to_candidates_set, Graph, SAMPLE_LENGTH, EMBEDDING_SIZE)
# are assumed to be imported from the surrounding package.


def calculate_recall_for_kw_candidates(data_dir, recreate_ontology=False, verbose=False):
    """
    Generate keyword candidates for the files in a given directory and compute
    their recall with respect to the ground-truth answers.
    :param data_dir: directory with .txt and .key files
    :param recreate_ontology: boolean flag for recreating the ontology
    :param verbose: whether to print per-document statistics and timings
    :return: average recall over all documents (float)
    """
    average_recall = 0
    total_kw_number = 0

    ontology = get_ontology(recreate=recreate_ontology)
    docs = get_documents(data_dir)
    considered_keywords = set(get_keywords())
    total_docs = 0

    start_time = time.time()
    for doc in docs:
        kw_candidates = {kw.get_canonical_form()
                         for kw in generate_keyword_candidates(doc, ontology)}

        answers = get_answers_for_doc(doc.filename,
                                      data_dir,
                                      filtered_by=considered_keywords)

        # Recall: the fraction of ground-truth answers covered by the
        # candidate set. A document without answers counts as full recall.
        recall = 1 if not answers else len(kw_candidates & answers) / len(answers)

        if verbose:
            print()
            print("Paper: " + doc.filename)
            print("Candidates: " + str(len(kw_candidates)))
            print("Recall: " + str(recall * 100) + "%")

        average_recall += recall
        total_kw_number += len(kw_candidates)
        total_docs += 1

    average_recall /= total_docs

    if verbose:
        print()
        print("Total # of keywords: " + str(total_kw_number))
        print("Time elapsed: " + str(time.time() - start_time))

    return average_recall
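
# A hedged usage sketch for the recall check above; the directory path is a
# hypothetical placeholder for any folder of matching .txt/.key pairs.
def _example_candidate_recall():
    recall = calculate_recall_for_kw_candidates('data/train', verbose=True)
    print("Average candidate recall: {:.2%}".format(recall))
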
def build_train_matrices(docs, model, file_dir, ontology):
    """
    Build the X feature matrix and the y output vector from the input data.
    :param docs: documents to process. Either a list or a generator of Document objects
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object
    :return: X (pandas DataFrame) and y (numpy array)
    """
    considered_keywords = set(get_keywords())
    feature_matrices = []
    output_vectors = []

    for doc in docs:
        inv_index = InvertedIndex(doc)

        # Generate keyword candidates
        kw_candidates = list(generate_keyword_candidates(doc, ontology))

        # Get ground-truth answers
        doc_answers = get_answers_for_doc(
            doc.filename,
            file_dir,
            filtered_by=considered_keywords,
        )

        # If an answer was not generated as a candidate, add it anyway
        add_gt_answers_to_candidates_set(kw_candidates, doc_answers, ontology)

        # Create the output vector: column 0 holds the binary label,
        # column 1 the id of the document the candidate came from
        output_vector = np.zeros((len(kw_candidates), 2), dtype=np.int16)
        for i, kw in enumerate(kw_candidates):
            if kw.get_canonical_form() in doc_answers:
                output_vector[i][0] = True
            output_vector[i][1] = doc.doc_id

        output_vectors.append(output_vector)

        X = build_feature_matrix(kw_candidates, inv_index, model)
        feature_matrices.append(X)

    # Merge the per-document pandas feature matrices
    X = pd.concat(feature_matrices)

    # Concatenate the output vectors into a single numpy array
    y = np.concatenate(output_vectors)

    return X, y
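
# Hedged usage sketch: how the training matrices could feed a scikit-learn
# classifier. `train_docs`, `learning_model` and `train_dir` are hypothetical
# placeholders built elsewhere, not names defined in this module.
def _example_fit_classifier(train_docs, learning_model, train_dir, ontology):
    from sklearn.linear_model import LogisticRegression

    X, y = build_train_matrices(train_docs, learning_model, train_dir, ontology)
    clf = LogisticRegression()
    # Column 0 of y is the binary label; column 1 only carries doc ids.
    clf.fit(X.values, y[:, 0])
    return clf
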
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build the (X, y) data matrices.
    :param filenames: iterable of strings with file ids (no extension)
    :param file_directory: path to the directory where those files live
    :param kwargs: additional data needed for matrix building, e.g. the scaler
    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        # Embed and scale the first SAMPLE_LENGTH words of the document
        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        # Set the output bits for the document's ground-truth labels
        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )

        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    # Models with multiple inputs expect the input repeated for each branch
    if nn_model and isinstance(nn_model.input, list):
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    # The legacy Keras Graph model expects named inputs and outputs
    if isinstance(nn_model, Graph):
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
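
# Hedged usage sketch for build_x_and_y, assuming the legacy Keras API this
# module already targets (hence nb_epoch). `train_files`, `train_dir` and the
# preprocessing objects are hypothetical placeholders assembled elsewhere.
def _example_fit_nn(train_files, train_dir, label_indices,
                    word2vec_model, scaler, nn_model):
    # For a non-Graph model the helper returns ([inputs], y_matrix).
    X, y = build_x_and_y(
        train_files,
        train_dir,
        label_indices=label_indices,
        word2vec_model=word2vec_model,
        scaler=scaler,
        nn_model=nn_model,
    )
    nn_model.fit(X, y, batch_size=64, nb_epoch=1)
    return nn_model
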
def build_test_matrices(docs, model, file_dir, ontology):
    """
    Build the X feature matrix plus the answers and kw_vector variables
    needed for evaluating the predictions.
    :param docs: documents to process. Either a list or a generator of Document objects
    :param model: LearningModel object
    :param file_dir: directory where the answer files are located
    :param ontology: Ontology object
    :return: X (pandas DataFrame), answers dictionary and kw_vector tuple list
    """
    considered_keywords = set(get_keywords())
    feature_matrices = []
    kw_vector = []
    answers = dict()

    for doc in docs:
        inv_index = InvertedIndex(doc)

        # Generate keyword candidates
        kw_candidates = list(generate_keyword_candidates(doc, ontology))

        X = build_feature_matrix(kw_candidates, inv_index, model)
        feature_matrices.append(X)

        # Get ground-truth answers
        answers[doc.doc_id] = get_answers_for_doc(
            doc.filename,
            file_dir,
            filtered_by=considered_keywords,
        )

        kw_vector.extend([(doc.doc_id, kw.get_canonical_form())
                          for kw in kw_candidates])

    # Merge feature matrices from different documents
    X = pd.concat(feature_matrices)

    return X, answers, kw_vector
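
# Hedged evaluation sketch: scores candidates with a trained classifier and
# checks each against the ground-truth answers. `clf` and `test_docs` are
# hypothetical placeholders produced by the training sketch above.
def _example_evaluate(test_docs, learning_model, test_dir, ontology, clf):
    X, answers, kw_vector = build_test_matrices(test_docs, learning_model,
                                                test_dir, ontology)
    # Probability of the positive class for every (doc_id, keyword) pair
    scores = clf.predict_proba(X.values)[:, 1]
    for (doc_id, kw), score in zip(kw_vector, scores):
        hit = kw in answers[doc_id]
        print("{}\t{:.3f}\t{}".format(kw, score, "HIT" if hit else ""))
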