Пример #1
0
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
    """
    Test classification using different combinations of higher orders and weightings of these.

    The list *orders* define which higher order relations to include.
    The relative importance of the orders are defined by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print '    '+str(i)+"/"+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        s = reduce(lambda x,y:str(x)+str(y), orders)
        f.write(str(s)+' '+str(score)+'\n')
    return score
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric+' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
Пример #3
0
def classification_demo():
    """Function intended to illustrate classification in the experimental framework.

    Intended as a basis for new experiments for those not intimately
    familiar with the code.
    """
    print 'Evaluation type: Classification'
    print 'Graph type:      Co-occurrence w/2-word window context'
    print 'Centrality:      Weighted degree'
    print
    print '> Reading data..'
    corpus_path = '../data/tasa/TASA900_preprocessed'
    docs, labels = data.read_files(corpus_path)

    print '> Creating representations..'
    dicts = []
    for i, doc in enumerate(docs):
        print '   ',str(i)+'/'+str(len(docs))
        g = graph_representation.construct_cooccurrence_network(doc)
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        dicts.append(d)
    vectors = graph_representation.dicts_to_vectors(dicts)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(vectors, labels)
    print '    score:', score
    print
Пример #4
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(
            training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(
            training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Пример #5
0
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs,
                                                         metric,
                                                         verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric + ' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
Пример #6
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training':training_rep, 'test':test_rep}
        labels = {'training':training_labels, 'test':test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Пример #7
0
def edge_direction_evaluation(direction):
    """
    Evaluate impact of using different edge directions on dependency networks.

    Values for *direction*: ``forward``, ``backward``, and ``undirected``.
    """
    results = {'_edge-direction':direction}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, direction=direction)
        metric  = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, direction=direction)
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    results['retrieval'] = score

    data.pickle_to_file(results, 'output/dependencies/stop_words_retr_'+direction)

    pp.pprint(results)
    return results
Пример #8
0
def evaluate_worker(task_tuple):
    """Score one synthetic-data snapshot and return it with its identifiers.

    *task_tuple* is unpacked as ``(model_id, model_arg, epoch_id, epoch_t,
    working_dir, test_csv, continuous_cols)``; ``model_arg`` is not used.

    The file ``synthetic<model_id>_<epoch_t>.npz`` in *working_dir* is
    converted by ``npz_to_csv`` into a ``.csv`` with the same stem, which is
    then passed to ``evaluate_classification`` together with *test_csv* and
    *continuous_cols*.

    Returns ``(model_id, epoch_id, score)``. *score* is ``-1`` when the
    evaluation raised an exception.
    """
    model_id, model_arg, epoch_id, epoch_t, working_dir, test_csv, continuous_cols = task_tuple

    syn_path = os.path.join(working_dir, 'synthetic{}_{}.npz'.format(model_id, epoch_t))
    csv_path = os.path.join(working_dir, 'synthetic{}_{}.csv'.format(model_id, epoch_t))
    # Called for its effect of producing csv_path; the return value is unused.
    npz_to_csv(syn_path, csv_path)
    try:
        score = evaluate_classification(csv_path, test_csv, continuous_cols)
    except Exception:
        # Best effort: a failed evaluation is reported as -1. Only Exception
        # is caught so KeyboardInterrupt/SystemExit still stop the worker,
        # which the former bare ``except:`` prevented.
        score = -1
    return (model_id, epoch_id, score)
Пример #9
0
def centrality_weights_classification(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the classification task.
    """
    results = {'_is_weighted': weighted, '_evaluation': 'classification'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None  # just to make sure..
        if i % 100 == 0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(
                rep,
                'output/dependencies/exp1_class_tmp_' + str(i) + '_' + postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_class' + postfix)

    pp.pprint(results)
    return results
Пример #10
0
def do_classification_experiments(
        dataset='tasa/TASA900',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(
                vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(
                vectors, labels)

    print
    pp.pprint(results)
    return results
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/'+corpus+'_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res')
    return results
Пример #12
0
def do_classification_experiments(dataset='tasa/TASA900',
                                    graph_types = ['co-occurrence','dependency','random'],
                                    use_frequency = True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)

    print
    pp.pprint(results)
    return results
Пример #13
0
def centrality_weights_classification(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the classification task.
    """
    results = {'_is_weighted':weighted, '_evaluation':'classification'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None # just to make sure..
        if i%100==0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(rep, 'output/dependencies/exp1_class_tmp_'+str(i)+'_'+postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_class'+postfix)

    pp.pprint(results)
    return results
Пример #14
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    corpus_path = '../data/'+dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path+'_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors, labels, mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
Пример #15
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    corpus_path = '../data/' + dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path + '_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors,
                                               labels,
                                               mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/class_context_'+str(window_size))

    pp.pprint(results)
    return results
def test_best_classification():
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    rep = []
    print '> Creating representations..'
    for i, text in enumerate(texts):
        if i%100==0: print '   ',i
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   ', score
Пример #18
0
def stop_word_evaluation(rem_stop_words):
    """
    Experiment for determining what effect removing stop words have on
    dependency networks.
    """
    results = {'_removing stop-words': rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    print '(the networks had a total of', total_nodes, 'nodes)'
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(description_texts):
        if i % 100 == 0:
            print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    print '(the networks had a total of', total_nodes, 'nodes)'
    results['retrieval'] = score

    if rem_stop_words:
        postfix = '_without'
    else:
        postfix = '_with'
    data.pickle_to_file(results,
                        'output/dependencies/stop_words_retr' + postfix)

    pp.pprint(results)
    return results
Пример #19
0
def evaluate_dep_types():
    """
    Leave-one-out evaluation of the various dependency types from the stanford parser.
    """
    exclude_list = [
        'dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp', 'attr', 'ccomp',
        'xcomp', 'complm', 'dobj', 'iobj', 'pobj', 'mark', 'rel', 'nsubj',
        'nsubjpass', 'csubj', 'csubjpass', 'cc', 'conj', 'expl', 'abbrev',
        'amod', 'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
        'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod', 'quantmod',
        'tmod', 'nn', 'npadvmod', 'num', 'number', 'prep', 'poss',
        'possessive', 'prt', 'parataxis', 'punct', 'ref', 'xsubj', 'pcomp',
        'prepc'
    ]
    results = {'classification': [], 'retrieval': []}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None  # just to make sure..
        full_graph = None
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(
            rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_classification(rep[exclude_label], labels)
        print '  ', exclude_label, score
        results['classification'].append(score)

    data.pickle_to_file(results, 'output/dependencies/types_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None  # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(
            rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_retrieval(rep[exclude_label],
                                              solution_vectors)
        print '  ', exclude_label, score
        results['retrieval'].append(score)

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_eval')

    return results
Пример #20
0
def evaluate_dep_types():
    """
    Leave-one-out evaluation of the various dependency types from the stanford parser.
    """
    exclude_list = ['dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp',
                    'attr', 'ccomp', 'xcomp', 'complm', 'dobj', 'iobj',
                    'pobj', 'mark', 'rel', 'nsubj', 'nsubjpass', 'csubj',
                    'csubjpass', 'cc', 'conj', 'expl', 'abbrev', 'amod',
                    'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
                    'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod',
                    'quantmod', 'tmod', 'nn', 'npadvmod', 'num', 'number',
                    'prep', 'poss', 'possessive', 'prt', 'parataxis',
                    'punct', 'ref', 'xsubj', 'pcomp', 'prepc']
    results = {'classification':[], 'retrieval':[]}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric  = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None # just to make sure..
        full_graph = None
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_classification(rep[exclude_label], labels)
        print '  ', exclude_label, score
        results['classification'].append(score)

    data.pickle_to_file(results, 'output/dependencies/types_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_retrieval(rep[exclude_label], solution_vectors)
        print '  ', exclude_label, score
        results['retrieval'].append(score)

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_eval')

    return results
Пример #21
0
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        rep = []
        for i, doc in enumerate(docs):
            if i%100==0: print '    graph',str(i)+'/'+str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'}
    gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network,
                'dependency':graph_representation.construct_dependency_network}
    metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE,
                'dependency':graph.GraphMetrics.CLOSENESS}

    print '--', graph_type
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training'+postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test'+postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type is 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(dataset+'/training', 'sentence', metrics[graph_type])
        elif graph_type is 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(dataset+'/training', metrics[graph_type])

        if graph_type is 'co-occurrence':
            icc_test = co_occurrence_experiments.retrieve_centralities(dataset+'/test', 'sentence', metrics[graph_type])
        elif graph_type is 'dependency':
            icc_test = dependency_experiments.retrieve_centralities(dataset+'/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print '    dicts -> vectors'
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print '    vocabulary size:', len(keys)

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training':training_rep, 'test':test_rep}
    labels = {'training':training_labels, 'test':test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results
    s = 'classification comparison '
    if icc: s += 'USING TC-ICC'
    s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Пример #22
0
def evaluate_dep_type_sets():
    """
    Evaluation of various sets of dependency relations.

    Each set is excluded from the representation, and the performance recorded.
    The best strategy is to exclude those dependencies which removal lead to the
    greatest imporovement for the representation.
    """
    strategies = {
        'defensive': ['agent', 'advcl', 'parataxis'],
        'aggressive': [
            'agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp',
            'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl',
            'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss',
            'prepc'
        ],
        'compromise_1': [
            'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass',
            'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc'
        ],
        'compromise_2': [
            'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass',
            'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc',
            'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct',
            'ref'
        ]
    }
    results = {'classification': {}, 'retrieval': {}}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(
                text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None  # just to make sure. I don't trust this damn garbage collector...
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_classification(rep[strategy], labels)
        print '  ', strategy, score
        results['classification'][strategy] = score

    data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(
                text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None  # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors)
        print '  ', strategy, score
        results['retrieval'][strategy] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_set_eval')

    return results
Пример #23
0
def evaluate_dep_type_sets():
    """
    Evaluation of various sets of dependency relations.

    Each set is excluded from the representation, and the performance recorded.
    The best strategy is to exclude those dependencies which removal lead to the
    greatest imporovement for the representation.
    """
    strategies = {
            'defensive': ['agent', 'advcl', 'parataxis'],
            'aggressive': ['agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp', 'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss', 'prepc'],
            'compromise_1': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc'],
            'compromise_2': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc', 'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct', 'ref']
        }
    results = {'classification':{}, 'retrieval':{}}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric  = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None # just to make sure. I don't trust this damn garbage collector...
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_classification(rep[strategy], labels)
        print '  ', strategy, score
        results['classification'][strategy] = score

    data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors)
        print '  ', strategy, score
        results['retrieval'][strategy] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_set_eval')

    return results
Пример #24
0
def classification_comparison_graph(dataset='reuters',
                                    graph_type='co-occurrence',
                                    icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0: print '    graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network
    }
    metrics = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.CLOSENESS
    }

    print '--', graph_type
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training' + postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test' + postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type is 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(
                dataset + '/training', 'sentence', metrics[graph_type])
        elif graph_type is 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(
                dataset + '/training', metrics[graph_type])

        if graph_type is 'co-occurrence':
            icc_test = co_occurrence_experiments.retrieve_centralities(
                dataset + '/test', 'sentence', metrics[graph_type])
        elif graph_type is 'dependency':
            icc_test = dependency_experiments.retrieve_centralities(
                dataset + '/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print '    dicts -> vectors'
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print '    vocabulary size:', len(keys)

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training': training_rep, 'test': test_rep}
    labels = {'training': training_labels, 'test': test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results
    s = 'classification comparison '
    if icc: s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Пример #25
0
 def evaluation_results(self):
     """Return ``evaluate_classification`` applied to this object's dataset documents."""
     documents = self.dataset.documents
     return evaluate_classification(documents)
Пример #26
0
def main():
    """Train a two-task sequence classifier and export its outputs.

    Command-line driven.  For each task ("task_1": sentiments, "task_2":
    topics) the train/validation CSV files are read from ``--corpus_dir``,
    rows without a value in ``--text_column`` are dropped, and a shared
    multitask model is trained.  After training, evaluation metrics are
    printed for the validation and training collections, and predictions
    plus "cls" and "mean" embeddings are written as CSV files under
    ``<output_dir>/<task_name>/``.

    Exits with an argparse usage error when ``--output_dir`` is missing.
    """
    parser = ArgumentParser()
    parser.add_argument('--corpus_dir', required=True)
    parser.add_argument('--text_column')
    parser.add_argument('--model_name')
    parser.add_argument('--max_seq_length', type=int)
    parser.add_argument('--num_epochs', type=int)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--output_dir')
    args = parser.parse_args()
    # The output directory is only used after training has finished, so a
    # missing value would otherwise waste the whole training run.
    if args.output_dir is None:
        parser.error("--output_dir is required")
    torch.manual_seed(42)

    logging.basicConfig(level=logging.INFO)

    corpus_dir = args.corpus_dir
    text_column_name = args.text_column
    model_name = args.model_name
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    output_dir = args.output_dir

    # Task name -> corpus file suffix, in the order the tasks are processed.
    task_corpora = (
        ("task_1", "sentiments_cloudvision"),
        ("task_2", "topics_cloudvision"),
    )
    task_names = [task_name for task_name, _ in task_corpora]

    def read_split(split, corpus_name, tag):
        # Load "<split>_<corpus_name>.csv" and drop rows with no text,
        # printing the shape before and after the drop.
        frame = pd.read_csv(os.path.join(
            corpus_dir, "{}_{}.csv".format(split, corpus_name)),
                            encoding="utf-8")
        print(tag, frame.shape)
        frame.dropna(subset=[text_column_name], inplace=True)
        print(tag, frame.shape)
        return frame

    train_dfs = {}
    val_dfs = {}
    for task_number, (task_name, corpus_name) in enumerate(task_corpora,
                                                           start=1):
        train_dfs[task_name] = read_split("train", corpus_name,
                                          "Train_{}".format(task_number))
        val_dfs[task_name] = read_split("val", corpus_name,
                                        "Val_{}".format(task_number))

    dataset_dict = {}
    id_to_class_dicts = {}
    id_to_class = {}
    for task_name in task_names:
        task_dataset, task_id_to_class = load_dataset(train_dfs[task_name],
                                                      val_dfs[task_name],
                                                      text_column_name)
        dataset_dict[task_name] = task_dataset
        id_to_class_dicts[task_name] = task_id_to_class
        # Class labels as a list ordered by class id 0..n-1.
        id_to_class[task_name] = [
            task_id_to_class[i] for i in range(len(task_id_to_class))
        ]

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            task_name: transformers.AutoModelForSequenceClassification
            for task_name in task_names
        },
        model_config_dict={
            task_name: transformers.AutoConfig.from_pretrained(
                model_name, num_labels=len(id_to_class_dicts[task_name]))
            for task_name in task_names
        },
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    feature_fn = convert_features_function(tokenizer, max_seq_length)
    convert_func_dict = {task_name: feature_fn for task_name in task_names}
    columns_dict = {
        task_name: ['input_ids', 'attention_mask', 'labels']
        for task_name in task_names
    }
    features_dict = data_to_features(dataset_dict, convert_func_dict,
                                     columns_dict)

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    val_dataset_dict = {
        task_name: dataset["validation"]
        for task_name, dataset in features_dict.items()
    }

    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            # Trainer checkpoints go to this fixed directory, independent
            # of --output_dir.
            output_dir="./models/multitask_model",
            overwrite_output_dir=True,
            learning_rate=learning_rate,
            do_train=True,
            num_train_epochs=num_epochs,
            # Adjust batch size if this doesn't fit on the Colab GPU
            per_device_train_batch_size=batch_size,
            save_steps=3000,
        ),
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
        eval_dataset=val_dataset_dict)
    trainer.train()

    validation_results = evaluate_classification(trainer, features_dict,
                                                 dataset_dict)
    for task_name, results_dict in validation_results.items():
        for metric_name, value in results_dict.items():
            print(f"Validation quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")
    training_results = evaluate_classification(trainer,
                                               features_dict,
                                               dataset_dict,
                                               collection="train")
    for task_name, results_dict in training_results.items():
        for metric_name, value in results_dict.items():
            print(f"Training quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")

    validation_predictions = get_predictions(trainer,
                                             features_dict,
                                             id_to_class,
                                             collection="validation")
    train_predictions = get_predictions(trainer,
                                        features_dict,
                                        id_to_class,
                                        collection="train")

    train_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="train",
    )
    validation_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="validation",
    )

    # (merged file name, per-part file prefix, source frames, predictions,
    # embeddings) for each exported collection.
    exports = (
        ("train.csv", "tr", train_dfs, train_predictions, train_embeddings),
        ("val.csv", "val", val_dfs, validation_predictions,
         validation_embeddings),
    )
    for task_name in task_names:
        task_dir = os.path.join(output_dir, task_name)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(task_dir, exist_ok=True)
        for merged_name, prefix, source_dfs, predictions, embeddings in exports:
            prediction_df = predictions[task_name]
            cls_emb_df = embeddings[task_name]["cls"]
            mean_emb_df = embeddings[task_name]["mean"]
            # NOTE(review): concat(axis=1) aligns on the index, and the
            # source frame keeps its pre-dropna index.  If the prediction
            # and embedding frames use a fresh 0..n-1 index, rows will be
            # misaligned whenever rows were dropped - confirm against
            # get_predictions / get_embeddings.
            merged_df = pd.concat(
                [source_dfs[task_name], prediction_df, cls_emb_df,
                 mean_emb_df],
                axis=1,
            )
            outputs = (
                (prediction_df, prefix + "_prediction.csv"),
                (cls_emb_df, prefix + "_cls_emb.csv"),
                (mean_emb_df, prefix + "_mean_emb.csv"),
                (merged_df, merged_name),
            )
            for frame, file_name in outputs:
                frame.to_csv(os.path.join(task_dir, file_name),
                             encoding="utf-8",
                             index=False)
Пример #27
0
def edge_direction_evaluation(direction):
    """
    Evaluate impact of using different edge directions on dependency networks.

    Values for *direction*: ``forward``, ``backward``, and ``undirected``.
    """
    results = {'_edge-direction': direction}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, direction=direction)
        metric = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        if i % 100 == 0:
            print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, direction=direction)
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    results['retrieval'] = score

    data.pickle_to_file(results,
                        'output/dependencies/stop_words_retr_' + direction)

    pp.pprint(results)
    return results
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import numpy as np

from jsonio import read_infile_as_json, read_outfile_as_json
from features import TopicBOW, TopicNum
from evaluation import evaluate_classification


if __name__ == '__main__':
    # Train a logistic-regression pipeline on the answered questions and
    # print the evaluation of its predictions on the test questions.
    train_json, test_json = read_infile_as_json('./data/answered_data_10k.in')
    # Materialise the labels in a list: on Python 3 map() is lazy and
    # np.asarray would wrap the iterator itself in a 0-d object array.
    train_y = np.asarray([x['__ans__'] for x in train_json])

    test_labels_json = read_outfile_as_json('./data/answered_data_10k.out')
    test_labels_dict = {i['question_key']: i['__ans__'] for i in test_labels_json}

    # Output should be ordered according to test_json
    test_y = np.asarray([test_labels_dict[x['question_key']] for x in test_json])

    feature_extractors = FeatureUnion([("TopicBOW", TopicBOW()), ("TopicNum", TopicNum())])
    # The solver is named explicitly because L1 regularisation is not
    # supported by every LogisticRegression solver; liblinear supports it.
    m = Pipeline(steps=[("features", feature_extractors),
                        ('LR', LogisticRegression(penalty="l1", solver="liblinear"))])

    m.fit(train_json, train_y)
    pred = m.predict(test_json)

    print(evaluate_classification(pred, test_y))
Пример #29
0
def stop_word_evaluation(rem_stop_words):
    """
    Experiment for determining what effect removing stop words have on
    dependency networks.
    """
    results = {'_removing stop-words':rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric  = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    print '(the networks had a total of',total_nodes,'nodes)'
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(description_texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    print '(the networks had a total of',total_nodes,'nodes)'
    results['retrieval'] = score

    if rem_stop_words:
        postfix = '_without'
    else:
        postfix = '_with'
    data.pickle_to_file(results, 'output/dependencies/stop_words_retr'+postfix)

    pp.pprint(results)
    return results