예제 #1
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training':training_rep, 'test':test_rep}
        labels = {'training':training_labels, 'test':test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
예제 #2
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(
            training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(
            training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
예제 #3
0
def do_retrieval_experiments(
        descriptions='air/problem_descriptions',
        solutions='air/solutions',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {
        '_solutions': solutions,
        '_descriptions': descriptions,
        '_evaluation': 'retrieval'
    }

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
예제 #4
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    corpus_path = '../data/'+dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path+'_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors, labels, mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
예제 #5
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    corpus_path = '../data/' + dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path + '_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors,
                                               labels,
                                               mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
예제 #6
0
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                                solutions='air/solutions',
                                graph_types=['co-occurrence','dependency','random'],
                                use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions':solutions,
                '_descriptions':descriptions,
                '_evaluation':'retrieval'}

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/'+descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/'+solutions+'_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
예제 #7
0
def do_classification_experiments(
        dataset='tasa/TASA900',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(
                vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(
                vectors, labels)

    print
    pp.pprint(results)
    return results
예제 #8
0
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/'+dataset+'/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:'+dataset+' \nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
예제 #9
0
def do_classification_experiments(dataset='tasa/TASA900',
                                    graph_types = ['co-occurrence','dependency','random'],
                                    use_frequency = True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)

    print
    pp.pprint(results)
    return results
예제 #10
0
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:' + dataset + ' \nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results