示例#1
0
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    """
    Print the highest-scoring terms of one document under several
    representations.

    The file at '../data/' + `doc` is read, and for each representation one
    line of the form ``<label> & term1, term2, ... \\\\`` is printed with the
    `num` top terms (the layout looks like a LaTeX table row).

    Representations, in print order:
      * co-occurrence network ('window' context, DEGREE metric), first
        without and then with an ICC dict built from the 'air/reports'
        corpus centralities;
      * dependency network (EIGENVECTOR metric), without and with ICC;
      * TF-IDF and TF frequency dicts.
    """
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _ranked(scores, limit):
        # (term, score) pairs ordered by descending score, cut to `limit`.
        ordered = sorted(scores.iteritems(),
                         key=operator.itemgetter(1),
                         reverse=True)
        return ordered[0:limit]

    def _emit_row(scores, label, limit):
        # Only the term of each (term, score) pair goes into the row.
        names = [pair[0] for pair in _ranked(scores, limit)]
        print(label + ' & ' + ', '.join(names) + ' \\\\')

    def _centralities(network, measure, corpus_cents=None):
        # ICC weights are derived only when corpus centralities are supplied
        # (and non-empty); otherwise graph_to_dict receives None.
        weights = (graph_representation.calculate_icc_dict(corpus_cents)
                   if corpus_cents else None)
        return graph_representation.graph_to_dict(network, measure, weights)

    corpus = 'air/reports'
    text = data.read_file('../data/' + doc)

    # --- co-occurrence network -------------------------------------------
    measure = graph.GraphMetrics.DEGREE
    window = 'window'
    network = graph_representation.construct_cooccurrence_network(
        text, context=window)
    _emit_row(_centralities(network, measure), 'Co-occurrence TC', num)
    corpus_cents = co_occurrence_experiments.retrieve_centralities(
        corpus, window, measure)
    _emit_row(_centralities(network, measure, corpus_cents),
              'Co-occurrence TC-ICC', num)

    # --- dependency network ----------------------------------------------
    measure = graph.GraphMetrics.EIGENVECTOR
    dependencies = data._text_to_dependencies(text)
    network = graph_representation.construct_dependency_network(dependencies)
    _emit_row(_centralities(network, measure), 'Dependency TC', num)
    corpus_cents = dependency_experiments.retrieve_centralities(corpus,
                                                                measure)
    _emit_row(_centralities(network, measure, corpus_cents),
              'Dependency TC-ICC', num)

    # --- frequency baselines ---------------------------------------------
    # text_to_dict takes a list of documents; [0] picks the single result.
    tfidf_scores = freq_representation.text_to_dict(
        [text], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _emit_row(tfidf_scores, 'TF-IDF', num)

    tf_scores = freq_representation.text_to_dict(
        [text], freq_representation.FrequencyMetrics.TF)[0]
    _emit_row(tf_scores, 'TF', num)
示例#2
0
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    """
    Compare the top `num` terms of a single document across representations.

    Loads '../data/' + `doc` and prints, per representation, a line shaped
    like ``<label> & term1, term2, ... \\\\``. Covered representations are
    co-occurrence (window context, DEGREE) and dependency (EIGENVECTOR)
    networks, each with and without ICC weighting from the 'air/reports'
    corpus centralities, followed by TF-IDF and TF.
    """
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _best(values, count):
        # Descending by score (second element of each item pair).
        by_score = operator.itemgetter(1)
        return sorted(values.iteritems(), key=by_score, reverse=True)[0:count]

    def _show(values, caption, count):
        top_pairs = _best(values, count)
        joined = ', '.join(term for term, _ in top_pairs)
        print(caption + ' & ' + joined + ' \\\\')

    def _term_scores(net, kind, global_cents=None):
        # Default to no ICC; build it only from truthy corpus centralities.
        icc_weights = None
        if global_cents:
            icc_weights = graph_representation.calculate_icc_dict(global_cents)
        return graph_representation.graph_to_dict(net, kind, icc_weights)

    corpus_name = 'air/reports'
    source_path = '../data/' + doc
    contents = data.read_file(source_path)

    # Co-occurrence network: plain centrality, then ICC-weighted.
    kind = graph.GraphMetrics.DEGREE
    ctx = 'window'
    net = graph_representation.construct_cooccurrence_network(contents,
                                                              context=ctx)
    plain = _term_scores(net, kind)
    _show(plain, 'Co-occurrence TC', num)
    global_cents = co_occurrence_experiments.retrieve_centralities(
        corpus_name, ctx, kind)
    weighted = _term_scores(net, kind, global_cents)
    _show(weighted, 'Co-occurrence TC-ICC', num)

    # Dependency network: plain centrality, then ICC-weighted.
    kind = graph.GraphMetrics.EIGENVECTOR
    parsed = data._text_to_dependencies(contents)
    net = graph_representation.construct_dependency_network(parsed)
    plain = _term_scores(net, kind)
    _show(plain, 'Dependency TC', num)
    global_cents = dependency_experiments.retrieve_centralities(corpus_name,
                                                                kind)
    weighted = _term_scores(net, kind, global_cents)
    _show(weighted, 'Dependency TC-ICC', num)

    # Frequency representations; the single document is wrapped in a list
    # and its dict is taken back out with [0].
    freqs = freq_representation.text_to_dict(
        [contents], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _show(freqs, 'TF-IDF', num)

    freqs = freq_representation.text_to_dict(
        [contents], freq_representation.FrequencyMetrics.TF)[0]
    _show(freqs, 'TF', num)
示例#3
0
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector
    representations. Only its truth value is inspected.

    Documents are read from '../data/<dataset>/training<postfix>' and
    '../data/<dataset>/test<postfix>', where the postfix is '_text' for
    co-occurrence and '_dependencies' for dependency networks. A summary is
    appended via data.write_to_file to 'output/comparison/classification'.

    Returns whatever evaluation.evaluate_classification returns.
    Raises KeyError when `graph_type` is not one of the two supported values.
    """
    import co_occurrence_experiments
    import dependency_experiments

    postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'}
    gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network,
                'dependency':graph_representation.construct_dependency_network}
    metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE,
                'dependency':graph.GraphMetrics.CLOSENESS}

    def make_dicts(docs, icc):
        # One graph per document, converted to a dict by graph_to_dict.
        rep = []
        for i, doc in enumerate(docs):
            # Progress line every 100 documents.
            if i%100==0: print('    graph ' + str(i)+'/'+str(len(docs)))
            g = gfuns[graph_type](doc)
            rep.append(graph_representation.graph_to_dict(g, metrics[graph_type], icc))
        return rep

    def retrieve_icc(split):
        # Compare by value: `is` against a string literal only succeeds when
        # both happen to be the same object, which silently disabled ICC for
        # equal-but-distinct strings passed in by callers.
        # NOTE(review): term_centrality_study passes retrieve_centralities
        # output through calculate_icc_dict before graph_to_dict; here it is
        # used directly -- confirm this is intended.
        if graph_type == 'co-occurrence':
            return co_occurrence_experiments.retrieve_centralities(dataset+'/'+split, 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            return dependency_experiments.retrieve_centralities(dataset+'/'+split, metrics[graph_type])
        return None

    print('-- %s' % (graph_type,))
    print('> Reading data.. %s' % (dataset,))
    training_path = '../data/'+dataset+'/training'+postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test'+postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print('> Calculating ICC..')
        icc_training = retrieve_icc('training')
        icc_test = retrieve_icc('test')

    print('> Creating representations..')
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print('    dicts -> vectors')
    # Shared vocabulary over both splits so the vectors line up; updating in
    # place avoids copying the growing set once per document.
    keys = set()
    for d in training_dicts + test_dicts:
        keys.update(d.keys())
    keys = list(keys)
    print('    vocabulary size: %s' % (len(keys),))

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print('> Evaluating..')
    reps = {'training':training_rep, 'test':test_rep}
    labels = {'training':training_labels, 'test':test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print(results)
    s = 'classification comparison '
    if icc: s += 'USING TC-ICC'
    s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
示例#4
0
def classification_comparison_graph(dataset='reuters',
                                    graph_type='co-occurrence',
                                    icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector
    representations. Only its truth value is inspected.

    Training and test documents are read from
    '../data/<dataset>/training<postfix>' and '../data/<dataset>/test<postfix>'
    ('_text' for co-occurrence, '_dependencies' for dependency networks). A
    textual summary is passed to data.write_to_file with the target
    'output/comparison/classification'.

    Returns the value produced by evaluation.evaluate_classification.
    Raises KeyError when `graph_type` is not one of the two supported values.
    """
    import co_occurrence_experiments
    import dependency_experiments

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network
    }
    metrics = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.CLOSENESS
    }

    def make_dicts(docs, icc):
        # Build one graph per document and convert it with graph_to_dict.
        rep = []
        for i, doc in enumerate(docs):
            # Progress line every 100 documents.
            if i % 100 == 0:
                print('    graph ' + str(i) + '/' + str(len(docs)))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    def retrieve_icc(split):
        # String comparison must use ==; the previous `is` checks tested
        # object identity and could skip both branches for an equal string,
        # leaving ICC as None without any warning.
        # NOTE(review): term_centrality_study feeds retrieve_centralities
        # output to calculate_icc_dict before graph_to_dict, whereas this
        # function passes it straight through -- confirm this is intended.
        source = dataset + '/' + split
        if graph_type == 'co-occurrence':
            return co_occurrence_experiments.retrieve_centralities(
                source, 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            return dependency_experiments.retrieve_centralities(
                source, metrics[graph_type])
        return None

    print('-- %s' % (graph_type,))
    print('> Reading data.. %s' % (dataset,))
    training_path = '../data/' + dataset + '/training' + postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test' + postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print('> Calculating ICC..')
        icc_training = retrieve_icc('training')
        icc_test = retrieve_icc('test')

    print('> Creating representations..')
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print('    dicts -> vectors')
    # One vocabulary across both splits keeps vector dimensions aligned;
    # in-place update avoids re-copying the set for every document.
    keys = set()
    for d in training_dicts + test_dicts:
        keys.update(d.keys())
    keys = list(keys)
    print('    vocabulary size: %s' % (len(keys),))

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print('> Evaluating..')
    reps = {'training': training_rep, 'test': test_rep}
    labels = {'training': training_labels, 'test': test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print(results)
    s = 'classification comparison '
    if icc:
        s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results