def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    """Print the `num` most central terms of a single document under
    several representations (network centrality with and without ICC,
    plus TF and TF-IDF baselines), one LaTeX table row per scheme."""
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _print_terms(cents, rep, num):
        # Format a representation's top terms as a LaTeX table row.
        ts = _top_cents(cents, num)
        terms = [t[0] for t in ts]
        print rep + ' & ' + ', '.join(terms) + ' \\\\'

    def _top_cents(cents, num):
        # The `num` highest-scoring (term, centrality) pairs.
        return sorted(cents.iteritems(),
                      key=operator.itemgetter(1), reverse=True)[:num]

    def _calc_cents(g, metric, gcents=None):
        # Per-term centralities for graph `g`; when corpus centralities
        # `gcents` are given, weight by inverse corpus centrality (ICC).
        icc = None
        if gcents:
            icc = graph_representation.calculate_icc_dict(gcents)
        return graph_representation.graph_to_dict(g, metric, icc)

    dataset = 'air/reports'
    path = '../data/' + doc
    doc = data.read_file(path)  # replace the path with the document text

    # Co-occurrence network: plain term centrality (TC), then TC-ICC.
    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc, context=context)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    # Dependency network: TC, then TC-ICC.
    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    # Frequency-based baselines.
    fdict = freq_representation.text_to_dict(
        [doc], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)
    fdict = freq_representation.text_to_dict(
        [doc], freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
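# A minimal usage sketch, assuming the 'air/reports' dataset and its
# precomputed corpus centralities are available under ../data/ as in the
# defaults above:
#
#     term_centrality_study()       # default report, top 20 terms
#     term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=10)
#
# Each call prints one LaTeX table row per representation, e.g.
# "Co-occurrence TC & term_1, ..., term_n \\".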
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the
    vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        # Map each document to a {term: centrality} dict using the graph
        # constructor and metric selected for this `graph_type`.
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0:
                print '   graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text',
               'dependency': '_dependencies'}
    gfuns = {'co-occurrence': graph_representation.construct_cooccurrence_network,
             'dependency': graph_representation.construct_dependency_network}
    metrics = {'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
               'dependency': graph.GraphMetrics.CLOSENESS}

    print '--', graph_type
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training' + postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test' + postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type == 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(
                dataset + '/training', 'sentence', metrics[graph_type])
            icc_test = co_occurrence_experiments.retrieve_centralities(
                dataset + '/test', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(
                dataset + '/training', metrics[graph_type])
            icc_test = dependency_experiments.retrieve_centralities(
                dataset + '/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print '   dicts -> vectors'
    # Build a shared vocabulary so training and test vectors align.
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print '   vocabulary size:', len(keys)
    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training': training_rep, 'test': test_rep}
    labels = {'training': training_labels, 'test': test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results

    s = 'classification comparison '
    if icc:
        s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
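# Usage sketch, assuming the corpus has been preprocessed into
# '../data/<dataset>/training_text', '../data/<dataset>/test_dependencies',
# and so on, and that corpus centralities have already been computed by the
# *_experiments modules for the TC-ICC variants:
#
#     classification_comparison_graph('reuters', 'co-occurrence')
#     classification_comparison_graph('reuters', 'dependency', icc=True)
#
# Each run prints its results and appends a summary to
# 'output/comparison/classification'.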