def classification_comparison_freq(dataset='reuters'):
    """Compare classification performance of the frequency-based representations.

    Each metric from freq_representation.get_metrics() is evaluated on the
    dataset's preprocessed training/test split. Results are printed, written
    to file, and returned as a {metric: score} dict.
    """
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print ' ', metric,
        # Build term-weight dicts for every document under the current metric.
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '  dicts -> vectors'
        # The shared vocabulary is the union of all terms in both splits.
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '  vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score

    pp.pprint(results)
    s = 'classification comparison\nrepresentation: frequency\nresult:\n' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
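# Example usage (a minimal sketch; assumes the chosen dataset has already been
# preprocessed under ../data/<dataset>/training_preprocessed and
# ../data/<dataset>/test_preprocessed, as the function expects):
#
#   results = classification_comparison_freq(dataset='reuters')
#   for metric, score in results.iteritems():
#       print metric, score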
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    """Print the top `num` terms of a single document under each centrality-based
    representation (TC and TC-ICC over co-occurrence and dependency networks)
    and the frequency baselines (TF-IDF, TF), one LaTeX table row per
    representation.
    """
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _print_terms(cents, rep, num):
        # Print the representation's top terms as a LaTeX table row.
        terms = [t[0] for t in _top_cents(cents, num)]
        print rep + ' & ' + ', '.join(terms) + ' \\\\'

    def _top_cents(cents, num):
        # Terms sorted by descending centrality, truncated to the top num.
        return sorted(cents.iteritems(), key=operator.itemgetter(1), reverse=True)[0:num]

    def _calc_cents(g, metric, gcents=None):
        # Document-level centralities, optionally ICC-weighted using
        # corpus-level ("global") centralities.
        if gcents:
            icc = graph_representation.calculate_icc_dict(gcents)
        else:
            icc = None
        return graph_representation.graph_to_dict(g, metric, icc)

    dataset = 'air/reports'
    path = '../data/' + doc
    doc = data.read_file(path)

    # Co-occurrence network: degree centrality, plain TC and ICC-weighted TC-ICC.
    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc, context=context)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    # Dependency network: eigenvector centrality, plain and ICC-weighted.
    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    # Frequency baselines for comparison.
    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)
    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
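# Example usage (a sketch; assumes the AIR report file exists under ../data/
# and that the cached corpus-level centralities used for the ICC weighting
# have been computed by the co_occurrence_experiments and
# dependency_experiments modules):
#
#   term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=10)
#
# Each representation is printed as a LaTeX table row, e.g.
# 'Co-occurrence TC & term1, term2, ... \\'.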