# Experiments with co-occurrence networks as text representations.
#
# data, evaluation, freq_representation, graph and graph_representation
# are project-local modules assumed to be importable from this directory.
import pprint as pp

import networkx as nx
import numpy

import data
import evaluation
import freq_representation
import graph
import graph_representation


def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating the performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '  ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric + ' (sentence)'] = score
    data.pickle_to_file(results, 'output/class_context_sentence')
    pp.pprint(results)
    return results
def classification_demo():
    """
    Demonstrate classification in the experimental framework.

    Intended as a starting point for new experiments for those not
    intimately familiar with the code.
    """
    print 'Evaluation type: Classification'
    print 'Graph type: Co-occurrence w/2-word window context'
    print 'Centrality: Weighted degree'
    print
    print '> Reading data..'
    corpus_path = '../data/tasa/TASA900_preprocessed'
    docs, labels = data.read_files(corpus_path)
    print '> Creating representations..'
    dicts = []
    for i, doc in enumerate(docs):
        print '  ', str(i) + '/' + str(len(docs))
        g = graph_representation.construct_cooccurrence_network(doc)
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        dicts.append(d)
    vectors = graph_representation.dicts_to_vectors(dicts)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(vectors, labels)
    print '  score:', score
    print
def test_retrieval(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test retrieval using different combinations of higher orders and
    weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is defined by *order_weights*.
    """
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)
    filenames = data.get_file_names(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts,
        freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        print '  ' + str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(text,
            orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/air/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/retr'
    with open(fname, 'a+') as f:
        s = ''.join(str(o) for o in orders)
        f.write(s + ' ' + str(score) + '\n')
    return score
def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)
    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path = '../data/air/solutions_preprocessed'
    path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)
    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '  ', metric
        rep[metric] = []
        # retrieve_centralities is assumed to be defined elsewhere in this
        # module; it returns corpus-level centralities, or None if missing.
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None
    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts,
        freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        print '  document', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(text,
            already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]:
                continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # drop reference so the graph can be garbage collected
    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]:
            continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])
    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '  ', metric, score
        results[metric] = score
    pp.pprint(results)
    data.pickle_to_file(results,
                        'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
def test_classification(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test classification using different combinations of higher orders and
    weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is defined by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print '  ' + str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text,
            context='sentence', orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/tasa/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        s = ''.join(str(o) for o in orders)
        f.write(s + ' ' + str(score) + '\n')
    return score
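# A minimal sketch of how the two higher-order experiments above might be
# swept over several order combinations; the particular combinations are an
# illustrative assumption. order_weights is left at its default, so how
# weights for excluded orders are treated is up to
# construct_cooccurrence_network.
def sweep_higher_orders(combinations=[[1], [1, 2], [1, 2, 3]]):
    scores = {}
    for orders in combinations:
        key = ''.join(str(o) for o in orders)
        scores[key] = {'retrieval': test_retrieval(orders=orders),
                       'classification': test_classification(orders=orders)}
    pp.pprint(scores)
    return scores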
def print_degree_distributions(dataset, context):
    """
    Extract degree distribution values from networks and print them to a
    CSV file.

    **warning** Overwrites the file if it exists.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)
    degsfile = open('output/properties/cooccurrence/degrees_docs_'
                    + dataset.replace('/', '.'), 'w')
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0:
            print '  ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        giant.add_edges_from(g.edges())
        degs = nx.degree(g).values()  # networkx < 2.0: nx.degree returns a dict
        degs = [str(d) for d in degs]
        degsfile.write(','.join(degs) + '\n')
    degsfile.close()
    print '> Writing giant\'s distribution'
    with open('output/properties/cooccurrence/degrees_giant_'
              + dataset.replace('/', '.'), 'w') as f:
        ds = nx.degree(giant).values()
        ds = [str(d) for d in ds]
        f.write(','.join(ds))
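# A minimal sketch of how the degree files written above could be plotted as
# a log-log degree distribution, assuming matplotlib is available. The file
# layout (one comma-separated line of degrees per document) matches what
# print_degree_distributions writes; everything else here is illustrative.
def plot_degree_distribution(degrees_file):
    from collections import Counter
    import matplotlib.pyplot as plt
    degrees = []
    with open(degrees_file) as f:
        for line in f:
            degrees += [int(d) for d in line.strip().split(',') if d]
    counts = Counter(degrees)
    ks = sorted(k for k in counts if k > 0)  # log scale: skip degree 0
    plt.loglog(ks, [counts[k] for k in ks], 'o')
    plt.xlabel('degree')
    plt.ylabel('frequency')
    plt.show()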
def test_scale_free():
    """
    Fit a power law to the degree sequence of a single document network,
    then fit a uniformly random sequence of the same length as a baseline.
    """
    import random
    import plfit
    corpus_path = '../data/air/problem_descriptions_text'
    (documents, labels) = data.read_files(corpus_path)
    g = graph_representation.construct_cooccurrence_network(documents[0],
                                                            context='sentence')
    degree_sequence = sorted(nx.degree(g).values(), reverse=True)
    degree_sequence = numpy.array(degree_sequence)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
    # Baseline: a random sequence of the same length should not fit well.
    seq = [random.randrange(0, 100) for i in range(len(degree_sequence))]
    degree_sequence = numpy.array(seq)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.

    These can later be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)
    props = {}
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0:
            print '  ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            props.setdefault(k, []).append(v)
        g = None  # drop reference so the graph can be garbage collected
    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '  ', key
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])
    data_name = dataset.replace('/', '.')
    data.pickle_to_file(props, 'output/properties/cooccurrence/stats_' + data_name)
    data.pickle_to_file(props_total,
                        'output/properties/cooccurrence/stats_tot_' + data_name)
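# A minimal sketch for inspecting the pickled statistics afterwards,
# assuming data.pickle_from_file is the read counterpart of
# data.pickle_to_file (it is used that way in store_corpus_network below).
def print_corpus_properties(dataset):
    data_name = dataset.replace('/', '.')
    totals = data.pickle_from_file(
        'output/properties/cooccurrence/stats_tot_' + data_name)
    for key in sorted(totals.keys()):
        print '  ', key, '=', totals[key]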
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)
    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    context = 'sentence'
    path = '../data/' + corpus + '_text'
    texts, labels = data.read_files(path)
    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '  ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None
    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0:
            print '  ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        for metric in graph_metrics:
            if not icc[metric]:
                continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # drop reference so the graph can be garbage collected
    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]:
            continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])
    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '  ', metric, score
        results[metric] = score
    pp.pprint(results)
    data.pickle_to_file(results,
                        'output/tc_icc/cooccurrence/' + corpus + '/classification.res')
    return results
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    """
    Print LaTeX table rows with the *num* most central terms of *doc*
    under different representations and centrality measures.
    """
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _top_cents(cents, num):
        return sorted(cents.iteritems(),
                      key=operator.itemgetter(1), reverse=True)[0:num]

    def _print_terms(cents, rep, num):
        terms = [t[0] for t in _top_cents(cents, num)]
        print rep + ' & ' + ', '.join(terms) + ' \\\\'

    def _calc_cents(g, metric, gcents=None):
        if gcents:
            icc = graph_representation.calculate_icc_dict(gcents)
        else:
            icc = None
        return graph_representation.graph_to_dict(g, metric, icc)

    dataset = 'air/reports'
    path = '../data/' + doc
    doc = data.read_file(path)

    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc, context=context)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    fdict = freq_representation.text_to_dict([doc],
        freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)
    fdict = freq_representation.text_to_dict([doc],
        freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
def store_corpus_network(corpus, context):
    """
    Construct the giant co-occurrence network for *corpus* and pickle it
    to file, unless it has already been stored.
    """
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_text'
    store_path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print ' already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(gdoc,
        context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
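# Example invocations covering the corpus/context pairs used by the TC-ICC
# experiments in this module; the assumption is that the stored giants are
# what retrieve_centralities later draws its corpus-level centralities from.
def store_all_corpus_networks():
    store_corpus_network('tasa/TASA900', 'sentence')
    store_corpus_network('air/problem_descriptions', 'window')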
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating the performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts,
        freq_representation.FrequencyMetrics.TF_IDF)
    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size
        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'
        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i % 10 == 0:
                print i
            g = graph_representation.construct_cooccurrence_network(text,
                window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # drop reference so the graph can be garbage collected
        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])
        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '  ', metric, score
            results[metric].append(score)
        # results accumulated so far are pickled after each window size
        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))
    pp.pprint(results)
    return results
def do_context_size_evaluation_classification():
    """
    Experiment evaluating the performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []
    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)
    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size
        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'
        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(text,
                window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # drop reference so the graph can be garbage collected
        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])
        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '  ', metric, score
            results[metric].append(score)
        # results accumulated so far are pickled after each window size
        data.pickle_to_file(results, 'output/class_context_' + str(window_size))
    pp.pprint(results)
    return results
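# A minimal sketch for reading back the context-size results pickled above.
# Each pickle holds the results accumulated so far, so the file for the last
# window size (80) contains one score per window size per metric; the
# window-size list must match the one used in the experiments. *task* is
# 'class' or 'retr'.
def print_best_context_sizes(task='class'):
    window_sizes = range(1, 11) + [20, 40, 80]
    results = data.pickle_from_file('output/' + task + '_context_80')
    for metric, scores in results.iteritems():
        best = max(range(len(scores)), key=lambda j: scores[j])
        print metric, ': window size', window_sizes[best], \
            'scored best with', scores[best]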
def test_best_classification():
    """
    Evaluate classification with the configuration found to perform best:
    sentence-context co-occurrence networks with weighted degree centrality.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    rep = []
    print '> Creating representations..'
    for i, text in enumerate(texts):
        if i % 100 == 0:
            print '  ', i
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
        g = None  # drop reference so the graph can be garbage collected
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '  ', score
def complete_network(path='../data/air/problem_descriptions_text'):
    """
    Create and pickle to file a giant co-occurrence network for all
    documents in the dataset pointed to by *path*.
    """
    print '> Reading cases..'
    texts, labels = data.read_files(path)
    print '> Creating graph..'
    g = None
    for i, text in enumerate(texts):
        if i % 10 == 0:
            print str(i) + '/' + str(len(texts))
        tmp = graph_representation.construct_cooccurrence_network(text,
            context='sentence', already_preprocessed=False)
        if g is None:
            g = tmp
        else:
            g.add_nodes_from(tmp.nodes())
            g.add_edges_from(tmp.edges())
    data.pickle_to_file(g, 'output/complete_networks/air_descriptions.pkl')
    pp.pprint(g)
    return g
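# A minimal sketch for loading the pickled giant network back and printing
# its size; the path matches the one used by complete_network above.
def inspect_complete_network(path='output/complete_networks/air_descriptions.pkl'):
    g = data.pickle_from_file(path)
    print 'nodes:', g.number_of_nodes()
    print 'edges:', g.number_of_edges()
    return g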