def classification_comparison_freq(dataset='reuters'): print '> Reading data..', dataset training_path = '../data/' + dataset + '/training_preprocessed' training_docs, training_labels = data.read_files(training_path) test_path = '../data/' + dataset + '/test_preprocessed' test_docs, test_labels = data.read_files(test_path) results = {} for metric in freq_representation.get_metrics(): print ' ', metric, training_dicts = freq_representation.text_to_dict( training_docs, metric) test_dicts = freq_representation.text_to_dict(test_docs, metric) print ' dicst -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors( training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) reps = {'training': training_rep, 'test': test_rep} labels = {'training': training_labels, 'test': test_labels} score = evaluation.evaluate_classification(reps, labels, mode='split') results[metric] = score print score pp.pprint(results) s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str( results) + '\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def classification_comparison_freq(dataset='reuters'): print '> Reading data..', dataset training_path = '../data/'+dataset+'/training_preprocessed' training_docs, training_labels = data.read_files(training_path) test_path = '../data/'+dataset+'/test_preprocessed' test_docs, test_labels = data.read_files(test_path) results = {} for metric in freq_representation.get_metrics(): print ' ', metric, training_dicts = freq_representation.text_to_dict(training_docs, metric) test_dicts = freq_representation.text_to_dict(test_docs, metric) print ' dicst -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors(training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) reps = {'training':training_rep, 'test':test_rep} labels = {'training':training_labels, 'test':test_labels} score = evaluation.evaluate_classification(reps, labels, mode='split') results[metric] = score print score pp.pprint(results) s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def edge_direction_evaluation(direction): """ Evaluate impact of using different edge directions on dependency networks. Values for *direction*: ``forward``, ``backward``, and ``undirected``. """ results = {'_edge-direction':direction} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = [] for i, text in enumerate(texts): if i%100==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_dependency_network(text, direction=direction) metric = graph.GraphMetrics.CLOSENESS d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' score:', score results['classification'] = score print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] for i, text in enumerate(description_texts): if i%100==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, direction=direction) metric = graph.GraphMetrics.EIGENVECTOR d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print ' score:', score results['retrieval'] = score data.pickle_to_file(results, 'output/dependencies/stop_words_retr_'+direction) pp.pprint(results) return results
def classification_demo(): """Function intended to illustrate classification in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Classification' print 'Graph type: Co-occurrence w/2-word window context' print 'Centrality: Weighted degree' print print '> Reading data..' corpus_path = '../data/tasa/TASA900_preprocessed' docs, labels = data.read_files(corpus_path) print '> Creating representations..' dicts = [] for i, doc in enumerate(docs): print ' ',str(i)+'/'+str(len(docs)) g = graph_representation.construct_cooccurrence_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) dicts.append(d) vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_classification(vectors, labels) print ' score:', score print
def test_retrieval(orders=[1,2,3],order_weights=[1.0,1.53,1.51]): """ Test retrieval using different combinations of higher orders and weightings of these. The list *orders* define which higher order relations to include. The relative importance of the orders are defined by *order_weights*. """ print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) filenames = data.get_file_names(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] for i, text in enumerate(description_texts): print ' '+str(i)+"/"+str(len(description_texts)) g = graph_representation.construct_cooccurrence_network(text, orders=orders, order_weights=order_weights, doc_id='output/higher_order/air/'+labels[i]+'/'+filenames[i]) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print 'orders:', orders print 'score:', score fname = 'output/higher_order/results/retr' with open(fname, 'a+') as f: s = reduce(lambda x,y:str(x)+str(y), orders) f.write(str(s)+' '+str(score)+'\n') return score
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]): """ Test classification using different combinations of higher orders and weightings of these. The list *orders* define which higher order relations to include. The relative importance of the orders are defined by *order_weights*. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) filenames = data.get_file_names(path) print '> Creating representations..' rep = [] for i, text in enumerate(texts): print ' '+str(i)+"/"+str(len(texts)) g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i]) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print 'orders:', orders print 'score:', score fname = 'output/higher_order/results/class' with open(fname, 'a+') as f: s = reduce(lambda x,y:str(x)+str(y), orders) f.write(str(s)+' '+str(score)+'\n') return score
def retrieval_demo(): """Function intended to illustrate retrieval in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Retrieval' print 'Graph type: Dependency' print 'Centrality: PageRank' print print '> Reading data..' desc_path = '../data/air/problem_descriptions_dependencies' sol_path = '../data/air/solutions_preprocessed' problems, _ = data.read_files(desc_path) solutions, _ = data.read_files(sol_path) print '> Creating solution representations..' metric = freq_representation.FrequencyMetrics.TF_IDF sol_vectors = freq_representation.text_to_vector(solutions, metric) print '> Creating problem description representations..' dicts = [] for i, doc in enumerate(problems): print ' ',str(i)+'/'+str(len(problems)) g = graph_representation.construct_dependency_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.PAGERANK) dicts.append(d) desc_vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors) print ' score:', score print
def retrieval_demo(): """Function intended to illustrate retrieval in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Retrieval' print 'Graph type: Dependency' print 'Centrality: PageRank' print print '> Reading data..' desc_path = '../data/air/problem_descriptions_dependencies' sol_path = '../data/air/solutions_preprocessed' problems, _ = data.read_files(desc_path) solutions, _ = data.read_files(sol_path) print '> Creating solution representations..' metric = freq_representation.FrequencyMetrics.TF_IDF sol_vectors = freq_representation.text_to_vector(solutions, metric) print '> Creating problem description representations..' dicts = [] for i, doc in enumerate(problems): print ' ', str(i) + '/' + str(len(problems)) g = graph_representation.construct_dependency_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.PAGERANK) dicts.append(d) desc_vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors) print ' score:', score print
def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector( solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i % 1 == 0: print ' document', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_cooccurrence_network( text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res') return results
def classification_demo(): """Function intended to illustrate classification in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Classification' print 'Graph type: Co-occurrence w/2-word window context' print 'Centrality: Weighted degree' print print '> Reading data..' corpus_path = '../data/tasa/TASA900_preprocessed' docs, labels = data.read_files(corpus_path) print '> Creating representations..' dicts = [] for i, doc in enumerate(docs): print ' ', str(i) + '/' + str(len(docs)) g = graph_representation.construct_cooccurrence_network(doc) d = graph_representation.graph_to_dict( g, graph.GraphMetrics.WEIGHTED_DEGREE) dicts.append(d) vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_classification(vectors, labels) print ' score:', score print
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False): """ Experiment used for comparative evaluation of different network representations on retrieval. graph_type = 'co-occurrence' | 'dependency' `icc` determines whether to use _inverse corpus centrality_ in the vector representations. """ def make_dicts(docs, icc=None): rep = [] for i, doc in enumerate(docs): if i%100==0: print ' graph',str(i)+'/'+str(len(docs)) g = gfuns[graph_type](doc) d = graph_representation.graph_to_dict(g, metrics[graph_type], icc) rep.append(d) return rep postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'} gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network, 'dependency':graph_representation.construct_dependency_network} metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE, 'dependency':graph.GraphMetrics.EIGENVECTOR} print '--', graph_type print '> Reading data..', dataset path = '../data/'+dataset+'/problem_descriptions'+postfix[graph_type] docs, labels = data.read_files(path) print '> Creating solution representations..' solutions_path = '../data/'+dataset+'/solutions_preprocessed' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) icc = None if use_icc: print '> Calculating ICC..' m = metrics[graph_type].split()[0] print graph_type if graph_type == 'co-occurrence': p = 'output/centralities/co-occurrence/'+dataset+'/problem_descriptions/window/'+m+'.cent' elif graph_type == 'dependency': p = 'output/centralities/dependency/'+dataset+'/problem_descriptions/'+m+'.cent' print ' fetching', p icc = data.pickle_from_file(p) print ' icc:', type(icc) print '> Creating problem description representations..' dicts = make_dicts(docs, icc) descriptions_rep = graph_representation.dicts_to_vectors(dicts)#, remove_stop_words=True) print '> Evaluating..' results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep) print results s = 'retrieval comparison ' if use_icc: s += 'USING TC-ICC' s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/retrieval') return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted': weighted, '_evaluation': 'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i % 100 == 0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file( rep, 'output/dependencies/exp1_retr_tmp_' + str(i) + '_' + postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix) pp.pprint(results) return results
def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i%1==0: print ' document',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_cooccurrence_network(text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/retrieval.res') return results
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/' + corpus + '_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_cooccurrence_network( text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/classification.res') return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted':weighted, '_evaluation':'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i%10==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i%100==0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(rep, 'output/dependencies/exp1_retr_tmp_'+str(i)+'_'+postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr'+postfix) pp.pprint(results) return results
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/'+corpus+'_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i%10==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_cooccurrence_network(text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res') return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i % 10 == 0: print i g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_' + str(window_size)) pp.pprint(results) return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i%10==0: print i g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_'+str(window_size)) pp.pprint(results) return results
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_' + str(window_size)) pp.pprint(results) return results
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_'+str(window_size)) pp.pprint(results) return results
def test_best_classification(): print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) rep = [] print '> Creating representations..' for i, text in enumerate(texts): if i%100==0: print ' ',i g = graph_representation.construct_cooccurrence_network(text, context='sentence') d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' ', score
def test_best_classification(): print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) rep = [] print '> Creating representations..' for i, text in enumerate(texts): if i % 100 == 0: print ' ', i g = graph_representation.construct_cooccurrence_network( text, context='sentence') d = graph_representation.graph_to_dict( g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' ', score
def edge_direction_evaluation(direction): """ Evaluate impact of using different edge directions on dependency networks. Values for *direction*: ``forward``, ``backward``, and ``undirected``. """ results = {'_edge-direction': direction} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = [] for i, text in enumerate(texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_dependency_network( text, direction=direction) metric = graph.GraphMetrics.CLOSENESS d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' score:', score results['classification'] = score print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] for i, text in enumerate(description_texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, direction=direction) metric = graph.GraphMetrics.EIGENVECTOR d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print ' score:', score results['retrieval'] = score data.pickle_to_file(results, 'output/dependencies/stop_words_retr_' + direction) pp.pprint(results) return results
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None): """ Experiment used for comparative evaluation of different network representations on classification. graph_type = 'co-occurrence' | 'dependency' `icc` determines whether to use _inverse corpus centrality_ in the vector representations. """ import co_occurrence_experiments import dependency_experiments def make_dicts(docs, icc): rep = [] for i, doc in enumerate(docs): if i%100==0: print ' graph',str(i)+'/'+str(len(docs)) g = gfuns[graph_type](doc) d = graph_representation.graph_to_dict(g, metrics[graph_type], icc) rep.append(d) return rep postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'} gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network, 'dependency':graph_representation.construct_dependency_network} metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE, 'dependency':graph.GraphMetrics.CLOSENESS} print '--', graph_type print '> Reading data..', dataset training_path = '../data/'+dataset+'/training'+postfix[graph_type] training_docs, training_labels = data.read_files(training_path) test_path = '../data/'+dataset+'/test'+postfix[graph_type] test_docs, test_labels = data.read_files(test_path) icc_training = None icc_test = None if icc: print '> Calculating ICC..' if graph_type is 'co-occurrence': icc_training = co_occurrence_experiments.retrieve_centralities(dataset+'/training', 'sentence', metrics[graph_type]) elif graph_type is 'dependency': icc_training = dependency_experiments.retrieve_centralities(dataset+'/training', metrics[graph_type]) if graph_type is 'co-occurrence': icc_test = co_occurrence_experiments.retrieve_centralities(dataset+'/test', 'sentence', metrics[graph_type]) elif graph_type is 'dependency': icc_test = dependency_experiments.retrieve_centralities(dataset+'/test', metrics[graph_type]) print '> Creating representations..' training_dicts = make_dicts(training_docs, icc_training) test_dicts = make_dicts(test_docs, icc_test) print ' dicts -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) keys = list(keys) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors(training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) print '> Evaluating..' reps = {'training':training_rep, 'test':test_rep} labels = {'training':training_labels, 'test':test_labels} results = evaluation.evaluate_classification(reps, labels, mode='split') print results s = 'classification comparison ' if icc: s += 'USING TC-ICC' s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def evaluate_dep_types(): """ Leave-one-out evaluation of the various dependency types from the stanford parser. """ exclude_list = [ 'dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp', 'attr', 'ccomp', 'xcomp', 'complm', 'dobj', 'iobj', 'pobj', 'mark', 'rel', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass', 'cc', 'conj', 'expl', 'abbrev', 'amod', 'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj', 'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod', 'quantmod', 'tmod', 'nn', 'npadvmod', 'num', 'number', 'prep', 'poss', 'possessive', 'prt', 'parataxis', 'punct', 'ref', 'xsubj', 'pcomp', 'prepc' ] results = {'classification': [], 'retrieval': []} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = {} for exclude_label in exclude_list: rep[exclude_label] = [] metric = graph.GraphMetrics.CLOSENESS for i, text in enumerate(texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(texts)) full_graph = graph_representation.construct_dependency_network(text) for exclude_label in exclude_list: g = graph.reduce_edge_set(full_graph, exclude_label) d = graph_representation.graph_to_dict(g, metric) rep[exclude_label].append(d) g = None # just to make sure.. full_graph = None for exclude_label in exclude_list: rep[exclude_label] = graph_representation.dicts_to_vectors( rep[exclude_label]) print '> Evaluating..' for exclude_label in exclude_list: score = evaluation.evaluate_classification(rep[exclude_label], labels) print ' ', exclude_label, score results['classification'].append(score) data.pickle_to_file(results, 'output/dependencies/types_eval_tmp') print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = {} for exclude_label in exclude_list: rep[exclude_label] = [] metric = graph.GraphMetrics.EIGENVECTOR for i, text in enumerate(description_texts): if i % 1 == 0: print ' ', str(i) + '/' + str(len(description_texts)) full_graph = graph_representation.construct_dependency_network(text) for exclude_label in exclude_list: g = graph.reduce_edge_set(full_graph, exclude_label) d = graph_representation.graph_to_dict(g, metric) rep[exclude_label].append(d) g = None # just to make sure.. full_graph = None #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i)) for exclude_label in exclude_list: rep[exclude_label] = graph_representation.dicts_to_vectors( rep[exclude_label]) print '> Evaluating..' for exclude_label in exclude_list: score = evaluation.evaluate_retrieval(rep[exclude_label], solution_vectors) print ' ', exclude_label, score results['retrieval'].append(score) pp.pprint(results) data.pickle_to_file(results, 'output/dependencies/types_eval') return results
def evaluate_dep_types(): """ Leave-one-out evaluation of the various dependency types from the stanford parser. """ exclude_list = ['dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp', 'attr', 'ccomp', 'xcomp', 'complm', 'dobj', 'iobj', 'pobj', 'mark', 'rel', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass', 'cc', 'conj', 'expl', 'abbrev', 'amod', 'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj', 'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod', 'quantmod', 'tmod', 'nn', 'npadvmod', 'num', 'number', 'prep', 'poss', 'possessive', 'prt', 'parataxis', 'punct', 'ref', 'xsubj', 'pcomp', 'prepc'] results = {'classification':[], 'retrieval':[]} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = {} for exclude_label in exclude_list: rep[exclude_label] = [] metric = graph.GraphMetrics.CLOSENESS for i, text in enumerate(texts): if i%10==0: print ' ',str(i)+'/'+str(len(texts)) full_graph = graph_representation.construct_dependency_network(text) for exclude_label in exclude_list: g = graph.reduce_edge_set(full_graph, exclude_label) d = graph_representation.graph_to_dict(g, metric) rep[exclude_label].append(d) g = None # just to make sure.. full_graph = None for exclude_label in exclude_list: rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label]) print '> Evaluating..' for exclude_label in exclude_list: score = evaluation.evaluate_classification(rep[exclude_label], labels) print ' ', exclude_label, score results['classification'].append(score) data.pickle_to_file(results, 'output/dependencies/types_eval_tmp') print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = {} for exclude_label in exclude_list: rep[exclude_label] = [] metric = graph.GraphMetrics.EIGENVECTOR for i, text in enumerate(description_texts): if i%1==0: print ' ',str(i)+'/'+str(len(description_texts)) full_graph = graph_representation.construct_dependency_network(text) for exclude_label in exclude_list: g = graph.reduce_edge_set(full_graph, exclude_label) d = graph_representation.graph_to_dict(g, metric) rep[exclude_label].append(d) g = None # just to make sure.. full_graph = None #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i)) for exclude_label in exclude_list: rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label]) print '> Evaluating..' for exclude_label in exclude_list: score = evaluation.evaluate_retrieval(rep[exclude_label], solution_vectors) print ' ', exclude_label, score results['retrieval'].append(score) pp.pprint(results) data.pickle_to_file(results, 'output/dependencies/types_eval') return results
def evaluate_dep_type_sets(): """ Evaluation of various sets of dependency relations. Each set is excluded from the representation, and the performance recorded. The best strategy is to exclude those dependencies which removal lead to the greatest imporovement for the representation. """ strategies = { 'defensive': ['agent', 'advcl', 'parataxis'], 'aggressive': ['agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp', 'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss', 'prepc'], 'compromise_1': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc'], 'compromise_2': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc', 'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct', 'ref'] } results = {'classification':{}, 'retrieval':{}} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = {} for strategy in strategies: rep[strategy] = [] metric = graph.GraphMetrics.CLOSENESS for i, text in enumerate(texts): if i%10==0: print ' ',str(i)+'/'+str(len(texts)) for strategy in strategies: g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy]) d = graph_representation.graph_to_dict(g, metric) rep[strategy].append(d) g = None # just to make sure. I don't trust this damn garbage collector... for strategy in strategies: rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy]) print '> Evaluating..' for strategy in strategies: score = evaluation.evaluate_classification(rep[strategy], labels) print ' ', strategy, score results['classification'][strategy] = score data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp') print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = {} for strategy in strategies: rep[strategy] = [] metric = graph.GraphMetrics.EIGENVECTOR for i, text in enumerate(description_texts): if i%1==0: print ' ',str(i)+'/'+str(len(description_texts)) full_graph = graph_representation.construct_dependency_network(text) for strategy in strategies: g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy]) d = graph_representation.graph_to_dict(g, metric) rep[strategy].append(d) g = None # just to make sure.. full_graph = None #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i)) for strategy in strategies: rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy]) print '> Evaluating..' for strategy in strategies: score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors) print ' ', strategy, score results['retrieval'][strategy] = score pp.pprint(results) data.pickle_to_file(results, 'output/dependencies/types_set_eval') return results
def stop_word_evaluation(rem_stop_words): """ Experiment for determining what effect removing stop words have on dependency networks. """ results = {'_removing stop-words':rem_stop_words} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = [] total_nodes = 0 for i, text in enumerate(texts): if i%100==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words) total_nodes += len(g.nodes()) metric = graph.GraphMetrics.CLOSENESS d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' score:', score print '(the networks had a total of',total_nodes,'nodes)' results['classification'] = score print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] total_nodes = 0 for i, text in enumerate(description_texts): if i%100==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words) total_nodes += len(g.nodes()) metric = graph.GraphMetrics.EIGENVECTOR d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print ' score:', score print '(the networks had a total of',total_nodes,'nodes)' results['retrieval'] = score if rem_stop_words: postfix = '_without' else: postfix = '_with' data.pickle_to_file(results, 'output/dependencies/stop_words_retr'+postfix) pp.pprint(results) return results
def stop_word_evaluation(rem_stop_words): """ Experiment for determining what effect removing stop words have on dependency networks. """ results = {'_removing stop-words': rem_stop_words} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = [] total_nodes = 0 for i, text in enumerate(texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_dependency_network( text, remove_stop_words=rem_stop_words) total_nodes += len(g.nodes()) metric = graph.GraphMetrics.CLOSENESS d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' score:', score print '(the networks had a total of', total_nodes, 'nodes)' results['classification'] = score print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] total_nodes = 0 for i, text in enumerate(description_texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, remove_stop_words=rem_stop_words) total_nodes += len(g.nodes()) metric = graph.GraphMetrics.EIGENVECTOR d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print ' score:', score print '(the networks had a total of', total_nodes, 'nodes)' results['retrieval'] = score if rem_stop_words: postfix = '_without' else: postfix = '_with' data.pickle_to_file(results, 'output/dependencies/stop_words_retr' + postfix) pp.pprint(results) return results
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None): """ Experiment used for comparative evaluation of different network representations on classification. graph_type = 'co-occurrence' | 'dependency' `icc` determines whether to use _inverse corpus centrality_ in the vector representations. """ import co_occurrence_experiments import dependency_experiments def make_dicts(docs, icc): rep = [] for i, doc in enumerate(docs): if i % 100 == 0: print ' graph', str(i) + '/' + str(len(docs)) g = gfuns[graph_type](doc) d = graph_representation.graph_to_dict(g, metrics[graph_type], icc) rep.append(d) return rep postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'} gfuns = { 'co-occurrence': graph_representation.construct_cooccurrence_network, 'dependency': graph_representation.construct_dependency_network } metrics = { 'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE, 'dependency': graph.GraphMetrics.CLOSENESS } print '--', graph_type print '> Reading data..', dataset training_path = '../data/' + dataset + '/training' + postfix[graph_type] training_docs, training_labels = data.read_files(training_path) test_path = '../data/' + dataset + '/test' + postfix[graph_type] test_docs, test_labels = data.read_files(test_path) icc_training = None icc_test = None if icc: print '> Calculating ICC..' if graph_type is 'co-occurrence': icc_training = co_occurrence_experiments.retrieve_centralities( dataset + '/training', 'sentence', metrics[graph_type]) elif graph_type is 'dependency': icc_training = dependency_experiments.retrieve_centralities( dataset + '/training', metrics[graph_type]) if graph_type is 'co-occurrence': icc_test = co_occurrence_experiments.retrieve_centralities( dataset + '/test', 'sentence', metrics[graph_type]) elif graph_type is 'dependency': icc_test = dependency_experiments.retrieve_centralities( dataset + '/test', metrics[graph_type]) print '> Creating representations..' training_dicts = make_dicts(training_docs, icc_training) test_dicts = make_dicts(test_docs, icc_test) print ' dicts -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) keys = list(keys) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors(training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) print '> Evaluating..' reps = {'training': training_rep, 'test': test_rep} labels = {'training': training_labels, 'test': test_labels} results = evaluation.evaluate_classification(reps, labels, mode='split') print results s = 'classification comparison ' if icc: s += 'USING TC-ICC' s += '\nrepresentation: ' + graph_type + '\nresult: ' + str( results) + '\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False): """ Experiment used for comparative evaluation of different network representations on retrieval. graph_type = 'co-occurrence' | 'dependency' `icc` determines whether to use _inverse corpus centrality_ in the vector representations. """ def make_dicts(docs, icc=None): rep = [] for i, doc in enumerate(docs): if i % 100 == 0: print ' graph', str(i) + '/' + str(len(docs)) g = gfuns[graph_type](doc) d = graph_representation.graph_to_dict(g, metrics[graph_type], icc) rep.append(d) return rep postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'} gfuns = { 'co-occurrence': graph_representation.construct_cooccurrence_network, 'dependency': graph_representation.construct_dependency_network } metrics = { 'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE, 'dependency': graph.GraphMetrics.EIGENVECTOR } print '--', graph_type print '> Reading data..', dataset path = '../data/' + dataset + '/problem_descriptions' + postfix[graph_type] docs, labels = data.read_files(path) print '> Creating solution representations..' solutions_path = '../data/' + dataset + '/solutions_preprocessed' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector( solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) icc = None if use_icc: print '> Calculating ICC..' m = metrics[graph_type].split()[0] print graph_type if graph_type == 'co-occurrence': p = 'output/centralities/co-occurrence/' + dataset + '/problem_descriptions/window/' + m + '.cent' elif graph_type == 'dependency': p = 'output/centralities/dependency/' + dataset + '/problem_descriptions/' + m + '.cent' print ' fetching', p icc = data.pickle_from_file(p) print ' icc:', type(icc) print '> Creating problem description representations..' dicts = make_dicts(docs, icc) descriptions_rep = graph_representation.dicts_to_vectors( dicts) #, remove_stop_words=True) print '> Evaluating..' results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep) print results s = 'retrieval comparison ' if use_icc: s += 'USING TC-ICC' s += '\nrepresentation: ' + graph_type + '\nresult: ' + str( results) + '\n\n\n' data.write_to_file(s, 'output/comparison/retrieval') return results
def evaluate_dep_type_sets(): """ Evaluation of various sets of dependency relations. Each set is excluded from the representation, and the performance recorded. The best strategy is to exclude those dependencies which removal lead to the greatest imporovement for the representation. """ strategies = { 'defensive': ['agent', 'advcl', 'parataxis'], 'aggressive': [ 'agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp', 'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss', 'prepc' ], 'compromise_1': [ 'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc' ], 'compromise_2': [ 'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc', 'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct', 'ref' ] } results = {'classification': {}, 'retrieval': {}} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = {} for strategy in strategies: rep[strategy] = [] metric = graph.GraphMetrics.CLOSENESS for i, text in enumerate(texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(texts)) for strategy in strategies: g = graph_representation.construct_dependency_network( text, exclude=strategies[strategy]) d = graph_representation.graph_to_dict(g, metric) rep[strategy].append(d) g = None # just to make sure. I don't trust this damn garbage collector... for strategy in strategies: rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy]) print '> Evaluating..' for strategy in strategies: score = evaluation.evaluate_classification(rep[strategy], labels) print ' ', strategy, score results['classification'][strategy] = score data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp') print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = {} for strategy in strategies: rep[strategy] = [] metric = graph.GraphMetrics.EIGENVECTOR for i, text in enumerate(description_texts): if i % 1 == 0: print ' ', str(i) + '/' + str(len(description_texts)) full_graph = graph_representation.construct_dependency_network(text) for strategy in strategies: g = graph_representation.construct_dependency_network( text, exclude=strategies[strategy]) d = graph_representation.graph_to_dict(g, metric) rep[strategy].append(d) g = None # just to make sure.. full_graph = None #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i)) for strategy in strategies: rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy]) print '> Evaluating..' for strategy in strategies: score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors) print ' ', strategy, score results['retrieval'][strategy] = score pp.pprint(results) data.pickle_to_file(results, 'output/dependencies/types_set_eval') return results