def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector( solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i % 1 == 0: print ' document', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_cooccurrence_network( text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res') return results
def do_context_sentence_evaluation_classification(): """ Experiment evaluating performance of sentences as contexts for co-occurrence networks in the classification task. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) print '> Evaluating..' graphs = [] results = {} for text in texts: g = graph_representation.construct_cooccurrence_network( text, context='sentence') graphs.append(g) for metric in graph_representation.get_metrics(): print ' ', metric vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True) score = evaluation.evaluate_classification(vectors, labels) results[metric + ' (sentence)'] = score data.pickle_to_file(results, 'output/class_context_sentence') pp.pprint(results) return results
def do_context_sentence_evaluation_classification(): """ Experiment evaluating performance of sentences as contexts for co-occurrence networks in the classification task. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) print '> Evaluating..' graphs = [] results = {} for text in texts: g = graph_representation.construct_cooccurrence_network(text, context='sentence') graphs.append(g) for metric in graph_representation.get_metrics(): print ' ', metric vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True) score = evaluation.evaluate_classification(vectors, labels) results[metric+' (sentence)'] = score data.pickle_to_file(results, 'output/class_context_sentence') pp.pprint(results) return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted': weighted, '_evaluation': 'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i % 100 == 0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file( rep, 'output/dependencies/exp1_retr_tmp_' + str(i) + '_' + postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix) pp.pprint(results) return results
def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i%1==0: print ' document',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_cooccurrence_network(text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/retrieval.res') return results
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/' + corpus + '_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_cooccurrence_network( text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/classification.res') return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted':weighted, '_evaluation':'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i%10==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i%100==0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(rep, 'output/dependencies/exp1_retr_tmp_'+str(i)+'_'+postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr'+postfix) pp.pprint(results) return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i % 10 == 0: print i g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_' + str(window_size)) pp.pprint(results) return results
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/'+corpus+'_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i%10==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_cooccurrence_network(text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res') return results
def do_retrieval_experiments( descriptions='air/problem_descriptions', solutions='air/solutions', graph_types=['co-occurrence', 'dependency', 'random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on the retrieval task. Toggle comparison with frequency-based methods using *use_frequency*. """ results = { '_solutions': solutions, '_descriptions': descriptions, '_evaluation': 'retrieval' } print '> Evaluation type: retrieval' print '> Reading cases..' descriptions_path = '../data/' + descriptions descriptiondata = data.read_data(descriptions_path, graph_types) solutions_path = '../data/' + solutions + '_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' for gtype in graph_types: print ' ', gtype docs, labels = descriptiondata[gtype] graphs = graph_representation.create_graphs(docs, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_retrieval( vectors, solution_vectors) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric docs, labels = data.read_files(descriptions_path + '_preprocessed') vectors = freq_representation.text_to_vector(docs, metric) results['freq'][metric] = evaluation.evaluate_retrieval( vectors, solution_vectors) print pp.pprint(results) return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i%10==0: print i g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_'+str(window_size)) pp.pprint(results) return results
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_' + str(window_size)) pp.pprint(results) return results
def do_retrieval_experiments(descriptions='air/problem_descriptions', solutions='air/solutions', graph_types=['co-occurrence','dependency','random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on the retrieval task. Toggle comparison with frequency-based methods using *use_frequency*. """ results = {'_solutions':solutions, '_descriptions':descriptions, '_evaluation':'retrieval'} print '> Evaluation type: retrieval' print '> Reading cases..' descriptions_path = '../data/'+descriptions descriptiondata = data.read_data(descriptions_path, graph_types) solutions_path = '../data/'+solutions+'_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' for gtype in graph_types: print ' ',gtype docs, labels = descriptiondata[gtype] graphs = graph_representation.create_graphs(docs, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric docs, labels = data.read_files(descriptions_path+'_preprocessed') vectors = freq_representation.text_to_vector(docs, metric) results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors) print pp.pprint(results) return results
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_'+str(window_size)) pp.pprint(results) return results
def store_centralities(corpus, context): print '> Calculating and storing centralities for', corpus g = retrieve_corpus_network(corpus, context) metrics = graph_representation.get_metrics(True, exclude_flow=True) for metric in metrics: m = metric.split()[0] store_path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping:', metric continue else: print ' calculating:', metric try: c = graph.centralities(g, metric) data.pickle_to_file(c, store_path) except MemoryError as e: print 'MemoryError :(' data.write_to_file('MemoryError while claculating '+metric+' on '+corpus+':\n'+str(e)+'\n\n', 'output/log/errors')
def store_centralities(corpus, context): print '> Calculating and storing centralities for', corpus g = retrieve_corpus_network(corpus, context) metrics = graph_representation.get_metrics(True, exclude_flow=True) for metric in metrics: m = metric.split()[0] store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping:', metric continue else: print ' calculating:', metric try: c = graph.centralities(g, metric) data.pickle_to_file(c, store_path) except MemoryError as e: print 'MemoryError :(' data.write_to_file( 'MemoryError while claculating ' + metric + ' on ' + corpus + ':\n' + str(e) + '\n\n', 'output/log/errors')
def do_classification_experiments( dataset='tasa/TASA900', graph_types=['co-occurrence', 'dependency', 'random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on classification. Toggle comparison with frequency-based methods using *use_frequency*. """ results = {'_dataset': dataset, '_evaluation': 'classification'} print '> Evaluation type: classification' print '> Reading data..', dataset corpus_path = '../data/' + dataset docdata = data.read_data(corpus_path, graph_types) print '> Evaluating..' for gtype in graph_types: print ' ', gtype documents, labels = docdata[gtype] graphs = graph_representation.create_graphs(documents, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_classification( vectors, labels) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric documents, labels = data.read_files(corpus_path + '_preprocessed') vectors = freq_representation.text_to_vector(documents, metric) results['freq'][metric] = evaluation.evaluate_classification( vectors, labels) print pp.pprint(results) return results
def do_classification_experiments(dataset='tasa/TASA900', graph_types = ['co-occurrence','dependency','random'], use_frequency = True): """ Experiment used for comparative evaluation of different network representations on classification. Toggle comparison with frequency-based methods using *use_frequency*. """ results = {'_dataset':dataset, '_evaluation':'classification'} print '> Evaluation type: classification' print '> Reading data..', dataset corpus_path = '../data/'+dataset docdata = data.read_data(corpus_path, graph_types) print '> Evaluating..' for gtype in graph_types: print ' ',gtype documents, labels = docdata[gtype] graphs = graph_representation.create_graphs(documents, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_classification(vectors, labels) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric documents, labels = data.read_files(corpus_path+'_preprocessed') vectors = freq_representation.text_to_vector(documents, metric) results['freq'][metric] = evaluation.evaluate_classification(vectors, labels) print pp.pprint(results) return results