def get_new_edges(data_type, construction):
    """Return, per sentence, the dependency edges that the TAG
    transformation introduced: edges present after ``transform`` but
    absent from the parser's predicted dependencies, with self-loop
    edges (identical endpoints) dropped.
    """
    prop_file = 'd6.treeproperties'
    props = get_t2props_dict(prop_file)
    topsubs = get_t2topsub_dict(prop_file)
    ## get predicted_dependencies and apply transformations
    deps_all = read_data(construction, data_type)
    unbounded_all = read_unbounded(construction, data_type)
    sents_all = read_stags(construction, data_type, 'sents')
    stags_all = read_stags(construction, data_type)
    pos_all = read_stags(construction, data_type, 'predicted_pos')

    new_edges = []
    # Iterate over as many sentences as there are unbounded-dependency
    # entries; the other per-sentence lists are indexed in lockstep.
    for idx in range(len(unbounded_all)):
        words = sents_all[idx]
        deps = deps_all[idx]
        ## TAG analysis
        transformed = transform(props, topsubs, words, deps,
                                stags_all[idx], pos_all[idx])
        # Keep only the edges the transformation added, excluding
        # degenerate self-loops.
        added = [edge for edge in set(transformed) - set(deps)
                 if edge[0] != edge[1]]
        new_edges.append(added)
    return new_edges
def output_conllu(filename, sents, pos, stags, arcs, rels, dependencies,
                  new_edges, output_dir, result_file):
    """Render unbounded-dependency examples as Graphviz ``.gv`` files.

    For each selected (sentence, dependency) pair a small CoNLL-U-style
    fragment is assembled — the unbounded edge, a window of tokens with
    their predicted arcs, and any transformation-added edges — parsed
    into a DependencyGraph, converted to DOT, colored (red for the gold
    unbounded edge, blue for transformation-added edges) and written to
    ``<output_dir>/sent<i>_dep<j>_correct<s>.gv``.

    NOTE(review): ``filename`` is never used in this body — confirm it
    can be dropped.  The sentence index (21) and the token window
    (25..33) are hard-coded debug values; the general computations are
    in the commented-out lines below.
    """
    # Map (sent_idx, dep_idx) -> score parsed from result_file; each line
    # is whitespace-separated: "sent_idx dep_idx score".
    scores = {}
    with open(result_file) as fin:
        for line in fin:
            line = line.split()
            scores[(int(line[0]), int(line[1]))] = int(line[2])
    tree_prop_file = 'd6.treeproperties'
    # NOTE(review): neither dict below is referenced again in this
    # function — possibly leftover setup.
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    #for sent_idx in range(len(sents)):
    for sent_idx in [21]:  # hard-coded debug sentence index
        deps_sent = dependencies[sent_idx]
        for dep_idx, dep in enumerate(deps_sent):
            unbounded_dep = dep
            # Token window to render (general min/max forms commented out).
            #start = min(int(dep[0]), int(dep[1]))-1
            start = 25
            #end = max(int(dep[0]), int(dep[1]))+1
            end = 33
            conllu = ''
            sent = sents[sent_idx]
            pos_sent = pos[sent_idx]
            stags_sent = stags[sent_idx]
            arcs_sent = arcs[sent_idx]
            rels_sent = rels[sent_idx]
            # Row for the unbounded dependency itself.  Columns follow the
            # CoNLL layout (id, form_stag, _, stag, pos, _, head, rel, _, _);
            # dep[1] is written as the token id and dep[0] as its head —
            # presumably dep = (head, dependent, label), 1-based ids.
            token_idx = int(dep[1])
            output_list = [
                str(token_idx),
                sent[token_idx - 1] + '_' + stags_sent[token_idx - 1], '_',
                stags_sent[token_idx - 1], pos_sent[token_idx - 1], '_',
                str(dep[0]), dep[2], '_', '_'
            ]
            conllu += '\t'.join(output_list)
            conllu += '\n'
            # Rows for every token in the [start, end] window with the
            # parser's predicted arc and relation.
            for token_idx in range(len(sent)):
                if token_idx >= start and token_idx <= end:
                    #if arcs_sent[token_idx] >= start and arcs_sent[token_idx] <= end:
                    output_list = [
                        str(token_idx + 1),
                        sent[token_idx] + '_' + stags_sent[token_idx], '_',
                        stags_sent[token_idx], pos_sent[token_idx], '_',
                        str(arcs_sent[token_idx]), rels_sent[token_idx],
                        '_', '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            # Rows for transformation-added edges whose dependent falls in
            # the window.  NOTE(review): new_idx is unused.
            for new_idx, dep in enumerate(new_edges[sent_idx]):
                if dep[0] >= start and dep[0] <= end:
                    #if dep[1] >= start and dep[1] <= end:
                    token_idx = int(dep[0])
                    output_list = [
                        str(token_idx),
                        sent[token_idx - 1] + '_' + stags_sent[token_idx - 1],
                        '_', stags_sent[token_idx - 1],
                        pos_sent[token_idx - 1], '_', str(dep[1]), dep[2],
                        '_', '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            graph = DependencyGraph(conllu)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(
                output_dir,
                'sent{}_dep{}_correct{}.gv'.format(sent_idx, dep_idx,
                                                   scores[(sent_idx, dep_idx)]))
            dot_string = graph.to_dot()
            ## add colors
            new_dot_string = ''
            # DOT edge lines that correspond to transformation-added edges.
            new_lines = [
                '{} -> {} [label="{}"]'.format(dep[1], dep[0], dep[2])
                for dep in new_edges[sent_idx]
            ]
            for line in dot_string.split('\n'):
                line = line.strip()
                if line == '{} -> {} [label="{}"]'.format(
                        unbounded_dep[0], unbounded_dep[1], unbounded_dep[2]):
                    # Gold unbounded edge: flip direction and paint red.
                    line = '{} -> {} [label="{}", color="red"]'.format(
                        unbounded_dep[1], unbounded_dep[0], unbounded_dep[2])
                elif line in new_lines:
                    # Transformation-added edge: paint blue (inserted just
                    # before the closing bracket).
                    line = line[:-1] + ', color="blue"]'
                new_dot_string += line
                new_dot_string += '\n'
            with open(output_file, 'wt') as fout:
                fout.write(new_dot_string)
def evaluate(corpus_data_type, debug=False, input_data_type=None):
    """Evaluate TAG-transformed predicted dependencies against gold
    unbounded dependencies for each construction.

    Writes one "sent_idx dep_idx success" line per gold dependency to
    ``<construction>/results/test/results.txt`` and prints per-construction
    and overall accuracies.

    :param corpus_data_type: split used to read the gold unbounded deps.
    :param debug: restrict to one construction and one sentence index.
    :param input_data_type: split used for predicted data; defaults to
        ``corpus_data_type``.
    """
    if input_data_type is None:
        input_data_type = corpus_data_type
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    if debug:
        #constructions = ['sbj_embedded']
        #constructions = ['obj_qus']
        #constructions = ['obj_extract_red_rel']
        constructions = ['right_node_raising']
    else:
        constructions = ['obj_extract_rel_clause', 'obj_extract_red_rel',
                         'sbj_extract_rel_clause', 'obj_free_rels',
                         'obj_qus', 'right_node_raising', 'sbj_embedded']
        #constructions = ['obj_qus']
    all_total = 0
    all_correct = 0
    nb_constructions = 0
    total_scores = 0
    for construction in constructions:
        ## get predicted_dependencies and apply transformations
        result_dir = os.path.join(construction, 'results', 'test')
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        predicted_dependencies = read_data(construction, input_data_type)
        unbounded_dependencies = read_unbounded(construction,
                                                corpus_data_type)
        sents = read_stags(construction, input_data_type, 'sents')
        predicted_stags = read_stags(construction, input_data_type)
        predicted_pos = read_stags(construction, input_data_type,
                                   'predicted_pos')
        #assert(len(predicted_dependencies) == len(unbounded_dependencies))
        total = 0
        correct = 0
        if debug:
            # Hard-coded debug sentence index.
            sent_idxes = [70]
        else:
            sent_idxes = range(len(unbounded_dependencies))
        with open(os.path.join(result_dir, 'results.txt'), 'wt') as fout:
            for sent_idx in sent_idxes:
                #for sent_idx in [73]:
                sent = sents[sent_idx]
                ## TAG analysis
                predicted_dependencies_sent = predicted_dependencies[sent_idx]
                predicted_stags_sent = predicted_stags[sent_idx]
                predicted_pos_sent = predicted_pos[sent_idx]
                transformed_sent = transform(t2props_dict, t2topsub_dict,
                                             sent,
                                             predicted_dependencies_sent,
                                             predicted_stags_sent,
                                             predicted_pos_sent)
                #transformed_sent = predicted_dependencies_sent
                #print(transformed_sent)
                assert(len(sent) == len(predicted_stags_sent))
                unbounded_dependencies_sent = unbounded_dependencies[sent_idx]
                for dep_idx, dep in enumerate(unbounded_dependencies_sent):
                    total += 1
                    all_total += 1
                    # Map the gold relation label (dep[2]) to the expected
                    # post-transformation edge label ('0'/'1'/'ADJ').
                    if 'nsubj' == dep[2]:
                        new_dep = (dep[0], dep[1], '0')
                        if construction == 'sbj_embedded':
                            # Hand-picked exceptions for specific examples.
                            if (sent_idx, dep_idx) in [(77, 0), (42, 0)]:
                                new_dep = tuple([dep[0], dep[1], '1'])  ## causative-inchoative
                    elif 'dobj' == dep[2]:
                        new_dep = tuple([dep[0], dep[1], '1'])
                        if construction == 'obj_qus':
                            if sent[0].lower() in ['where']:
                                new_dep = tuple([dep[0], dep[1], 'ADJ'])
                    elif 'pobj' == dep[2]:
                        new_dep = tuple([dep[0], dep[1], '1'])
                    elif 'nsubjpass' in dep[2]:
                        new_dep = (dep[0], dep[1], '1')
                    elif 'advmod' in dep[2]:
                        # NOTE(review): both branches are identical — the
                        # 'out' special case has no effect as written.
                        if sent[dep[0]-1] == 'out':
                            new_dep = (dep[0], dep[1], 'ADJ')
                        else:
                            new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'prep' in dep[2]:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'infmod' in dep[2]:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'obj2' in dep[2]:
                        new_dep = (dep[0], dep[1], '1')
                    elif 'cop' in dep[2]:
                        new_dep = (dep[0], dep[1], '0')
                    else:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    # A hit requires the exact (head, dependent, label)
                    # tuple to appear in the transformed parse.
                    if new_dep in transformed_sent:
                        correct += 1
                        all_correct += 1
                        success = 1
                    else:
                        success = 0
                    fout.write(' '.join([str(sent_idx), str(dep_idx),
                                         str(success)]))
                    fout.write('\n')
        print('Construction: {}'.format(construction))
        print('# total: {}'.format(total))
        print('# correct: {}'.format(correct))
        print('Accuracy: {}'.format(float(correct)/total))
        total_scores += float(correct)/total
        nb_constructions += 1
        #print(predicted_dependencies[0])
        #print(unbounded_dependencies[0])
        #for predicted_dependencies_sent in predicted_dependencies:
        #    predicted_dependencies_sent = transform(predicted_dependencies_sent)
    print('All constructions')
    print('# total: {}'.format(all_total))
    print('# correct: {}'.format(all_correct))
    print('Macro Accuracy: {}'.format(float(all_correct)/all_total))
    print('Overall Accuracy: {}'.format(float(total_scores)/nb_constructions))
def evaluate(data_type):
    """Score TAG-transformed predicted dependencies against gold unbounded
    dependencies for every construction; print per-construction and
    overall (macro and averaged) accuracies.

    NOTE(review): this redefines ``evaluate`` — an earlier, more general
    ``evaluate(corpus_data_type, debug, input_data_type)`` exists in this
    file and is shadowed by this definition; confirm which is intended.

    :param data_type: data split name passed to the ``read_*`` helpers.
    """
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    constructions = ['obj_extract_rel_clause', 'obj_extract_red_rel',
                     'sbj_extract_rel_clause', 'obj_free_rels', 'obj_qus',
                     'right_node_raising', 'sbj_embedded']
    #constructions = ['obj_qus']
    all_total = 0
    all_correct = 0
    nb_constructions = 0
    total_scores = 0
    for construction in constructions:
        ## get predicted_dependencies and apply transformations
        predicted_dependencies = read_data(construction, data_type)
        unbounded_dependencies = read_unbounded(construction, data_type)
        sents = read_stags(construction, data_type, 'sents')
        predicted_stags = read_stags(construction, data_type)
        predicted_pos = read_stags(construction, data_type, 'predicted_pos')
        #assert(len(predicted_dependencies) == len(unbounded_dependencies))
        total = 0
        correct = 0
        # BUG FIX: was ``xrange`` (Python 2 only); the rest of the file
        # uses Python 3-style print(), so ``xrange`` raises NameError.
        # ``range`` behaves identically here on either version.
        for sent_idx in range(len(unbounded_dependencies)):
            sent = sents[sent_idx]
            ## TAG analysis
            predicted_dependencies_sent = predicted_dependencies[sent_idx]
            predicted_stags_sent = predicted_stags[sent_idx]
            predicted_pos_sent = predicted_pos[sent_idx]
            transformed_sent = transform(t2props_dict, t2topsub_dict, sent,
                                         predicted_dependencies_sent,
                                         predicted_stags_sent,
                                         predicted_pos_sent)
            assert(len(sent) == len(predicted_stags_sent))
            unbounded_dependencies_sent = unbounded_dependencies[sent_idx]
            for dep in unbounded_dependencies_sent:
                total += 1
                all_total += 1
                # Map the gold relation label (dep[2]) to the expected
                # post-transformation edge label.
                if 'nsubj' == dep[2]:
                    new_dep = (dep[0], dep[1], '0')
                elif 'dobj' == dep[2]:
                    new_dep = tuple([dep[0], dep[1], '1'])
                elif 'pobj' == dep[2]:
                    new_dep = tuple([dep[0], dep[1], '1'])
                elif 'nsubjpass' in dep[2]:
                    new_dep = (dep[0], dep[1], '1')
                elif 'advmod' in dep[2]:
                    new_dep = (dep[0], dep[1], '-unk-')
                elif 'prep' in dep[2]:
                    new_dep = (dep[0], dep[1], 'ADJ')
                elif '' in dep[2]:
                    # NOTE(review): '' is a substring of every string, so
                    # this branch always matches and the else below is
                    # unreachable; kept as-is to preserve behavior.
                    new_dep = (dep[0], dep[1], 'ADJ')
                else:
                    print(dep[2])
                # A hit requires the exact tuple in the transformed parse.
                if new_dep in transformed_sent:
                    correct += 1
                    all_correct += 1
        print('Construction: {}'.format(construction))
        print('# total: {}'.format(total))
        print('# correct: {}'.format(correct))
        # Raises ZeroDivisionError if a construction yields no deps.
        print('Accuracy: {}'.format(float(correct)/total))
        total_scores += float(correct)/total
        nb_constructions += 1
    print('All constructions')
    print('# total: {}'.format(all_total))
    print('# correct: {}'.format(all_correct))
    print('Macro Accuracy: {}'.format(float(all_correct)/all_total))
    print('Overall Accuracy: {}'.format(float(total_scores)/nb_constructions))
# if word2 is a verb, lemmatize it if word2_pos.startswith('V'): word2 = lemmatize(word2, pos[id2]) lex_parse.append((word1, word2, dep)) return (lex_parse) def _triples2par_child_dict(parse_t, sent_t): from collections import defaultdict par_child_dict = defaultdict(lambda: defaultdict(list)) for id1, id2, dep in parse_t: par_child_dict[id1]['parents_with_dep'].append((id2, dep)) par_child_dict[id2]['children_with_dep'].append((id1, dep)) return (par_child_dict) if __name__ == '__main__': print(lemmatize('stayed', 'V')) print(lemmatize('which', 'V')) print(lemmatize('what', 'V')) print(lemmatize('None', 'V')) print(lemmatize('lots', 'V')) from get_treeprops import get_t2props_dict, get_t2topsub_dict tree_prop_file = 'd6.treeproperties' t2props_dict = get_t2props_dict(tree_prop_file) t2topsub_dict = get_t2topsub_dict(tree_prop_file) stag = 3339 print('S#s#1' in t2props_dict[stag]['rfronts'])