import datetime
import hashlib
import sys
from os import listdir
from os.path import join

# NOTE: helpers such as generate_natural_language, linear_align,
# generate_potential_extractions, assign_slots, parse_entailments,
# extract_sentence, generate_potential_entailments,
# create_entailments_dictionary and intersection are defined elsewhere
# in this project.


def main():
    path = sys.argv[1]
    with open(path + 'system.out', 'w') as fout, \
            open(path + '_entailment.out', 'w') as entailment_out:
        # load_candidates returns {sha1: (sentence1, sentence2)}, so iterate
        # over the values, not the keys.
        for sentence1, sentence2 in load_candidates(path).itervalues():
            output = generate_natural_language(sentence1) + '\t' + \
                generate_natural_language(sentence2) + '|||'
            try:
                alignments = linear_align(generate_natural_language(sentence1),
                                          generate_natural_language(sentence2))
            except Exception as e:
                print e
                print >>fout, output + '\t'
                continue

            # Extractions, one per direction ('>': s1 entails s2, '<': the reverse).
            extraction1, extraction2 = generate_potential_extractions(
                sentence1, sentence2, alignments)
            extractions = [
                assign_slots(extraction1[0].replace('?', '>'),
                             extraction1[4], extraction1[5]),
                assign_slots(extraction2[0].replace('?', '<'),
                             extraction2[5], extraction2[4]),
            ]
            extractions = parse_entailments(extractions, sentence1, sentence2,
                                            's1:', 's2:')
            sentence1 = extract_sentence(sentence1, extractions, '>')
            sentence2 = extract_sentence(sentence2, extractions, '<')

            # Entailments: generate candidates, then instantiate each one in
            # both directions.
            entailments = generate_potential_entailments(sentence1, sentence2,
                                                         alignments)
            entailments = \
                [assign_slots(entailment[0].replace('?', '>'),
                              entailment[4], entailment[5])
                 for entailment in entailments] + \
                [assign_slots(entailment[0].replace('?', '<'),
                              entailment[5], entailment[4])
                 for entailment in entailments]
            entailments = parse_entailments(entailments, sentence1, sentence2,
                                            's1:', 's2:')
            entailments, edge_entailments = create_entailments_dictionary(
                sentence1, sentence2, entailments)

            # Intersect the two sentences under the entailments and verbalize.
            intersections = intersection(sentence1, sentence2, entailments)
            intersections = sorted(set(
                generate_natural_language(s, edge_entailments=edge_entailments)
                for s in intersections))
            output += '\t'.join(intersections)

            print >>entailment_out, entailments
            print >>entailment_out, edge_entailments
            print >>fout, output
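# Entry-point sketch (an assumption; the original dump does not show one):
# run the pipeline over a data directory given as the single CLI argument,
# e.g. `python pipeline.py data/`.
if __name__ == '__main__':
    main()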
def thanks(self, **kwargs):
    with self.lock:
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        user_id = kwargs['user_id']
        candidate_id = kwargs['candidate_id']
        alignments = kwargs['alignments']
        extractions = kwargs['extractions']
        entailments = kwargs['entailments']
        intersections = self.parse_intersections(kwargs)

        # Salted SHA-1 over the user/candidate pair, used as a verifiable
        # completion code.
        h = hashlib.new('sha1')
        h.update(''.join(('Lagi', user_id, candidate_id, 'Leshami')))
        code = ''.join((user_id, 'X', candidate_id, 'X', h.hexdigest()))

        annotation = '\t'.join(
            [now, user_id, candidate_id, code, alignments, extractions,
             entailments] + intersections)

        # Append the annotation to the live file and to the backup copy,
        # echoing it to the console once.
        for annotations_path in ('web/annotations', '../vsbkp/annotations'):
            with open(annotations_path, 'a') as fout:
                print >>fout, annotation
        print annotation

        html = self.THANKS_HTML.replace('CODE', code)
        sentence1, sentence2 = self.candidates[candidate_id]
        html = html.replace('SENTENCE1', generate_natural_language(sentence1))
        html = html.replace('SENTENCE2', generate_natural_language(sentence2))
        html = html.replace(
            'INTERSECTIONS',
            '<br/>'.join(self.clean_intersections(intersections)))
        return html
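# Illustrative note (format inferred from the code above, values invented):
# for user_id '7' and candidate_id 'ab12cd', the completion code looks like
# '7Xab12cdX<40-hex-digit sha1>'; the fixed 'Lagi'/'Leshami' salt presumably
# lets the operator re-derive and verify a submitted code offline.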
def load_candidates(path):
    """Read every candidate file under `path` and key each sentence pair by
    the SHA-1 of its verbalized form."""
    candidates = []
    files = [join(path, f) for f in listdir(path)]
    for f in files:
        sentence1, sentence2, entailments = read_file(f)  # entailments unused here
        h = hashlib.new('sha1')
        h.update(generate_natural_language(sentence1) + ' ' +
                 generate_natural_language(sentence2))
        candidates.append((h.hexdigest(), (sentence1, sentence2)))
    return dict(candidates)
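# Usage sketch (assumes `path` is a directory of files readable by read_file):
# candidates = load_candidates('data/')
# for candidate_id, (sentence1, sentence2) in candidates.iteritems():
#     print candidate_id, generate_natural_language(sentence1)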
def validate_alignments(sentence1, sentence2, actual_alignments):
    actual_alignments = [(map(int, seq1), map(int, seq2))
                         for seq1, seq2 in actual_alignments]
    sentence1 = generate_natural_language(sentence1).split(' ')
    sentence2 = generate_natural_language(sentence2).split(' ')
    high_prob_alignments = aligned_unigrams(sentence1, sentence2, 3)
    covered_alignments = sum(
        1 for expected_alignment in high_prob_alignments
        if covers_alignment(expected_alignment, actual_alignments))

    # Debug output.
    print
    print actual_alignments
    print covered_alignments
    print len(high_prob_alignments)
    print

    # Smoothed coverage ratio: accept when enough of the expected
    # high-probability alignments are covered by the submitted ones.
    return ((float(covered_alignments) + 2) /
            (len(high_prob_alignments) + 3)) > 0.66
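# Worked example of the smoothed threshold above (add-2/add-3 smoothing keeps
# an empty expectation set from auto-failing): with 6 expected alignments of
# which 4 are covered, (4 + 2) / (6 + 3) = 0.667 > 0.66 passes, while 3 of 6
# gives (3 + 2) / (6 + 3) = 0.556 and fails; with 0 expected alignments,
# 2 / 3 = 0.667 passes vacuously.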
def generate_potential_entailments_local(subtree1, subtree2, sentence1,
                                         sentence2, prerequisite, alignments):
    entailment = generate_entailment_string(subtree1, subtree2)
    prerequisite = generate_entailment_string(*prerequisite) \
        if prerequisite is not None else ''
    # Templates are generated in both directions: sentence1 as hypothesis
    # against sentence2 as premise, and vice versa.
    args1, template1 = generate_template(sentence1, subtree1, sentence2,
                                         subtree2, alignments)
    args2, template2 = generate_template(sentence2, subtree2, sentence1,
                                         subtree1, alignments)
    return (entailment, prerequisite,
            generate_natural_language(sentence1, subtree1),
            generate_natural_language(sentence2, subtree2),
            args1, args2, template1, template2)
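# Consumer sketch (hypothetical call, just to document the 8-tuple layout):
# (entailment, prerequisite, nl1, nl2,
#  args1, args2, template1, template2) = generate_potential_entailments_local(
#     subtree1, subtree2, sentence1, sentence2, prerequisite, alignments)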
def index(self):
    with self.lock:
        html = self.LOGIN_HTML
        candidate_options = [
            (candidate_id,
             ''.join((generate_natural_language(sentence1)[:20], '... | ',
                      generate_natural_language(sentence2)[:20], '...')))
            for candidate_id, (sentence1, sentence2)
            in self.candidates.iteritems()
        ]
        candidate_options = [
            CANDIDATE_HTML.replace('CANDIDATE_ID', candidate_id)
                          .replace('CANDIDATE', candidate)
            for candidate_id, candidate in candidate_options
        ]
        html = html.replace('CANDIDATES', ''.join(candidate_options))
        return html
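# Rendering note (CANDIDATE_HTML is defined elsewhere; assuming an
# <option>-like template, sentences invented): each entry shows the first 20
# characters of both sentences, e.g.
# "The quick brown fox ... | A fox leapt over the...".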
def generate_template(hypothesis_tree, hypothesis_subtree, premise_tree,
                      premise_subtree, alignments):
    hypothesis_tree = DynamicTree(None, hypothesis_tree.root)
    outgoing_edges = get_edges_from_subtree_to_tree(hypothesis_subtree,
                                                    hypothesis_tree)
    aligned_edges = {edge: get_aligned_edge_id(edge, alignments)
                     for edge in outgoing_edges}
    interesting_edges = [edge for edge in outgoing_edges
                         if edge.is_slot() or aligned_edges[edge] is not None]

    # Generate the original arguments (children) of the hypothesis subtree.
    args = [(edge.id_.split(':')[1], generate_natural_language(edge.modifier),
             aligned_edges[edge]) for edge in interesting_edges]

    # Replace the hypothesis subtree's children with placeholders, dropping
    # uninteresting edges entirely.
    slot_i = 0
    for edge in outgoing_edges:
        if edge in interesting_edges:
            edge.modifier.children = []
            edge.modifier.word = 'SLOT' + str(slot_i) + '!'
            slot_i += 1
        else:
            edge.head.children.remove(edge)

    # Replace the hypothesis subtree's parents with the (aligned) premise
    # subtree's parents.
    hypothesis_subtree_root = hypothesis_tree.find_node(hypothesis_subtree.root)
    if premise_tree.root == premise_subtree.root:
        # The premise subtree is the whole premise: the re-rooted hypothesis
        # subtree has no premise parents to attach to.
        premise_tree = DynamicTree(None, hypothesis_subtree_root)
    else:
        premise_tree = DynamicTree(None, premise_tree.root)
        premise_tree.find_parent_edge(
            premise_subtree.root).modifier = hypothesis_subtree_root

    # Generate the template.
    template = generate_natural_language(premise_tree, hypothesis_subtree)
    return args, template
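# Illustrative sketch (invented example, not from the source): for a
# hypothesis subtree rooted at "ate" in "John ate an apple", with "John" and
# "an apple" as interesting child edges, args might come out as
#   [('<label>', 'John', <aligned edge id>),
#    ('<label>', 'an apple', <aligned edge id>)]
# and the template verbalizes roughly as "SLOT0! ate SLOT1!", re-parented
# under the aligned premise node.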
def intersection(self, sentence1, sentence2, entailments_str):
    entailments = self.get_real_entailments(sentence1, sentence2,
                                            entailments_str)
    entailments, edge_entailments = create_entailments_dictionary(
        sentence1, sentence2, entailments)
    # The bare name resolves to the module-level intersection() helper, not
    # to this method.
    intersections = intersection(sentence1, sentence2, entailments)
    return sorted(set(
        generate_natural_language(s, edge_entailments=edge_entailments)
        for s in intersections))
def generate_sentence_html(sentence, si):
    # Sentence indices are rendered 1-based, token indices 0-based.
    return ' '.join(
        TOKEN_HTML.replace('SINDEX', str(si + 1))
                  .replace('TINDEX', str(ti))
                  .replace('TOKEN', t)
        for ti, t in enumerate(generate_natural_language(sentence).split(' ')))
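# Usage sketch (TOKEN_HTML is defined elsewhere; assumed to wrap each token in
# a clickable element keyed by sentence and token index):
# row1 = generate_sentence_html(sentence1, 0)  # SINDEX=1, TINDEX=0..n-1
# row2 = generate_sentence_html(sentence2, 1)  # SINDEX=2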