def output_triple(arg1, arg2, relation, raw, sources): arg1 = normalize(arg1).strip() arg2 = normalize(arg2).strip() relation = normalize_rel(relation).strip() found_relation = False if relation == 'be for': found_relation = True relation = 'UsedFor' if relation == 'be used for': found_relation = True relation = 'UsedFor' if relation == 'be not': found_relation = True relation = 'IsNot' if relation == 'be part of': found_relation = True relation = 'PartOf' if relation == 'be similar to': found_relation = True relation = 'SimilarTo' if relation.startswith('be ') and relation.endswith( ' of') and relation[3:-3] in TYPE_WORDS: found_relation = True relation = 'IsA' if found_relation: rel_node = GRAPH.get_or_create_relation(relation) else: rel_node = GRAPH.get_or_create_concept('en', relation) print '%s(%s, %s)' % \ (relation, arg1, arg2), assertion = GRAPH.get_or_create_assertion(rel_node, [ GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2) ], { 'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True }) GRAPH.derive_normalized(raw, assertion) conjunction = GRAPH.get_or_create_conjunction([raw, reverb_triple]) GRAPH.justify(conjunction, assertion) for source in sources: # Put in context with Wikipedia articles. topic = article_url_to_topic(source) context = GRAPH.get_or_create_concept('en', topic) context_normal = GRAPH.get_or_create_concept('en', *normalize_topic(topic)) GRAPH.add_context(assertion, context_normal) GRAPH.get_or_create_edge('normalized', context, context_normal) print "in", context_normal return assertion
def output_triple(arg1, arg2, relation, raw, sources): arg1 = normalize(arg1).strip() arg2 = normalize(arg2).strip() relation = normalize_rel(relation).strip() found_relation = False if relation == 'be for': found_relation = True relation = 'UsedFor' if relation == 'be used for': found_relation = True relation = 'UsedFor' if relation == 'be not': found_relation = True relation = 'IsNot' if relation == 'be part of': found_relation = True relation = 'PartOf' if relation == 'be similar to': found_relation = True relation = 'SimilarTo' if relation.startswith('be ') and relation.endswith(' of') and relation[3:-3] in TYPE_WORDS: found_relation = True relation = 'IsA' if found_relation: rel_node = GRAPH.get_or_create_relation(relation) else: rel_node = GRAPH.get_or_create_concept('en', relation) print '%s(%s, %s)' % \ (relation, arg1, arg2), assertion = GRAPH.get_or_create_assertion( rel_node, [GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2)], {'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True} ) GRAPH.derive_normalized(raw, assertion) conjunction = GRAPH.get_or_create_conjunction([raw, reverb_triple]) GRAPH.justify(conjunction, assertion) for source in sources: # Put in context with Wikipedia articles. topic = article_url_to_topic(source) context = GRAPH.get_or_create_concept('en', topic) context_normal = GRAPH.get_or_create_concept('en', *normalize_topic(topic)) GRAPH.add_context(assertion, context_normal) GRAPH.get_or_create_edge('normalized', context, context_normal) print "in", context_normal return assertion
def output_sentence(arg1, arg2, arg3, relation, raw, sources, prep=None): # arg3 is vestigial; we weren't getting sensible statements from it. if arg2.strip() == "": # Remove "A is for B" sentence return arg1 = normalize(arg1).strip() arg2 = normalize(arg2).strip() assertion = None if arg3 == None: print '%s(%s, %s)' % (relation, arg1, arg2) assertion = GRAPH.get_or_create_assertion('/relation/' + relation, [ GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2) ], { 'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True }) assertions = (assertion, ) else: print '%s(%s, %s)' % \ (relation, arg1, arg2) assertion1 = GRAPH.get_or_create_assertion('/relation/' + relation, [ GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2) ], { 'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True }) #arg3 = normalize(arg3).strip() #assertion2 = GRAPH.get_or_create_assertion( # GRAPH.get_or_create_concept('en', prep, 'p'), # [GRAPH.get_or_create_concept('en', arg2), # GRAPH.get_or_create_concept('en', arg3)], # {'dataset': 'reverb/en', 'license': 'CC-By-SA', # 'normalized': True} #) assertions = (assertion1, ) for assertion in assertions: conjunction = GRAPH.get_or_create_conjunction([raw, reverb_object]) GRAPH.justify(conjunction, assertion) for source in sources: # Put in context with Wikipedia articles. topic = article_url_to_topic(source) context = GRAPH.get_or_create_concept('en', *normalize_topic(topic)) GRAPH.add_context(assertion, context) return assertion
def output_sentence(arg1, arg2, arg3, relation, raw, sources, prep=None): # arg3 is vestigial; we weren't getting sensible statements from it. if arg2.strip() == "": # Remove "A is for B" sentence return arg1 = normalize(arg1).strip() arg2 = normalize(arg2).strip() assertion = None if arg3 == None: print '%s(%s, %s)' % (relation, arg1, arg2) assertion = GRAPH.get_or_create_assertion( '/relation/'+relation, [GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2)], {'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True} ) assertions = (assertion,) else: print '%s(%s, %s)' % \ (relation, arg1, arg2) assertion1 = GRAPH.get_or_create_assertion( '/relation/'+relation, [GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2)], {'dataset': 'reverb/en', 'license': 'CC-By-SA', 'normalized': True} ) #arg3 = normalize(arg3).strip() #assertion2 = GRAPH.get_or_create_assertion( # GRAPH.get_or_create_concept('en', prep, 'p'), # [GRAPH.get_or_create_concept('en', arg2), # GRAPH.get_or_create_concept('en', arg3)], # {'dataset': 'reverb/en', 'license': 'CC-By-SA', # 'normalized': True} #) assertions = (assertion1,) for assertion in assertions: conjunction = GRAPH.get_or_create_conjunction( [raw, reverb_object] ) GRAPH.justify(conjunction, assertion) for source in sources: # Put in context with Wikipedia articles. topic = article_url_to_topic(source) context = GRAPH.get_or_create_concept('en', *normalize_topic(topic)) GRAPH.add_context(assertion, context) return assertion
def output_steps(goal, steps, source):
    """
    Record a goal and its ordered steps in GRAPH.

    Builds two parallel structures — one from the raw text, one from
    normalized/lowercased text — links them with derive_normalized, and
    justifies the normalized pair according to `source` ('wikihow'
    justification is currently disabled; 'omics' is justified via the
    module-level omics/goalnet sources).  Returns the normalized
    HasSteps assertion.
    """
    def _sequence_and_goal(goal_text, step_texts):
        # One Sequence assertion over the steps, wrapped in a HasSteps
        # assertion on the goal.
        step_nodes = [GRAPH.get_or_create_concept('en', s)
                      for s in step_texts]
        seq = GRAPH.get_or_create_assertion(
            '/relation/Sequence', step_nodes,
            {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
        )
        has_steps = GRAPH.get_or_create_assertion(
            '/relation/HasSteps',
            [GRAPH.get_or_create_concept('en', goal_text), seq],
            {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
        )
        return seq, has_steps

    # Raw (un-normalized) assertions first.
    raw_sequence, raw_assertion = _sequence_and_goal(goal, steps)

    # Then the normalized versions.
    goal = normalize(goal).strip().lower()
    steps = [normalize(step).strip().lower() for step in steps]
    sequence, assertion = _sequence_and_goal(goal, steps)

    GRAPH.derive_normalized(raw_sequence, sequence)
    GRAPH.derive_normalized(raw_assertion, assertion)

    # Justification.
    if source == 'wikihow':
        # wikiHow justification is intentionally disabled for now.
        pass
    elif source == 'omics':
        conjunction = GRAPH.get_or_create_conjunction(
            [omics, goalnet, raw_sequence])
        GRAPH.justify(conjunction, sequence)
        conjunction = GRAPH.get_or_create_conjunction(
            [omics, goalnet, raw_assertion])
        GRAPH.justify(conjunction, assertion)
    return assertion
def output_steps(goal, steps, source):
    """
    Record a normalized goal and its steps as one HasSteps assertion.

    The goal concept is the first argument of the assertion, followed by
    one concept per step.  The assertion is justified according to
    `source`: wikiHow entries get a reduced weight of 0.8, OMICS entries
    full weight.  Returns the assertion node.
    """
    goal = normalize(goal).strip()
    normalized_steps = [normalize(step).strip() for step in steps]

    arg_nodes = [GRAPH.get_or_create_concept('en', goal)]
    for step in normalized_steps:
        arg_nodes.append(GRAPH.get_or_create_concept('en', step))

    assertion = GRAPH.get_or_create_assertion(
        '/relation/HasSteps', arg_nodes,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )

    if source == 'wikihow':
        conj = GRAPH.get_or_create_conjunction([wikihow, goalnet])
        GRAPH.justify(conj, assertion, 0.8)
    elif source == 'omics':
        conj = GRAPH.get_or_create_conjunction([omics, goalnet])
        GRAPH.justify(conj, assertion)
    return assertion
def output_triple(arg1, arg2, relation, raw): arg1 = normalize(arg1).strip() arg2 = normalize(arg2).strip() relation = normalize(relation).strip() print '%s(%s, %s)' % \ (relation, arg1, arg2) found_relation = False if relation == 'be for': found_relation = True relation = 'UsedFor' if relation == 'be used for': found_relation = True relation = 'UsedFor' if relation == 'be not': found_relation = True relation = 'IsNot' if relation == 'be part of': found_relation = True relation = 'PartOf' if relation == 'be similar to': found_relation = True relation = 'SimilarTo' if found_relation: rel_node = GRAPH.get_or_create_relation(relation) else: rel_node = GRAPH.get_or_create_concept('en', relation) assertion = GRAPH.get_or_create_assertion( rel_node, [GRAPH.get_or_create_concept('en', arg1), GRAPH.get_or_create_concept('en', arg2)], {'dataset': 'reverb/en', 'license': 'CC-By-SA'} ) GRAPH.derive_normalized(raw, assertion) conjunction = GRAPH.get_or_create_conjunction([raw, reverb_triple]) GRAPH.justify(conjunction, assertion) return assertion
def test_normalize():
    """Spot-check normalize() on a sentence and a proper name."""
    cases = [
        ('this is a test', 'this be test'),
        # If we're using simplenlp, the next case gives "catherine
        # havasus"; this is one of the reasons to switch to using Morphy.
        ('Catherine Havasi', 'catherine havasi'),
    ]
    for raw, expected in cases:
        assert normalize(raw) == expected
def english_normalize(text):
    """
    Normalize an English phrase: drop a leading infinitive "to ", apply
    Unicode NFKC, then the project's normalize().
    """
    stripped = text[3:] if text.startswith('to ') else text
    return normalize(unicodedata.normalize('NFKC', stripped))
# NOTE(review): fragment of a per-line scoring loop — the enclosing `for`
# header is outside this view; `continue` below targets it.  Variables
# right/relation/freq/orderscore/left/line/mapping/counts/similar_out/
# text_similarities are bound by the surrounding (unseen) code.
if right.startswith("not "):
    # Fold a negated right-hand side into a negative relation.
    right = right[4:]
    relation = "it is not"
if relation == "it is the opposite of":
    relation = "it is not"
freq = int(freq)
orderscore = int(orderscore)
if relation == "about the same size as":
    relation = "it is about the same size as"
elif relation == "it looks like":
    relation = "it is related to"
rel = mapping.get(relation)
if rel is None:
    # Unknown relation: build a concept URL from the phrase with its
    # first three characters (presumably a leading "it ") dropped.
    rel = "/concept/en/" + normalize(relation[3:]).replace(" ", "_")
if relation == "it is" and (right.startswith("a ") or
                            right.startswith("an ") or
                            right.startswith("the ")):
    rel = "/relation/IsA"
sls = sounds_like_score(left, right)
text_similarities.append(sls)
if sls > 0.35:
    # The two sides sound too much alike — presumably wordplay rather
    # than knowledge; log it and skip this line.
    # print "* %s sounds like %s (%4.4f)" % (left, right, sls)
    counts["text similarity"] += 1
    similar_out.write("%4.4d\t%s" % (sls, line))
    continue
# Score combines vote frequency, answer-order position, and a penalty
# for sound-similarity.
score = (freq * 2 - 1) * (1000 - orderscore) * (1 - sls) / 1000
if score <= 0:
    counts["low score"] += 1
def normalize_rel(text):
    """
    Normalize a relation phrase to its content words.

    Drops a redundant leading auxiliary when the phrase starts with
    "be be ..." or "have be ...", and removes every occurrence of
    'also'.  Returns the words re-joined with single spaces.
    """
    tokens = normalize(text).split()
    if len(tokens) >= 2 and tokens[1] == 'be' and tokens[0] in ('be', 'have'):
        tokens = tokens[1:]
    return ' '.join(tok for tok in tokens if tok != 'also')
def probably_present_tense(text):
    """
    Heuristic: a word is probably present tense if it is 'is'/'are' or
    if normalize() leaves it unchanged.
    """
    if text in ('is', 'are'):
        return True
    return normalize(text) == text
def check_line(line): parts = line.strip().split() norm = normalize(parts[0]) if norm != parts[1]: print "Original: %s / WordNet: %s / conceptnet: %s" %\ (parts[0], parts[1], norm)
# NOTE(review): fragment of a per-line scoring loop (single-quote
# variant) — the enclosing `for` header is outside this view; the
# `continue` below targets it.  right/relation/freq/orderscore/left/
# line/mapping/counts/similar_out/text_similarities come from the
# surrounding (unseen) code.
if right.startswith('not '):
    # Fold a negated right-hand side into a negative relation.
    right = right[4:]
    relation = 'it is not'
if relation == 'it is the opposite of':
    relation = 'it is not'
freq = int(freq)
orderscore = int(orderscore)
if relation == 'about the same size as':
    relation = 'it is about the same size as'
elif relation == 'it looks like':
    relation = 'it is related to'
rel = mapping.get(relation)
if rel is None:
    # Unknown relation: build a concept URL from the phrase with its
    # first three characters (presumably a leading 'it ') dropped.
    rel = '/concept/en/'+normalize(relation[3:]).replace(' ', '_')
if relation == 'it is' and\
    (right.startswith('a ') or right.startswith('an ')
     or right.startswith('the ')):
    rel = '/relation/IsA'
sls = sounds_like_score(left, right)
text_similarities.append(sls)
if sls > 0.35:
    # Sides sound too much alike — presumably wordplay; log and skip.
    #print "* %s sounds like %s (%4.4f)" % (left, right, sls)
    counts['text similarity'] += 1
    similar_out.write('%4.4d\t%s' % (sls, line))
    continue
# Score combines vote frequency, answer-order position, and a penalty
# for sound-similarity.
score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
def english_normalize(text):
    """
    Normalize an English phrase.

    Strips a leading infinitive "to ", applies Unicode NFKC
    normalization, and runs the result through the project's
    normalize() function.
    """
    if text.startswith("to "):
        text = text[len("to "):]
    result = normalize(unicodedata.normalize("NFKC", text))
    return result