def query_a(r, list_a, e1_type, e2_type, index):
    """
    Validate one extracted fact 'r' against the whoosh index at 'index'.

    Counts sentences where <e1> pattern <e2> occur near each other (q1)
    versus sentences where just the entity pair occurs (q2); if the ratio
    is at least 0.5 the fact is accepted and appended to list_a.
    """
    def tagged(etype, value):
        # Wrap an entity string in its type tags, e.g. <ORG>...</ORG>,
        # matching how sentences were tagged at indexing time.
        return "<" + etype + ">" + value + "</" + etype + ">"

    ix = open_dir(index)
    ent1_term = query.Term("sentence", tagged(e1_type, r.e1))
    rel_term = query.Term("sentence", r.patterns)
    ent2_term = query.Term("sentence", tagged(e2_type, r.e2))
    near_with_rel = spans.SpanNear2([ent1_term, rel_term, ent2_term],
                                    slop=5, ordered=True)
    near_pair = spans.SpanNear2([ent1_term, ent2_term],
                                slop=5, ordered=True)
    with ix.searcher() as searcher:
        with_rel = searcher.search(near_with_rel)
        pair_only = searcher.search(near_pair)
        # TODO: apply stemming/normalization to the words used in the query
        if len(pair_only) > 0:
            ratio = float(len(with_rel)) / float(len(pair_only))
            # TODO: what is the best threshold value?
            if ratio >= 0.5:
                list_a.append(r)
def calculate_pmi(e1, e2, r): # NOTE: this queries an index build over the whole AFT corpus """ # sentences with tagged entities are indexed in whoosh, perform the following query # ent1 NEAR:X r NEAR:X ent2 # X is the maximum number of words between the query elements. # """ idx = open_dir("index") t1 = query.Term("sentence", e1) t2 = query.Term("sentence", r) t3 = query.Term("sentence", e2) q1 = spans.SpanNear2([t1, t2, t3], slop=5, ordered=False) q2 = spans.SpanNear2([t1, t3], slop=5, ordered=False) with idx.searcher() as searcher: entities_r = searcher.search(q1) entities = searcher.search(q2) print len(entities_r) print len(entities) pmi = float(len(entities_r)) / float(len(entities)) idx.close() return pmi
def test_spannear2():
    """SpanNear2: 'bravo' within 3 positions before 'echo' matches b, c, d."""
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    corpus = [
        ("a", u"alfa echo"),
        ("b", u"alfa bravo echo"),
        ("c", u"alfa bravo charlie echo"),
        ("d", u"alfa bravo charlie delta echo"),
        ("e", u"alfa bravo charlie fox delta echo"),
        ("f", u"charlie delta echo fox golf hotel"),
    ]
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc_id, doc_text in corpus:
                w.add_document(id=doc_id, text=doc_text)
        with ix.searcher() as s:
            near_q = spans.SpanNear2(
                [Term("text", "bravo"), Term("text", "echo")], slop=3)
            # estimate counts docs containing both terms, before span filtering
            assert near_q.estimate_size(s.reader()) == 4
            matched = "".join(sorted(hit["id"] for hit in s.search(near_q)))
            assert matched == "bcd"
def proximity_pmi_a(e1_type, e2_type, queue, index, results, not_found, rel_words_unigrams, rel_words_bigrams): idx = open_dir(index) count = 0 q_limit = 500 with idx.searcher() as searcher: while True: try: r = queue.get_nowait() count += 1 if count % 50 == 0: print multiprocessing.current_process(), \ "To Process", queue.qsize(), \ "Correct found:", len(results) # if its not in the database calculate the PMI entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">" entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">" t1 = query.Term('sentence', entity1) t3 = query.Term('sentence', entity2) # First count the proximity (MAX_TOKENS_AWAY) occurrences # of entities r.e1 and r.e2 q1 = spans.SpanNear2([t1, t3], slop=MAX_TOKENS_AWAY, ordered=True, mindist=1) hits = searcher.search(q1, limit=q_limit) # Entities proximity considering relational words # From the results above count how many contain a # valid relational word hits_with_r = 0 hits_without_r = 0 fact_bet_words_tokens = word_tokenize(r.bet_words) for s in hits: sentence = s.get("sentence") s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY, MIN_TOKENS_AWAY, CONTEXT_WINDOW) for s_r in s.relationships: if r.ent1.decode("utf8") == s_r.ent1 and \ r.ent2.decode("utf8") == s_r.ent2: unigrams_bef_words = word_tokenize(s_r.before) unigrams_bet_words = word_tokenize(s_r.between) unigrams_aft_words = word_tokenize(s_r.after) bigrams_rel_words = extract_bigrams(s_r.between) if fact_bet_words_tokens == unigrams_bet_words: hits_with_r += 1 elif any(x in rel_words_unigrams for x in unigrams_bef_words): hits_with_r += 1 elif any(x in rel_words_unigrams for x in unigrams_bet_words): hits_with_r += 1 elif any(x in rel_words_unigrams for x in unigrams_aft_words): hits_with_r += 1 elif rel_words_bigrams == bigrams_rel_words: hits_with_r += 1 else: hits_without_r += 1 if hits_with_r > 0 and hits_without_r > 0: pmi = float(hits_with_r) / float(hits_without_r) if pmi >= PMI: results.append(r) else: 
not_found.append(r) else: not_found.append(r) count += 1 except Queue.Empty: break
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    """
    Worker: pull candidate relationships from 'queue' and, for pairs not in
    the (global) all_in_database set, score them by the ratio of index
    sentences whose between-entities context contains a relational
    unigram/bigram versus those that do not. Relationships scoring >= PMI
    are appended to the shared 'results' list. Exits when the queue drains.
    """
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                # NOTE(review): count is only incremented at the bottom of the
                # loop, so this fires on the very first item (count == 0).
                if count % 50 == 0:
                    print "\n", multiprocessing.current_process(), \
                        "In Queue", queue.qsize(), \
                        "Total Matched: ", len(results)
                # 'all_in_database' is a module-level set of known pairs
                if (r.ent1, r.ent2) not in all_in_database:
                    # if its not in the database calculate the PMI
                    entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                    entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                    t1 = query.Term('sentence', entity1)
                    t3 = query.Term('sentence', entity2)
                    # Entities proximity query without relational words
                    q1 = spans.SpanNear2([t1, t3], slop=distance,
                                         ordered=True, mindist=1)
                    hits = searcher.search(q1, limit=q_limit)
                    # Entities proximity considering relational words
                    # From the results above count how many contain a
                    # valid relational word
                    hits_with_r = 0
                    hits_without_r = 0
                    for s in hits:
                        sentence = s.get("sentence")
                        # rebinds the loop variable: s becomes a Sentence object
                        s = Sentence(sentence, e1_type, e2_type,
                                     MAX_TOKENS_AWAY, MIN_TOKENS_AWAY,
                                     CONTEXT_WINDOW)
                        for s_r in s.relationships:
                            if r.ent1.decode("utf8") == s_r.ent1 and \
                                    r.ent2.decode("utf8") == s_r.ent2:
                                unigrams_rel_words = word_tokenize(s_r.between)
                                bigrams_rel_words = extract_bigrams(
                                    s_r.between)
                                # all context tokens in the stop set => no signal
                                if all(x in not_valid
                                       for x in unigrams_rel_words):
                                    hits_without_r += 1
                                    continue
                                elif any(x in rel_words_unigrams
                                         for x in unigrams_rel_words):
                                    hits_with_r += 1
                                elif any(x in rel_words_bigrams
                                         for x in bigrams_rel_words):
                                    hits_with_r += 1
                                else:
                                    hits_without_r += 1
                    if hits_with_r > 0 and hits_without_r > 0:
                        pmi = float(hits_with_r) / float(hits_without_r)
                        if pmi >= PMI:
                            # NOTE(review): s_r is the *leaked* loop variable
                            # from the last iteration above; it may even be
                            # unbound if 'hits' was empty. Presumably intended
                            # to detect passive voice ("... by X") and swap the
                            # entities — but the swap mutates s_r while 'r' is
                            # what gets appended. Verify this is intentional.
                            if word_tokenize(s_r.between)[-1] == 'by':
                                tmp = s_r.ent2
                                s_r.ent2 = s_r.ent1
                                s_r.ent1 = tmp
                            results.append(r)
                count += 1
            except Queue.Empty:
                break
def query_thread(queue, database, g_minus_d, e1_type, e2_type, index): idx = open_dir(index) regex_tokenize = re.compile('\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U) tokenizer = RegexTokenizer(regex_tokenize) stopper = StopFilter() count = 0 with idx.searcher() as searcher: while True: r = queue.get_nowait() count += 1 if count % 25000 == 0: print multiprocessing.current_process(), count, queue.qsize() if len(database[(r.ent1, r.ent2)]) == 0: # if its not in the database calculate the PMI entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">" entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">" terms = list() for token in stopper( tokenizer((r.between.decode("utf8")), renumber=True)): terms.append(query.Term("sentence", token.text)) #print terms t1 = query.Term("sentence", entity1) t3 = query.Term("sentence", entity2) query_terms = list() query_terms.append(t1) for t in terms: query_terms.append(t) query_terms.append(t3) q1 = spans.SpanNear2(query_terms, slop=2, ordered=True) q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True) entities_r = searcher.search(q1) entities = searcher.search(q2) """ print query_terms, len(entities_r) print [t1, t3], len(entities) print "\n" """ #print entity1, '\t', r.between, '\t', entity2, len(entities_r), len(entities) try: assert not len(entities_r) > len(entities) except AssertionError, e: print e print r.sentence print r.ent1 print r.ent2 print query_terms print[t1, t3] if len(entities) > 0: pmi = float(len(entities_r)) / float(len(entities)) if pmi >= 0.5: #print entity1, '\t', r.between, '\t', entity2, pmi g_minus_d.append(r) if queue.empty() is True: break
def proximity_pmi_a(e1_type, e2_type, queue, index, results, not_found,
                    rel_words_unigrams, rel_words_bigrams):
    """
    Worker: pull candidate triples from 'queue' and validate each by the
    fraction of index sentences where the two tagged entities co-occur with
    exactly the triple's BET words between them; triples scoring >= PMI go
    to 'results', otherwise a fallback membership test against
    rel_words_unigrams/rel_words_bigrams is tried before rejecting to
    'not_found'. Verdicts are memoized per Triple. Exits when the queue
    drains.
    """
    idx = open_dir(index)
    count = 0
    # cache to store already evaluted triples
    cache_correct = set()
    cache_incorrect = set()
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                t = Triple(r.e1, r.e2, r.bet_words)
                count += 1
                if count % 50 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " To Process: " + str(queue.qsize()) +
                        " Correct found: " + str(len(results)) + '\n')
                    sys.stdout.flush()
                # reuse a previous verdict for an identical triple
                if t in cache_correct:
                    results.append(r)
                    continue
                if t in cache_incorrect:
                    not_found.append(r)
                    continue
                # relational phrase/word of the relationship/triple
                fact_bet_words_tokens = word_tokenize(r.bet_words)
                entity1 = "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)
                # First count the proximity (MAX_TOKENS_AWAY) occurrences of entities r.e1 and r.e2
                q1 = spans.SpanNear2([t1, t3], slop=MAX_TOKENS_AWAY,
                                     ordered=True, mindist=1)
                # NOTE(review): q_limit is not defined in this function —
                # presumably a module-level constant; verify.
                hits = searcher.search(q1, limit=q_limit)
                # Entities proximity considering relational word
                # From the results above count how many contain the relational word/phrase
                hits_with_r = 0
                for s in hits:
                    sentence = s.get("sentence")
                    # NOTE(review): rindex takes the *last* occurrence of each
                    # tagged entity; if entity2's last occurrence precedes
                    # entity1's, the slice below is empty/garbled — relies on
                    # the ordered SpanNear2 query above; confirm.
                    start_e1 = sentence.rindex(entity1)
                    end_e1 = start_e1 + len(entity1)
                    start_e2 = sentence.rindex(entity2)
                    bet = sentence[end_e1:start_e2]
                    bet_tokens = word_tokenize(bet)
                    # keep only contexts within the allowed token distance
                    if not (MIN_TOKENS_AWAY <= len(bet_tokens) <= MAX_TOKENS_AWAY):
                        continue
                    if fact_bet_words_tokens == bet_tokens:
                        hits_with_r += 1
                assert len(hits) >= hits_with_r
                if len(hits) > 0:
                    pmi = float(hits_with_r) / float(len(hits))
                    if pmi >= PMI:
                        results.append(r)
                        cache_correct.add(t)
                        """
                        print "**VALID**:", entity1, '\t', entity2
                        print "hits_without_r ", float(hits_without_r)
                        print "hits_with_r ", float(hits_with_r)
                        print "discarded", discarded
                        print "Index hits", len(hits)
                        print "PMI", pmi
                        print r.sentence
                        print r.bet_words
                        print
                        """
                    else:
                        # check against a list
                        if r.bet_words.strip() in rel_words_unigrams:
                            results.append(r)
                            cache_correct.add(t)
                        elif r.bet_words.strip() in rel_words_bigrams:
                            results.append(r)
                            cache_correct.add(t)
                        else:
                            not_found.append(r)
                            cache_incorrect.add(t)
                            """
                            print "**INVALID**:"
                            print 'ExtractedFact:', entity1, '\t', entity2
                            print r.bet_words.strip()
                            print
                            """
                else:
                    # no co-occurrence at all
                    not_found.append(r)
                    cache_incorrect.add(t)
            except Queue.Empty:
                break
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    """
    Worker: pull candidates from 'queue', fetch index sentences where the
    tagged entity pair co-occurs, and score each candidate by the fraction
    of matching sentence contexts containing a relational unigram/bigram.
    Candidates scoring >= PMI are appended to the shared 'results' list.
    Exits when the queue drains.
    """
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                # NOTE(review): count is only incremented at the bottom of the
                # loop, so this fires on the very first item (count == 0).
                if count % 50 == 0:
                    print multiprocessing.current_process(
                    ), "In Queue", queue.qsize(), "Total Matched: ", len(
                        results)
                # TODO: add a cache of already-evaluated pairs
                t1 = query.Term(
                    'sentence',
                    "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">")
                t3 = query.Term(
                    'sentence',
                    "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">")
                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3], slop=distance,
                                     ordered=True, mindist=1)
                # NOTE(review): q_limit is not defined in this function —
                # presumably a module-level constant; verify.
                hits = searcher.search(q1, limit=q_limit)
                # Entities proximity considering relational words
                # From the results above count how many contain a relational word
                hits_with_r = 0
                total_hits = 0
                for s in hits:
                    sentence = s.get("sentence")
                    # rebinds the loop variable: s becomes a SentenceEvaluation;
                    # 'stopwords' is presumably a module-level collection
                    s = SentenceEvaluation(sentence, e1_type, e2_type,
                                           MAX_TOKENS_AWAY, MIN_TOKENS_AWAY,
                                           CONTEXT_WINDOW, stopwords)
                    for s_r in s.relationships:
                        if r.e1.decode("utf8") == s_r.e1 and r.e2.decode(
                                "utf8") == s_r.e2:
                            total_hits += 1
                            # s_r.between appears to already be a token list
                            # here (joined before bigram extraction) — unlike
                            # the other variants which tokenize it; confirm.
                            unigrams_rel_words = s_r.between
                            bigrams_rel_words = extract_bigrams(' '.join(
                                s_r.between))
                            if any(x in rel_words_unigrams
                                   for x in unigrams_rel_words):
                                hits_with_r += 1
                                continue
                            if any(x in rel_words_bigrams
                                   for x in bigrams_rel_words):
                                hits_with_r += 1
                                continue
                assert total_hits >= hits_with_r
                if total_hits > 0:
                    pmi = float(hits_with_r) / float(total_hits)
                    if pmi >= PMI:
                        results.append(r)
                count += 1
            except Queue.Empty:
                break
# Build every combination of the search words and their spelling variants:
# itertools.product over the per-word variant lists yields
# A1,B1,C1, A1,B1,C2, ..., An,Bn,Cn as a list of tuples.
exhaustive_combinations = list(itertools.product(*list_containing_word_and_variant_lists))
# Iterate through every combination, search for it, and hand any hits to the
# result-processing functions. NOTE(review): 'searcher', 'proximity_value',
# 'proximity_search_desired', 'exact_search_desired' and the two
# process_results_* functions are defined earlier in this script — confirm.
for search_terms in exhaustive_combinations:
    # build the list of query components for this combination
    list_of_query_components = []
    for term in search_terms:
        query_component = Term("content", term)
        list_of_query_components.append(query_component)
    # SpanNear2 performs an (unordered) proximity search — high recall, low
    # precision, hence the post-filtering done in the process_results_*
    # functions called below.
    q = spans.SpanNear2(list_of_query_components, slop=proximity_value, ordered=False)
    # limit=None returns all matching documents (default is 10);
    # terms=True records which of the search terms each hit matched.
    results = searcher.search(q, limit=None, terms=True)
    # allow fragments beyond the default 32K-character limit, so matches past
    # character 32,000 of a document are not silently dropped
    results.fragmenter.charlimit = None
    # dispatch non-empty result sets to whichever processing modes are enabled
    if results:
        if proximity_search_desired == 1:
            process_results_with_proximity_function(search_terms, results, proximity_value)
        if exact_search_desired == 1:
            process_results_with_exact_function(search_terms, results)
def proximity_pmi(e1_type, e2_type, queue, index, results):
    """
    sentences with tagged entities are indexed in whoosh
    perform the following query
    ent1 NEAR:X r NEAR:X ent2
    X is the maximum number of words between the query elements.

    Worker: candidates whose ratio of (entities AND relational tokens)
    sentences over (entities-only) sentences exceeds PMI are appended to
    the shared 'results' list.
    """
    # matches hyphenated words, tagged entities, or plain words
    tokenize = re.compile('\w+(?:-\w+)+|<[A-Z]+>[^<]+</[A-Z]+>|\w+', re.U)
    entity = re.compile('<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    idx = open_dir(index)
    count = 0
    distance = 9
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            count += 1
            if count % 50 == 0:
                print multiprocessing.current_process(
                ), "In Queue", queue.qsize(), "Total Matched: ", len(results)
            # NOTE(review): unguarded get_nowait — raises Queue.Empty and
            # kills the worker if a sibling drains the queue between the
            # bottom-of-loop empty() check and this call; the other workers
            # in this file wrap this in try/except Queue.Empty.
            r = queue.get_nowait()
            n_1 = set()
            n_2 = set()
            n_3 = set()
            # if its not in the database calculate the proximity PMI;
            # 'all_in_freebase' is presumably a module-level set of known pairs
            if (r.ent1, r.ent2) not in all_in_freebase:
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)
                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3], slop=distance,
                                     ordered=True, mindist=1)
                hits_1 = searcher.search(q1, limit=q_limit)
                # Entities proximity considering relational words
                # NOTE(review): tokens_rel is unbound (NameError) if r is
                # neither an ExtractedFact nor a Relationship — confirm these
                # are the only types ever queued.
                if isinstance(r, ExtractedFact):
                    tokens_rel = re.findall(tokenize, r.patterns)
                elif isinstance(r, Relationship):
                    tokens_rel = re.findall(tokenize, r.between)
                # relational tokens, excluding anything that is itself a
                # tagged entity
                token_terms = list()
                for t in tokens_rel:
                    if re.search(entity, t) is None:
                        token_terms.append(query.Term('sentence', t))
                # l1: e1 followed by the relational tokens;
                # l2: the relational tokens followed by e2
                l1 = [t for t in token_terms]
                l1.insert(0, t1)
                l2 = [t for t in token_terms]
                l2.append(t3)
                q2 = spans.SpanNear2(l1, slop=distance - 1,
                                     ordered=True, mindist=1)
                hits_2 = searcher.search(q2, limit=q_limit)
                q3 = spans.SpanNear2(l2, slop=distance - 1,
                                     ordered=True, mindist=1)
                hits_3 = searcher.search(q3, limit=q_limit)
                # collect the sentences matched by each query; a sentence in
                # all three sets contains both entities and the relation
                for d in hits_1:
                    n_1.add(d.get("sentence"))
                for d in hits_2:
                    n_2.add(d.get("sentence"))
                for d in hits_3:
                    n_3.add(d.get("sentence"))
                entities_occurr = len(hits_1)
                entities_occurr_with_r = len(
                    n_1.intersection(n_2).intersection(n_3))
                # sanity check: the intersection can never exceed the
                # entities-only hit count
                try:
                    assert not entities_occurr_with_r > entities_occurr
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print q1, len(hits_1)
                    print q2, len(hits_2)
                    print q3, len(hits_3)
                    print "intersection", len(
                        n_1.intersection(n_2).intersection(n_3))
                    sys.exit(0)
                if float(entities_occurr) > 0:
                    # require more than one occurrence on each side before
                    # trusting the ratio
                    if float(entities_occurr
                             ) > 1 and entities_occurr_with_r > 1:
                        pmi = float(entities_occurr_with_r) / float(
                            entities_occurr)
                        if pmi > PMI:
                            """
                            # TODO: há coisas aqui que sao falsas, por exemplo: 'chief'
                            if isinstance(r, ExtractedFact):
                                print r.ent1, '\t', r.patterns, '\t', r.ent2, pmi
                            elif isinstance(r, Relationship):
                                print r.ent1, '\t', r.between, '\t', r.ent2, pmi
                            """
                            results.append(r)
            if queue.empty() is True:
                break
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results, rel_words): """ #TODO: proximity_pmi with relation specific given relational words :param e1_type: :param e2_type: :param queue: :param index: :param results: :param rel_word: :return: """ """ sentences with tagged entities are indexed in whoosh perform the following query ent1 NEAR:X r NEAR:X ent2 X is the maximum number of words between the query elements. """ idx = open_dir(index) count = 0 distance = 9 q_limit = 500 with idx.searcher() as searcher: while True: count += 1 if count % 50 == 0: print multiprocessing.current_process( ), "In Queue", queue.qsize(), "Total Matched: ", len(results) r = queue.get_nowait() if (r.ent1, r.ent2) not in all_in_freebase: # if its not in the database calculate the PMI entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">" entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">" t1 = query.Term('sentence', entity1) t3 = query.Term('sentence', entity2) # Entities proximity query without relational words q1 = spans.SpanNear2([t1, t3], slop=distance, ordered=True, mindist=1) hits = searcher.search(q1, limit=q_limit) # Entities proximity considering relational words # From the results above count how many contain a relational word hits_with_r = 0 for s in hits: sentence = s.get("sentence") s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY, MIN_TOKENS_AWAY, CONTEXT_WINDOW) for s_r in s.relationships: if r.ent1.decode("utf8") == s_r.ent1 and r.ent2.decode( "utf8") == s_r.ent2: for rel in rel_words: if rel in r.between: hits_with_r += 1 break if not len(hits) >= hits_with_r: print "ERROR!" 
print "hits", len(hits) print "hits_with_r", hits_with_r print entity1, '\t', entity2 print "\n" sys.exit(0) if float(len(hits)) > 0: pmi = float(hits_with_r) / float(len(hits)) if pmi > PMI: results.append(r) """ if isinstance(r, ExtractedFact): print r.ent1, '\t', r.patterns, '\t', r.ent2, pmi elif isinstance(r, Relationship): print r.ent1, '\t', r.between, '\t', r.ent2, pmi """ if queue.empty() is True: break