def read_seeds(self, seeds_file, holder):
    for line in fileinput.input(seeds_file):
        if line.startswith("#") or len(line) == 1:
            continue
        if line.startswith("e1"):
            self.e1_type = line.split(":")[1].strip()
        elif line.startswith("e2"):
            self.e2_type = line.split(":")[1].strip()
        else:
            e1 = line.split(";")[0].strip()
            e2 = line.split(";")[1].strip()
            seed = Seed(e1, e2)
            holder.add(seed)
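# For reference, a seeds file that read_seeds() can parse might look like
# the sketch below (hypothetical type labels and pairs): '#' lines and
# blank lines are skipped, 'e1:'/'e2:' set the semantic types of the two
# arguments, and every other line holds one seed pair separated by ';'.
#
#   e1:ORG
#   e2:LOC
#   Google;Mountain View
#   Nokia;Espoo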
def init_bootstrap(self, tuples):
    # starts a bootstrap iteration
    if tuples is not None:
        f = open(tuples, "r")
        print("\nLoading processed tuples from disk...")
        self.processed_tuples = cPickle.load(f)
        f.close()
        print(len(self.processed_tuples), "tuples loaded")

    self.curr_iteration = 0
    while self.curr_iteration <= self.config.number_iterations:
        print("==========================================")
        print("\nStarting iteration", self.curr_iteration)
        print("\nLooking for seed matches of:")
        for s in self.config.positive_seed_tuples:
            print(s.e1, '\t', s.e2)

        # Looks for sentences matching the seed instances
        count_matches, matched_tuples = self.match_seeds_tuples()

        if len(matched_tuples) == 0:
            print("\nNo seed matches found")
            sys.exit(0)
        else:
            print("\nNumber of seed matches found")
            sorted_counts = sorted(count_matches.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
            for t in sorted_counts:
                print(t[0][0], '\t', t[0][1], t[1])
            print("\n", len(matched_tuples), "tuples matched")

            # Cluster the matched instances, to generate
            # patterns/update patterns
            print("\nClustering matched instances to generate patterns")
            self.cluster_tuples(matched_tuples)

            # Eliminate patterns supported by less than
            # 'min_pattern_support' tuples
            new_patterns = [p for p in self.patterns
                            if len(p.tuples) > self.config.min_pattern_support]
            self.patterns = new_patterns
            print("\n", len(self.patterns), "patterns generated")

            if PRINT_PATTERNS is True:
                count = 1
                print("\nPatterns:")
                for p in self.patterns:
                    print(count)
                    for t in p.tuples:
                        print("BEF", t.bef_words)
                        print("BET", t.bet_words)
                        print("AFT", t.aft_words)
                        print("========")
                    print("\n")
                    count += 1

            if self.curr_iteration == 0 and len(self.patterns) == 0:
                print("No patterns generated")
                sys.exit(0)

            # Look for sentences with occurrences of the seeds'
            # semantic types (e.g., ORG - LOC); these were already
            # collected and are stored in self.processed_tuples.
            #
            # Measure the similarity of each occurrence with each
            # extraction pattern and store each pattern that has a
            # similarity higher than a given threshold
            #
            # Each candidate tuple will then have a number of patterns
            # that extracted it, each with an associated degree of match.
            print("Number of tuples to be analyzed:",
                  len(self.processed_tuples))
            print("\nCollecting instances based on extraction patterns")
            count = 0
            for t in self.processed_tuples:
                count += 1
                if count % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                sim_best = 0
                pattern_best = None
                for extraction_pattern in self.patterns:
                    accept, score = self.similarity_all(
                        t, extraction_pattern)
                    if accept is True:
                        extraction_pattern.update_selectivity(
                            t, self.config)
                        if score > sim_best:
                            sim_best = score
                            pattern_best = extraction_pattern

                if sim_best >= self.config.threshold_similarity:
                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it;
                    # if not, associate this pattern with it and store the
                    # similarity score
                    patterns = self.candidate_tuples[t]
                    if patterns is not None:
                        if pattern_best not in [x[0] for x in patterns]:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                    # if this tuple was not extracted before,
                    # associate this pattern with the instance
                    # and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (pattern_best, sim_best))

            # update all patterns confidence
            for p in self.patterns:
                p.update_confidence(self.config)

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    for t in p.tuples:
                        print("BEF", t.bef_words)
                        print("BET", t.bet_words)
                        print("AFT", t.aft_words)
                        print("========")
                    print("Positive", p.positive)
                    print("Negative", p.negative)
                    print("Unknown", p.unknown)
                    print("Tuples", len(p.tuples))
                    print("Pattern Confidence", p.confidence)
                    print("\n")

            # update tuple confidence based on patterns confidence
            print("\n\nCalculating tuples confidence")
            for t in self.candidate_tuples.keys():
                confidence = 1
                t.confidence_old = t.confidence
                for p in self.candidate_tuples.get(t):
                    confidence *= 1 - (p[0].confidence * p[1])
                t.confidence = 1 - confidence

            # sort tuples by confidence and print
            if PRINT_TUPLES is True:
                extracted_tuples = self.candidate_tuples.keys()
                tuples_sorted = sorted(extracted_tuples,
                                       key=lambda tpl: tpl.confidence,
                                       reverse=True)
                for t in tuples_sorted:
                    print(t.sentence)
                    print(t.e1, t.e2)
                    print(t.confidence)
                    print("\n")

            print("Adding tuples to seed with confidence >= " +
                  str(self.config.instance_confidence))
            for t in self.candidate_tuples.keys():
                if t.confidence >= self.config.instance_confidence:
                    seed = Seed(t.e1, t.e2)
                    self.config.positive_seed_tuples.add(seed)

            # increment the number of iterations
            self.curr_iteration += 1

    self.write_relationships_to_disk()
def init_bootstrap(self, tuples):
    # starts a bootstrap iteration
    if tuples is not None:
        f = open(tuples, "r")
        print "Loading pre-processed sentences", tuples
        self.processed_tuples = cPickle.load(f)
        f.close()
        print len(self.processed_tuples), "tuples loaded"

    self.curr_iteration = 0
    while self.curr_iteration <= self.config.number_iterations:
        print "=========================================="
        print "\nStarting iteration", self.curr_iteration
        print "\nLooking for seed matches of:"
        for s in self.config.positive_seed_tuples:
            print s.e1, '\t', s.e2

        # Looks for sentences matching the seed instances
        count_matches, matched_tuples = self.match_seeds_tuples()

        if len(matched_tuples) == 0:
            print "\nNo seed matches found"
            sys.exit(0)
        else:
            print "\nNumber of seed matches found"
            sorted_counts = sorted(count_matches.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
            for t in sorted_counts:
                print t[0][0], '\t', t[0][1], t[1]
            print "\n", len(matched_tuples), "tuples matched"

            # Cluster the matched instances: generate patterns
            print "\nClustering matched instances to generate patterns"

            if len(self.patterns) == 0:
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by less than
                # 'min_pattern_support' tuples
                new_patterns = [p for p in self.patterns
                                if len(p.tuples) >
                                self.config.min_pattern_support]
                self.patterns = new_patterns

            else:
                # Parallelize single-pass clustering
                # Each tuple must be compared with each extraction pattern
                # Map:
                # - Divide the tuples into smaller lists,
                #   according to the number of CPUs
                # - Pass to each CPU a sub-list of tuples and all the
                #   patterns; the comparison is done by each CPU
                # Merge:
                # - Each CPU sends the updated patterns and new patterns
                #   back to the parent process
                # - Merge patterns based on their pattern_id
                # - Cluster newly created patterns with
                #   single-pass clustering

                # make a copy of the extraction patterns to be
                # passed to each CPU
                patterns = [list(self.patterns)
                            for _ in range(self.num_cpus)]

                # distribute tuples per different CPUs
                chunks = [list() for _ in range(self.num_cpus)]
                n_tuples_per_child = int(
                    math.ceil(float(len(matched_tuples)) / self.num_cpus))

                print "\n#CPUS", self.num_cpus, '\t', \
                    "Tuples per CPU", n_tuples_per_child

                chunk_n = 0
                chunk_begin = 0
                chunk_end = n_tuples_per_child

                while chunk_n < self.num_cpus:
                    chunks[chunk_n] = matched_tuples[chunk_begin:chunk_end]
                    chunk_begin = chunk_end
                    chunk_end += n_tuples_per_child
                    chunk_n += 1

                count = 0
                for c in chunks:
                    print "CPU_" + str(count), " ", len(c), "tuples"
                    count += 1

                pipes = [multiprocessing.Pipe(False)
                         for _ in range(self.num_cpus)]
                processes = [
                    multiprocessing.Process(
                        target=self.cluster_tuples_parallel,
                        args=(patterns[i], chunks[i], pipes[i][1]))
                    for i in range(self.num_cpus)
                ]

                print "\nRunning", len(processes), "processes"
                for proc in processes:
                    proc.start()

                # Receive and merge all patterns by 'pattern_id'.
                # Newly created patterns (new pattern_id) go into
                # 'child_patterns' and are then merged
                # by single-pass clustering between patterns
                child_patterns = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    patterns = data[1]
                    for p_updated in patterns:
                        pattern_exists = False
                        for p_original in self.patterns:
                            if p_original.id == p_updated.id:
                                p_original.tuples.update(p_updated.tuples)
                                pattern_exists = True
                                break
                        if pattern_exists is False:
                            child_patterns.append(p_updated)

                for proc in processes:
                    proc.join()

                print "\nSELF Patterns:"
                for p in self.patterns:
                    p.merge_all_tuples_bet()
                    print '\n' + str(p.id)
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for bet_words in p.bet_uniques_words:
                            print "BET", bet_words.encode("utf8")

                print "\nChild Patterns:"
                for p in child_patterns:
                    p.merge_all_tuples_bet()
                    print '\n' + str(p.id)
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for bet_words in p.bet_uniques_words:
                            print "BET", bet_words.encode("utf8")

                print len(child_patterns), "newly created patterns"

                # merge/aggregate similar patterns generated by
                # the child processes; start by comparing the
                # smaller ones with the larger ones
                child_patterns.sort(key=lambda y: len(y.tuples),
                                    reverse=False)
                count = 0
                new_list = list(self.patterns)
                for p1 in child_patterns:
                    print "\nNew Patterns", len(child_patterns), \
                        "Processed", count
                    print "New List", len(new_list)
                    print "Pattern:", p1.id, "Tuples:", len(p1.tuples)
                    max_similarity = 0
                    max_similarity_cluster = None
                    for p2 in new_list:
                        if p1 == p2:
                            continue
                        score = self.similarity_cluster(p1, p2)
                        if score > max_similarity:
                            max_similarity = score
                            max_similarity_cluster = p2
                    if max_similarity >= self.config.threshold_similarity:
                        for t in p1.tuples:
                            max_similarity_cluster.tuples.add(t)
                    else:
                        new_list.append(p1)
                    count += 1

                # add merged patterns to the main patterns structure
                for p in new_list:
                    if p not in self.patterns:
                        self.patterns.append(p)

            if self.curr_iteration == 0 and len(self.patterns) == 0:
                print "No patterns generated"
                sys.exit(0)

            print "\n", len(self.patterns), "patterns generated"

            # merge equal tuples inside patterns to make
            # fewer comparisons when collecting instances
            for p in self.patterns:
                # if only the BET context is being used,
                # merge only based on BET contexts
                if self.config.alpha == 0 and self.config.gamma == 0:
                    p.merge_all_tuples_bet()

            if PRINT_PATTERNS is True:
                print "\nPatterns:"
                for p in self.patterns:
                    print '\n' + str(p.id)
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for bet_words in p.bet_uniques_words:
                            print "BET", bet_words
                    else:
                        for t in p.tuples:
                            print "BEF", t.bef_words
                            print "BET", t.bet_words
                            print "AFT", t.aft_words
                            print "========"

            # Look for sentences with occurrences of the seeds'
            # semantic types (e.g., ORG - LOC); these were already
            # collected and are stored in self.processed_tuples.
            #
            # Measure the similarity of each occurrence with
            # each extraction pattern and store each pattern that has a
            # similarity higher than a given threshold
            #
            # Each candidate tuple will then have a number of patterns
            # that extracted it, each with an associated degree of match.
print "\nNumber of tuples to be analyzed:", \ len(self.processed_tuples) print "\nCollecting instances based on", \ len(self.patterns), "extraction patterns" # create copies of generated extraction patterns # to be passed to each process patterns = [list(self.patterns) for _ in range(self.num_cpus)] # copy all tuples into a Queue shared by all processes manager = multiprocessing.Manager() queue = manager.Queue() for t in self.processed_tuples: queue.put(t) # each distinct process receives as arguments: # - a list, copy of all the original extraction patterns # - a Queue of the tuples # - a pipe to return the collected tuples and updated # patterns to the parent process pipes = [ multiprocessing.Pipe(False) for _ in range(self.num_cpus) ] processes = [ multiprocessing.Process(target=self.find_instances, args=(patterns[i], queue, pipes[i][1])) for i in range(self.num_cpus) ] print "Running", len(processes), " processes" for proc in processes: proc.start() # structures to store each process altered patterns # and collected tuples patterns_updated = list() collected_tuples = list() for i in range(len(pipes)): data = pipes[i][0].recv() child_pid = data[0] patterns = data[1] tuples = data[2] print child_pid, "patterns", len(patterns), \ "tuples", len(tuples) patterns_updated.extend(patterns) collected_tuples.extend(tuples) for proc in processes: proc.join() # Extraction patterns aggregation happens here: for p_updated in patterns_updated: for p_original in self.patterns: if p_original.id == p_updated.id: p_original.positive += p_updated.positive p_original.negative += p_updated.negative p_original.unknown += p_updated.unknown # Index the patterns in an hashtable for later use for p in self.patterns: self.patterns_index[p.id] = p # update all patterns confidence for p in self.patterns: p.update_confidence(self.config) if PRINT_PATTERNS is True: print "\nPatterns:" for p in self.patterns: print p.id print "Positive", p.positive print "Negative", p.negative print "Pattern Confidence", p.confidence print "\n" # Candidate tuples aggregation happens here: print "Collecting generated candidate tuples" for e in collected_tuples: t = e[0] pattern_best = e[1] sim_best = e[2] # if this tuple was already extracted, check if this # extraction pattern is already associated with it, if not, # associate this pattern with it and similarity score if t in self.candidate_tuples: t_patterns = self.candidate_tuples[t] if t_patterns is not None: if pattern_best not in [x[0] for x in t_patterns]: self.candidate_tuples[t].append( (self.patterns_index[pattern_best.id], sim_best)) # if this tuple was not extracted before, associate this # pattern with the instance and the similarity score else: self.candidate_tuples[t].append( (self.patterns_index[pattern_best.id], sim_best)) # update tuple confidence based on patterns confidence print "\n\nCalculating tuples confidence" for t in self.candidate_tuples.keys(): confidence = 1 t.confidence_old = t.confidence for p in self.candidate_tuples.get(t): confidence *= 1 - (p[0].confidence * p[1]) t.confidence = 1 - confidence if self.curr_iteration > 0: t.confidence = \ t.confidence * self.config.wUpdt + \ t.confidence_old * (1 - self.config.wUpdt) # sort tuples by confidence and print if PRINT_TUPLES is True: extracted_tuples = self.candidate_tuples.keys() tuples_sorted = sorted(extracted_tuples, key=lambda tl: tl.confidence, reverse=True) for t in tuples_sorted: print t.sentence print t.e1, t.e2 print t.confidence print "\n" # update seed set of tuples to use in next iteration # 
seeds = { T | conf(T) > instance_confidence } print "Adding tuples to seed with confidence >=" + \ str(self.config.instance_confidence) for t in self.candidate_tuples.keys(): if t.confidence >= self.config.instance_confidence: seed = Seed(t.e1, t.e2) self.config.positive_seed_tuples.add(seed) # increment the number of iterations self.curr_iteration += 1 self.write_relationships_to_disk()
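# Neither 'cluster_tuples_parallel' nor 'find_instances' is shown in this
# excerpt. The function below is only a sketch of the worker protocol that
# the calling code above implies for 'find_instances': drain the shared
# Queue, keep the best-scoring pattern per tuple, and send a
# (pid, patterns, collected_tuples) triple back through the pipe.
# 'score' is a hypothetical stand-in for the real similarity computation.
def find_instances_sketch(patterns, instances, child_conn, score):
    import os
    import Queue  # 'queue' on Python 3

    collected_tuples = list()
    while True:
        try:
            t = instances.get_nowait()
        except Queue.Empty:
            break
        sim_best = 0
        pattern_best = None
        for p in patterns:
            s = score(t, p)
            if s > sim_best:
                sim_best = s
                pattern_best = p
        if pattern_best is not None:
            collected_tuples.append((t, pattern_best, sim_best))

    # same triple format the parent unpacks:
    # data[0]=pid, data[1]=patterns, data[2]=tuples
    child_conn.send((os.getpid(), patterns, collected_tuples))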