import pickle
from collections import defaultdict

import numpy as np
from spacy.matcher import Matcher

# `nlp` (a loaded spaCy pipeline), `partition` (corpus chunking), and
# `prDualRank` (the core inference routine) are assumed to be defined
# elsewhere in this repo.


def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    # Map phrases and patterns to integer ids (and back).
    phrase2id = {phrase: i for i, phrase in enumerate(unranked_phrases)}
    id2phrase = {i: phrase for i, phrase in enumerate(unranked_phrases)}
    id2pattern = {i: pattern for i, pattern in enumerate(unranked_patterns)}

    # Seed phrases from T_0 start with zero confidence.
    seedIdwConfidence = {val: 0.0 for key, val in phrase2id.items() if key in T_0}

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    # context_matrix[j, i] holds c(t, p): how often phrase j is extracted by
    # pattern i over the corpus.
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                # The extracted phrase starts at the first POS token of the
                # pattern; everything before it is context to skip over.
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])  # spaCy 2.x signature
                for match_id, start, end in matcher(doc):
                    span = doc[start + offset:end].text
                    j = phrase2id.get(span)
                    if j is None:  # matched text is not a candidate phrase
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")

    # Support of a phrase/pattern = its total extraction count in the matrix.
    id2sup = {}
    for id in id2patterns.keys():
        id2sup[id] = context_matrix[id, :].sum()
    pattern2sup = {}
    for pattern in pattern2ids.keys():
        pattern2sup[pattern] = context_matrix[:, pattern].sum()

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(
        seedIdwConfidence, [], id2patterns, pattern2ids, {}, {}, {}, {},
        id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
        FLAGS_VERBOSE=False, FLAGS_DEBUG=False)
    return l1, l2, l3, l4, m1, m2, m3, m4
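
# A minimal usage sketch, assuming spaCy token-pattern dicts and a plain-text
# corpus; the seed set, patterns, phrases, and filename below are illustrative
# placeholders, not values shipped with this repo:
#
#   T_0 = {"machine learning"}
#   unranked_patterns = [[{"LOWER": "such"}, {"LOWER": "as"}, {"POS": "NOUN"}]]
#   unranked_phrases = ["machine learning", "deep learning"]
#   l1, l2, l3, l4, m1, m2, m3, m4 = run_prdualrank(
#       T_0, unranked_patterns, unranked_phrases, "corpus.txt")
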
# Variant of run_prdualrank that loads a precomputed context matrix instead of
# re-scanning the corpus with the spaCy Matcher. Being defined second, it
# shadows the definition above when both live in this module; the `file`
# argument is accepted for interface compatibility but is unused here.
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    global final_patterns, final_keywords, pattern_to_score_map, \
        keyword_to_score_map, ngram_prob_map, phrase_seg_score, \
        removed_phrases, wiki_score_cache, error_count, total_ngram_counts

    # Map phrases and patterns to integer ids (and back).
    phrase2id = {phrase: i for i, phrase in enumerate(unranked_phrases)}
    id2phrase = {i: phrase for i, phrase in enumerate(unranked_phrases)}
    id2pattern = {i: pattern for i, pattern in enumerate(unranked_patterns)}

    # Seed phrases from T_0 start with zero confidence.
    seedIdwConfidence = {val: 0.0 for key, val in phrase2id.items() if key in T_0}

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    # Load the precomputed (phrases x patterns) co-occurrence counts.
    with open('../development_ipynbs/context_matrix.pickle', 'rb') as f:
        context_matrix = pickle.load(f)
    print("[LOG] Loaded the context matrix. Shape: " + str(context_matrix.shape))

    # Rebuild the phrase<->pattern adjacency from the nonzero counts.
    for i in range(len(unranked_patterns)):
        for j in range(len(unranked_phrases)):
            if context_matrix[j, i] > 0:
                id2patterns[j].add(i)
                pattern2ids[i].add(j)

    # Support of a phrase/pattern = its total extraction count in the matrix;
    # entries with no matches keep the default support of 0.
    id2sup = {i: 0 for i in range(len(unranked_phrases))}
    pattern2sup = {i: 0 for i in range(len(unranked_patterns))}
    for id in id2patterns.keys():
        id2sup[id] = context_matrix[id, :].sum()
    for pattern in pattern2ids.keys():
        pattern2sup[pattern] = context_matrix[:, pattern].sum()

    print("[LOG] Initiating PR Dual Rank inference.")
    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(
        seedIdwConfidence, [], id2patterns, pattern2ids, {}, {}, {}, {},
        id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
        FLAGS_VERBOSE=True, FLAGS_DEBUG=True)
    print("[LOG] Ended PR Dual Rank inference.")
    return l1, l2, l3, l4, m1, m2, m3, m4
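
# A minimal sketch of producing the pickle consumed by the variant above. The
# helper name save_context_matrix is an assumption for illustration; only the
# pickle path comes from this file. The matrix itself would be the
# (phrases x patterns) count array built by the first definition.
def save_context_matrix(context_matrix, path='../development_ipynbs/context_matrix.pickle'):
    # Persist the co-occurrence counts so later runs can skip the expensive
    # corpus scan with the spaCy Matcher.
    with open(path, 'wb') as f:
        pickle.dump(context_matrix, f)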