def checkingKnownPhrases(self, repeat=False):
    rawscores = fileHandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    run_dict = self.checkzeroscores(rawscores)
    cnt = 0
    # Refill the 0 scores left behind when Google blocked a request;
    # retry until none remain, with an upper bound of 10 passes.
    if repeat:
        while run_dict and cnt < 10:
            rawscores = knownphrase.secondrun(run_dict, rawscores)
            run_dict = self.checkzeroscores(rawscores)
            cnt += 1
    fileHandler.writeListToFile(
        rawscores, '../../outputs/knownphrase/knowphrase_all.txt')
    # Update every pattern with its raw score. The score dict itself is
    # built in removePosFromDict(), so the two must be aligned here.
    assert len(self.scoresdict) == len(self.patterns)
    for i in range(len(self.patterns)):
        phrase = self.patterns[i].phrase
        self.patterns[i].is_know_phrase = self.scoresdict[phrase]
    return rawscores
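# checkzeroscores() is implemented elsewhere; the sketch below only
# illustrates the contract the retry loop above relies on, inferred from
# usage: map the index of every still-unscored (zero) entry to its value,
# so only those entries get re-queried. Hypothetical, not the real code.
def checkzeroscores_sketch(rawscores):
    return {i: s for i, s in enumerate(rawscores) if int(s) == 0}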
def simplerun():
    phrases = list(getPhrases().keys())
    st1 = 4633
    end1 = 5633
    tmp = phrases[st1:end1]
    output = [checkgoogle(t) for t in tmp]
    filehandler.writeListToFile(
        output,
        "../../outputs/knownphrase/knowphrase_{}.txt".format(5),
        append=False)
def partition_worker(phrases, pid):
    """Score one partition of phrases and persist the partial results."""
    # Note: the original opened '../../tmp/wiki_quality_sentences_{pid}.txt'
    # here but never wrote to it (a leftover from the Wikipedia worker
    # below), so the unused handle has been removed.
    work = []
    for phrase in tqdm(phrases):
        try:
            score = checkgoogle(phrase)
            work.append(score)
        except Exception as ex:
            print(ex)
    filehandler.writeListToFile(
        work, "../../outputs/knownphrases/work_{}.txt".format(pid))
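# checkgoogle() is defined elsewhere; the sketch below is a hypothetical
# implementation showing only the contract the workers rely on: query the
# phrase as an exact match and map the response to a small integer score,
# with 0 reserved for blocked/failed requests so those entries can be
# re-run later. The real 0-4 scoring logic is not shown in this file.
import requests

def checkgoogle_sketch(phrase):
    try:
        r = requests.get('https://www.google.com/search',
                         params={'q': '"{}"'.format(phrase)},
                         headers={'User-Agent': 'Mozilla/5.0'},
                         timeout=5)
        if r.status_code != 200:
            return 0  # blocked or throttled -> picked up by the retry pass
        return 4 if phrase.lower() in r.text.lower() else 1
    except requests.RequestException:
        return 0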
def partition_worker(words, pid):
    """Fetch a two-sentence Wikipedia summary for each word in this partition."""
    work = []
    with open('../../tmp/wiki_quality_sentences_{}.txt'.format(pid), 'w') as f:
        for word in tqdm(words):
            try:
                sent = wikipedia.summary(word, sentences=2, auto_suggest=True)
                sent = sent.replace('\n', ' ')
                f.write("%s\n" % sent)
                work.append(word)
            except Exception as ex:
                print(ex)
    filehandler.writeListToFile(work, "../../outputs/wiki_work_{}.txt".format(pid))
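# A minimal sketch of how these partition workers might be fanned out; the
# project's actual dispatcher is not shown here, and `nworkers` is an
# illustrative choice.
from multiprocessing import Process

def dispatch_sketch(words, nworkers=4):
    size = len(words) // nworkers + 1
    procs = [Process(target=partition_worker,
                     args=(words[i * size:(i + 1) * size], i))
             for i in range(nworkers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()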
def run():
    t5_list = filehandler.getwords('../input/t5.txt')
    t6_list = filehandler.getwords('../input/t6.txt')
    list_diff = list(set(t5_list) - set(t6_list))
    filehandler.writeListToFile(list_diff, "t4Dt5.txt")

    t5_dict = filehandler.getwordswithscore('../input/t5.txt')
    list_diff_score = [(t5_dict[t] + '\t' + t) for t in list_diff]
    # Sort numerically; a plain string sort would order "9" after "10".
    list_diff_score = sorted(list_diff_score,
                             key=lambda t: float(t.split('\t')[0]),
                             reverse=True)
    print(list_diff_score)
    filehandler.writeListToFile(list_diff_score, "t4Dt5_score.txt")
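# Note: set difference does not preserve file order. If order matters, a
# set-backed variant keeps t5's order at O(n) cost (a sketch, not part of
# the original code):
def ordered_diff(a, b):
    b_set = set(b)
    return [x for x in a if x not in b_set]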
def anotherrun(repeat=False):
    # run_dict maps index -> score for entries that still need a score
    rawscores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    run_dict = checkzeroscores(rawscores)
    cnt = 0
    # Refill the 0 values caused by Google blocking the requests,
    # up to 10 retry passes.
    if repeat:
        while run_dict and cnt < 10:
            rawscores = secondrun(run_dict, rawscores)
            run_dict = checkzeroscores(rawscores)
            cnt += 1
    filehandler.writeListToFile(
        rawscores, '../../outputs/knownphrase/knowphrase_all_v2.txt')
    return rawscores
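# secondrun() is defined in this module; the sketch below captures the
# behaviour the retry loop assumes: re-query only the indices recorded in
# run_dict and patch the fresh scores back into the full list. The extra
# `phrases` argument (index-aligned with rawscores) is an assumption made
# to keep the example self-contained; the real secondrun takes two args.
def secondrun_sketch(run_dict, rawscores, phrases):
    for idx in run_dict:
        rawscores[idx] = str(checkgoogle(phrases[idx]))
    return rawscores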
def printHighQPhrases(debug=False):
    phrases = list(getPhrases().keys())
    phrases = [
        ' '.join([removePosFromWord(t) for t in phrase.split(' ')])
        for phrase in phrases
    ]
    scores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    # A score of 4 marks a high-quality (known) phrase.
    output = [phrases[i] for i in range(len(scores)) if int(scores[i]) == 4]
    print("len of high quality phrases: ", len(output))
    if debug:
        print(phrases)
        print("length of total phrases: ", len(phrases))
    filehandler.writeListToFile(output, '../../tmp/kp4.txt')
def run(n):
    phrases = list(getPhrases().keys())
    ed1 = 5150
    for i in range(n):
        try:
            st1 = ed1
            ed1 = st1 + 700
            tmp = phrases[st1:ed1]
            output = [checkgoogle(t) for t in tmp]
            filehandler.writeListToFile(
                output,
                "../../outputs/knownphrase/knowphrase_{}.txt".format(i + 12),
                append=False)
        except Exception as e:
            print(e)
            continue
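# The hand-tuned offsets above (start 5150, 700-item windows, file index
# i + 12) continue earlier partial runs. A generic batching helper like
# this sketch (hypothetical, not in the original pipeline) avoids the
# manual bookkeeping:
def batches(items, size):
    for start in range(0, len(items), size):
        yield items[start:start + size]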
def integratelist():
    # Concatenate the partial score files produced by the batched runs.
    scores = []
    for i in range(13):
        scores += filehandler.getwords(
            '../../outputs/knownphrase/knowphrase_{}.txt'.format(i),
            split=False)
    print(scores)
    print("len of scores: ", len(scores))
    # Zero-score items are picked out and re-run later (see anotherrun()).
    filehandler.writeListToFile(
        scores, '../../outputs/knownphrase/knowphrase_all_v2.txt')
def clustering(self):
    label_dict = getLabels(method='ward')
    # Collect the phrases belonging to each of the five cluster labels;
    # extend range(5) if more clusters are used.
    groups = [
        [self.patterns[i].phrase for i in label_dict if label_dict[i] == k]
        for k in range(5)
    ]
    print("clustering=====")
    for k, group in enumerate(groups, start=1):
        print("group{} length: ".format(k), len(group))
        fileHandler.writeListToFile(
            group, "../../outputs/features_group{}_part.txt".format(k))
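# getLabels() lives in another module; this sketch shows one plausible
# implementation consistent with method='ward' and the {index: label}
# usage above, assuming a feature matrix X (one row per pattern) is
# available. Hypothetical, not the project's real code.
from scipy.cluster.hierarchy import linkage, fcluster

def getLabels_sketch(X, method='ward', k=5):
    Z = linkage(X, method=method)
    labels = fcluster(Z, t=k, criterion='maxclust')  # labels are 1..k
    return {i: int(lab) - 1 for i, lab in enumerate(labels)}  # shift to 0..k-1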
# way 3: use trees
all_phrases_dict = run(freq=True, all_phrase_dict=all_phrases_dict)
# remove the empty keys
all_phrases_dict = {
    k: v for k, v in all_phrases_dict.items() if v is not None
}
print(all_phrases_dict)

list1 = list(all_phrases_dict.keys())
list2 = list(all_phrases_dict.values())
all_phrases_freq_list = ['\t'.join(map(str, i)) for i in zip(list2, list1)]

# sort by frequency (descending) and alphabetically (ascending)
all_phrases_freq_list = sorted(all_phrases_freq_list,
                               key=lambda x: int(x.split('\t')[0]),
                               reverse=True)
all_phrases_freq_list2 = sorted(all_phrases_freq_list,
                                key=lambda x: x.split('\t')[1])

# write output to file
writeListToFile(all_phrases_freq_list,
                '../../outputs/np_extract_with_freq.txt')
writeListToFile(all_phrases_freq_list2,
                '../../outputs/np_extract_with_freq_alpha.txt')

# also pickle the frequency dict for future use (context manager ensures
# the file handle is closed, unlike the original bare open())
with open('../../tmp/phrases_freq', 'wb') as fp:
    pickle.dump(dict(all_phrases_dict), fp)
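# The pickled frequency dict can be reloaded later like this (usage
# sketch; the helper name is illustrative):
def load_phrases_freq(path='../../tmp/phrases_freq'):
    with open(path, 'rb') as fp:
        return pickle.load(fp)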
""" chunker = nltk.RegexpParser(grammar) toks = nltk.regexp_tokenize(text, sentence_re) postoks = nltk.tag.pos_tag(toks) tree = chunker.parse(postoks) terms = get_terms(tree, freq=freq) output = [] for term in terms: name_entity = ' '.join(term) name_entity = Normalizer().cleanPhrase(name_entity) output.append(name_entity) if freq: all_phrase_dict[name_entity] += 1 if freq: return all_phrase_dict return output if __name__ == '__main__': # nltk.download('wordnet') output = run() print(output) print("length of output: " , len(output)) writeListToFile(output, '../../outputs/np_extract_r3.txt') # test = run(freq=True, all_phrase_dict=defaultdict(lambda:0)) # print(test)
                for token, pos in subtree.leaves()
            ]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            named_entity = Normalizer().cleanPhrase(named_entity)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk


if __name__ == '__main__':
    # Define a grammar & parser: one or more verbs/nouns, anything in
    # between, ending in a noun. Raw string avoids invalid-escape warnings.
    NP = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
    chunker = RegexpParser(NP)
    nz = Normalizer()

    # apply to our file
    sent = getsent('/Users/beidan/RASHIP/PDFs-TextExtract/output/section.txt')
    print(sent)

    # way 1: use the NER chunker
    chunks = get_continuous_chunks(sent)
    # way 2: use the custom chunker (see the demo sketch below)
    # chunks = get_continuous_chunks(sent, chunker.parse)
    print(chunks)
    writeListToFile(chunks, '../../outputs/np_extract_r1.txt')
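# A quick usage sketch for "way 2", the custom RegexpParser chunker; the
# example sentence is illustrative and the demo is not called by default.
def demo_custom_chunker():
    from nltk import pos_tag, word_tokenize
    grammar = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
    tagged = pos_tag(word_tokenize("The brake pedal sensor reports wheel speed."))
    tree = RegexpParser(grammar).parse(tagged)
    print(tree)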
def writePhrasesWithoutDuplicates():
    phrases = filehandler.getwords("../../tmp/kp4.txt", split=False)
    # dict.fromkeys() removes duplicates while preserving order
    phrases = list(dict.fromkeys(phrases))
    # keep only multi-word phrases
    phrases = [t for t in phrases if len(t.split(' ')) > 1]
    filehandler.writeListToFile(phrases,
                                "../../outputs/is_known_phrase_nodup.txt")