def run():
    """Run feature extraction on the stored, normalized noun-phrase list.

    The phrases are read back from disk rather than rebuilt on each call so
    that every run works on the same input.
    """
    # Per the code that used to live here (now disabled), the file below was
    # written once from getAllPhrases() with empty strings removed and the
    # remainder sorted.
    normalized_path = '../../outputs/np_extract_all_normalized.txt'
    extractFeatures(fileHandler.getwords(normalized_path, split=False))
def anotherrun(repeat=False):
    """Load the known-phrase scores and optionally retry the zero entries.

    Args:
        repeat: When true, call ``secondrun`` repeatedly (at most 10 rounds)
            while ``checkzeroscores`` still reports entries, then write the
            refreshed scores back to the same file.

    Returns:
        The score list, refreshed if ``repeat`` was set, otherwise as read
        from disk.
    """
    scores_path = '../../outputs/knownphrase/knowphrase_all_v2.txt'
    rawscores = filehandler.getwords(scores_path, split=False)
    # The original comment describes this as "{idx, score}"; presumably the
    # entries whose score is still zero -- confirm against checkzeroscores.
    pending = checkzeroscores(rawscores)

    if not repeat:
        return rawscores

    # Refill the zero values caused by Google blocking the earlier queries.
    rounds = 0
    while not (len(pending) <= round(0.0 * len(rawscores)) or rounds >= 10):
        rawscores = secondrun(pending, rawscores)
        pending = checkzeroscores(rawscores)
        rounds += 1

    # NOTE(review): the source's indentation was lost; the write-back is
    # placed after the retry loop, inside the ``repeat`` path -- confirm.
    filehandler.writeListToFile(rawscores, scores_path)
    return rawscores
def printHighQPhrases(debug=False):
    """Write every phrase whose stored score equals 4 to ``../../tmp/kp4.txt``.

    Phrases come from ``getPhrases()`` with ``removePosFromWord`` applied to
    each space-separated token; scores are read from the known-phrase score
    file and matched to phrases by position.

    Args:
        debug: When true, also print the full phrase list and its length.
    """
    cleaned = [
        ' '.join(removePosFromWord(token) for token in tagged.split(' '))
        for tagged in getPhrases().keys()
    ]
    scores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)

    # Indexing (not zip) is deliberate: a score without a matching phrase
    # still raises IndexError, exactly as before.
    selected = [
        cleaned[position]
        for position, score in enumerate(scores)
        if int(score) == 4
    ]

    print("len of high quality phrase: ", len(selected))
    if debug:
        # NOTE(review): the source's indentation was lost; both prints are
        # assumed to belong to the debug branch -- confirm.
        print(cleaned)
        print("length of total phrases: ", len(cleaned))
    filehandler.writeListToFile(selected, '../../tmp/kp4.txt')
def removePosTagFromDict(self, all_phrases_raw_pos):
    """Re-key a phrase dict without its ``%TAG`` markers and collect scores.

    Each key has every ``%`` + uppercase-letters run removed (presumably the
    POS tags). The i-th key, in iteration order, is paired with the i-th entry
    of the known-phrase score file. When several keys collapse to the same
    stripped key, the later value wins in the phrase dict and the larger score
    (as compared by ``max``) is kept in the score dict.

    Args:
        all_phrases_raw_pos: Mapping keyed by tagged phrases.

    Returns:
        A ``(phrases, scores)`` tuple of dicts keyed by the stripped phrase.
    """
    score_lines = fileHandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    print("all_phrases_raw_pos: ", all_phrases_raw_pos)

    stripped_phrases = {}
    best_scores = {}
    for position, tagged_key in enumerate(all_phrases_raw_pos):
        plain_key = re.sub(r'%[A-Z]+\b', '', tagged_key)
        stripped_phrases[plain_key] = all_phrases_raw_pos[tagged_key]

        score = score_lines[position]
        if plain_key in best_scores:
            score = max(best_scores[plain_key], score)
        best_scores[plain_key] = score

    return stripped_phrases, best_scores
# NOTE(review): the statement below is the tail of a definition that starts
# outside this chunk. It is preserved as-is; restore its indentation to match
# the enclosing block.
print(ex)


def partition_worker(words, pid):
    """Fetch a two-sentence Wikipedia summary for each word in one partition.

    Each summary is written on its own line (embedded newlines become spaces)
    to ``../../tmp/wiki_quality_sentences_<pid>.txt``. Words that succeeded
    are written to ``../../outputs/wiki_work_<pid>.txt``. A word whose lookup
    or write raises is reported on stdout and skipped.

    Args:
        words: Iterable of titles to look up.
        pid: Partition identifier used in both output file names.
    """
    succeeded = []
    sentences_path = '../../tmp/wiki_quality_sentences_{}.txt'.format(pid)
    with open(sentences_path, 'w') as out:
        for title in tqdm(words):
            # Best effort: one failing title must not abort the partition.
            try:
                summary = wikipedia.summary(
                    title, sentences=2, auto_suggest=True).replace('\n', ' ')
                out.write("%s\n" % summary)
                succeeded.append(title)
            except Exception as err:
                print(err)
    filehandler.writeListToFile(
        succeeded, "../../outputs/wiki_work_{}.txt".format(pid))


if __name__ == '__main__':
    # Process one hard-coded slice of the input list as partition 1.
    words = filehandler.getwords('../../input/wiki_quality.txt', split=False)
    partition_worker(words[3401:3500], 1)
def getAllPhrases():
    """Return the distinct phrases found across the three extraction rounds.

    Returns:
        A list holding the union of the entries of ``np_extract_r1.txt``,
        ``np_extract_r2.txt`` and ``np_extract_r3.txt``; the order is whatever
        the resulting set yields.
    """
    first, second, third = (
        set(fileHandler.getwords(
            '../../outputs/np_extract_r{}.txt'.format(round_no), split=False))
        for round_no in (1, 2, 3)
    )
    return list(first | second | third)
def integratelist():
    """Merge the 13 partial known-phrase score files into one combined file.

    ``knowphrase_0.txt`` through ``knowphrase_12.txt`` are read in numeric
    order and concatenated; the combined list and its length are printed and
    the list is written to ``knowphrase_all_v2.txt``.
    """
    part_template = '../../outputs/knownphrase/knowphrase_{}.txt'
    scores = filehandler.getwords(part_template.format(0), split=False)
    for part_no in range(1, 13):
        scores = scores + filehandler.getwords(
            part_template.format(part_no), split=False)

    print(scores)
    print("len of scores: ", len(scores))
    # The zero-score items in this file are meant to be picked out and re-run
    # in a later step.
    filehandler.writeListToFile(
        scores, '../../outputs/knownphrase/knowphrase_all_v2.txt')
def writePhrasesWithoutDuplicates():
    """Write the de-duplicated multi-word phrases from ``../../tmp/kp4.txt``.

    Duplicates are dropped keeping the first occurrence, entries without a
    space are discarded, and the remainder is written to
    ``../../outputs/is_known_phrase_nodup.txt``.
    """
    loaded = filehandler.getwords("../../tmp/kp4.txt", split=False)
    # dict.fromkeys removes repeats while preserving first-seen order.
    multi_word = [entry for entry in dict.fromkeys(loaded) if ' ' in entry]
    filehandler.writeListToFile(
        multi_word, "../../outputs/is_known_phrase_nodup.txt")