def gen_sum(document, words=12):
    """Return the single highest-ranked TopicRank keyphrase for *document*.

    The text is written to a temporary file first because pke reads its
    input from disk.

    Parameters:
        document: raw text to extract a keyphrase from.
        words: unused; kept only for backward compatibility with callers.

    Returns:
        The top-scored keyphrase as a string, or '' when the extractor
        produces no candidates.
    """
    import os
    import tempfile

    # BUG FIX: the original wrote to a fixed shared 'tmp.txt', so two
    # concurrent callers would clobber each other; use a unique temp file.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(document)
        tmp_path = f.name
    try:
        extractor = pke.TopicRank(tmp_path)
        extractor.read_document(format="raw", stemmer=None)
        extractor.candidate_selection()
        extractor.candidate_weighting()
        keyphrases = extractor.get_n_best(n=10)
    finally:
        # Clean up the temp file even if extraction raises.
        os.remove(tmp_path)
    # BUG FIX: the original indexed keyphrases[0] unconditionally and
    # raised IndexError on documents with no candidates.
    return str(keyphrases[0][0]) if keyphrases else ''
def getKeyphrases(file_name, N):
    """Return up to *N* keyphrases combining TopicRank and TfIdf rankings.

    Both extractors run over *file_name*; their top 2*N results are
    normalized, concatenated, sorted, and the first N entries returned.

    Parameters:
        file_name: path to the input document pke should read.
        N: number of keyphrases to return.
    """
    extr = pke.TopicRank(input_file=file_name)
    extr_2 = pke.TfIdf(input_file=file_name)

    extr.read_document(format='raw')
    extr.candidate_selection()
    extr.candidate_weighting()

    extr_2.read_document(format='raw')
    extr_2.candidate_selection()
    extr_2.candidate_weighting()

    keyphrase = normalize(extr.get_n_best(n=2 * N))
    keyphrase += normalize(extr_2.get_n_best(n=2 * N))

    # BUG FIX: the original called sorted() and discarded its return value,
    # so the combined list was never actually ordered before slicing.
    # NOTE(review): key=min is suspect for (phrase, score) pairs — confirm
    # the intended ordering against whatever normalize() returns.
    keyphrase.sort(key=min)
    return keyphrase[0:N]
def get_keyphrases_pke(infile, mode='topic', stoplist_path=None, postags=None, ntop=100):
    """Extract keyphrases from *infile* using one of pke's unsupervised models.

    Parameters:
        infile: path to the raw-text document to process.
        mode: one of 'topic' (TopicRank), 'single' (SingleRank),
            'tfidf' (TfIdf) or 'kpminer' (KPMiner).
        stoplist_path: stopword file; defaults to 'SmartStoplist.txt'.
        postags: POS tags for candidate selection (TopicRank only);
            defaults to nouns, adjectives and a couple of verb forms.
        ntop: number of keyphrases to return.

    Returns:
        A list of (keyphrase, score) tuples, or [] when extraction fails.
        Exits the process on an invalid *mode*.
    """
    if stoplist_path is None:
        stoplist_path = 'SmartStoplist.txt'
    stoplist = load_stop_words(stoplist_path)
    if postags is None:
        postags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBN', 'VBD']

    # Validate the mode up front instead of falling through four branches.
    if mode not in ('topic', 'single', 'tfidf', 'kpminer'):
        print("Invalid keyphrase extraction algorithm: %s" % mode)
        print("Valid PKE algorithms: [topic, single, kpminer, tfidf]")
        exit(1)

    # BUG FIX: the original's bare `except:` set phrases = [] but then
    # unconditionally called extractor.get_n_best() afterwards, raising
    # NameError when construction had failed. Return [] on failure instead.
    try:
        if mode == 'topic':
            # TopicRank unsupervised method.
            extractor = pke.TopicRank(input_file=infile, language='english')
            extractor.read_document(format='raw', stemmer=None)
            extractor.candidate_selection(stoplist=stoplist, pos=postags)
            extractor.candidate_weighting(threshold=0.25, method='average')
        elif mode == 'single':
            # SingleRank unsupervised method.
            extractor = pke.SingleRank(input_file=infile, language='english')
            extractor.read_document(format='raw', stemmer=None)
            extractor.candidate_selection(stoplist=stoplist)
            extractor.candidate_weighting(normalized=True)
        elif mode == 'tfidf':
            # TfIdf unsupervised method.
            extractor = pke.TfIdf(input_file=infile, language='english')
            extractor.read_document(format='raw', stemmer=None)
            extractor.candidate_selection(stoplist=stoplist)
            extractor.candidate_weighting()
        else:  # mode == 'kpminer'
            # KPMiner unsupervised method.
            extractor = pke.KPMiner(input_file=infile, language='english')
            extractor.read_document(format='raw', stemmer=None)
            extractor.candidate_selection(stoplist=stoplist)
            extractor.candidate_weighting()
        return extractor.get_n_best(ntop, redundancy_removal=True)
    except Exception:
        # Best-effort: any extraction failure yields an empty result,
        # matching the original's intent (narrowed from a bare except).
        return []
def main():
    """Evaluate TopicRank keyphrase extraction against gold-standard keys.

    Samples 30 .txt documents from ../crowd_data, extracts the top-10
    keyphrases for each document that has a matching .key file, and prints
    mean precision, recall and F1 over the sample.
    """
    begin = time.time()
    os.chdir('..')
    os.chdir('./crowd_data')
    files = os.listdir('.')

    init_txt = [f for f in files if f.endswith('.txt')]
    init_txt = random.sample(init_txt, 30)
    # BUG FIX: the original used re.sub('.txt', '', i) where the unescaped,
    # unanchored '.' matches any character; anchor and escape the suffix.
    text_name = [re.sub(r'\.txt$', '', i) for i in init_txt]

    init_key = [f for f in files if f.endswith('.key')]
    key_name = [re.sub(r'\.key$', '', i) for i in init_key]

    # Pair every sampled text with its gold-standard .key file, if present.
    pos_inp_in_out = []
    txt = []
    for name in text_name:
        if name in key_name:
            pos_inp_in_out.append(key_name.index(name))
            txt.append(name + ".txt")
    key_file = [init_key[i] for i in pos_inp_in_out]

    precision = []
    recall = []
    f1_score = []
    for i in range(len(txt)):
        print("File = " + str(txt[i]))
        # Initialize the keyphrase extraction model, here TopicRank.
        extractor = pke.TopicRank(input_file=txt[i])
        # Load the document: raw text, preprocessing carried out by nltk.
        extractor.read_document(format='raw')
        # Candidate selection: sequences of nouns and adjectives.
        extractor.candidate_selection()
        # Candidate weighting via TopicRank's random-walk algorithm.
        extractor.candidate_weighting()
        # Ten highest-scored candidates as (keyphrase, score) tuples.
        keyphrases = extractor.get_n_best(n=10)
        key = [k for k, _ in keyphrases]

        # BUG FIX: the original opened the key file and never closed it.
        with open(key_file[i], mode='r') as fh:
            y_true = fh.read().split('\n')

        # Truncate the longer list so predicted and gold sets align.
        if len(y_true) > len(key):
            y_true = y_true[0:len(key)]
        else:
            key = key[0:len(y_true)]
        y_true = [y.lower() for y in y_true]
        print(key)
        print(y_true)

        tp = sum(1 for k in key if k in y_true)
        fp = len(key) - tp
        fn = sum(1 for k in y_true if k not in key)

        # BUG FIX: guard zero denominators (the original raised
        # ZeroDivisionError on empty extraction results).
        p = tp * 1.0 / (tp + fp) if (tp + fp) else 0.0
        print("p= " + str(p))
        r = tp * 1.0 / (tp + fn) if (tp + fn) else 0.0
        print("r= " + str(r))
        # NOTE(review): the +0.1 smoothing term deviates from the standard
        # F1 formula; preserved as-is pending confirmation of intent.
        f1 = 2 * p * r * 1.0 / (p + r + 0.1)
        precision.append(p)
        recall.append(r)
        f1_score.append(f1)
        print("\n\n")

    print("######################################################3")
    print("LEn" + str(len(txt)))
    print("Precision = " + str(np.mean(precision)))
    print("Recall = " + str(np.mean(recall)))
    print("F1-score = " + str(np.mean(f1_score)))
    print("Time " + str(time.time() - begin))