def main(test=None):
    # files setting
    semwiki_file = "../cleanData/Hulth2003/words_pair.simvalue.dict"
    clean_semwiki_file = "../cleanData/Hulth2003/words_pair.wiki.clean.dict"
    if test:
        semwiki_file = "../unitTest/dataPreprocess/cleanWikiminerResult/input.txt"
        clean_semwiki_file = "../unitTest/dataPreprocess/cleanWikiminerResult/output.txt"
    # stemmer
    stemmer = PorterStemmer()
    raw_numitems = 0
    new_numitems = 0
    tri_dict = {}
    for line in open(semwiki_file):
        raw_numitems += 1
        triparts = line.strip('\n').split(' ')
        # to lower case
        triparts[0] = triparts[0].lower()
        triparts[1] = triparts[1].lower()
        # stemming
        triparts[0] = stemmer.stem(triparts[0], 0, len(triparts[0])-1)
        triparts[1] = stemmer.stem(triparts[1], 0, len(triparts[1])-1)
        synth_key = triparts[0] + '_' + triparts[1]
        if synth_key not in tri_dict:
            tri_dict[synth_key] = triparts[2]
            new_numitems += 1
    wfd = open(clean_semwiki_file, 'w')
    for key in tri_dict.keys():
        triparts = key.split('_')
        triparts.append(tri_dict[key])
        wfd.write("%s %s %s\n" % (triparts[0], triparts[1], triparts[2]))
    wfd.close()
    print "Raw number of items: %d. After cleaning, number of items: %d.\n"\
            % (raw_numitems, new_numitems)
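# Minimal usage sketch. Passing any truthy value switches main() to the
# unit-test fixtures; with test=None it runs on the corpus paths hard-coded
# above. Input lines are assumed to look like "Word1 Word2 0.73"; after
# lowercasing, stemming and de-duplication, the first similarity value seen
# per word pair is kept.
if __name__ == '__main__':
    main(test=1)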
class GetTopicDis:
    '''Call ldaGibbs++, the open-source software, to get the topic
    distribution for words and documents. Before calling it, we first
    convert the corpus into the input format the tool requires.
    '''
    def __init__(self, src_corp_dir, lda_doc, docsuffix):
        self.src_corp_dir = src_corp_dir
        self.lda_doc = lda_doc
        self.docsuffix = docsuffix
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)
        self.doclist = []
        self.getdoclist()
        self.genmodel_inputfile()

    def getdoclist(self):
        for subdir in self.src_corp_dir:
            candi_files = os.listdir(subdir)
            # filter files by suffix if one is given
            if self.docsuffix:
                candi_files = filter(self.substr, candi_files)
            candi_files = map(lambda x: subdir+x, candi_files)
            self.doclist = self.doclist + candi_files

    # generate the file meeting the input requirements of ldaGibbs++
    def genmodel_inputfile(self):
        wfd = open(self.lda_doc, "w")
        wfd.write("%d\n" % len(self.doclist))
        for doc in self.doclist:
            docwordlist = []
            for line in open(doc):
                line = line.strip("\n\r ")
                docwordlist = docwordlist + self.filterwords(line)
            docwordlist = sorted(docwordlist, reverse=False)
            wfd.write("%s\n" % " ".join(docwordlist))
        wfd.close()

    # call Gibbs LDA
    def call_lda(self, topicnum, maxiter):
        # lda model parameter setting
        alpha = 1.0*topicnum / 50
        cmd = CMD + "-alpha " + str(alpha) + " -ntopics " + \
                str(topicnum) + " -niters " + str(maxiter) + \
                " -dfile " + self.lda_doc
        print "Calling Gibbs LDA"
        #os.popen(cmd)
        os.system(cmd)
        print "Finished calling"

    # filter words based on the stopword list and a character rule
    def filterwords(self, textline):
        save_words = []
        words = textline.split(" ")
        for word in words:
            if word == " ":
                continue
            biparts = word.split("_")
            # word processing (stopword, stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            if len(biparts) == 2 and biparts[1] in TOPIC_POS:
                if not self.stopWords.is_stopword(biparts[0])\
                        and self.pattern.match(biparts[0]):
                    save_words.append(biparts[0])
            # ============================================
        return save_words

    def substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False
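# Minimal usage sketch with illustrative paths (not part of the original
# code). It assumes the GibbsLDA++ binary referenced by the module-level CMD
# string is available, and that the corpus directories contain POS-tagged
# "word_TAG" files with the given suffix.
if __name__ == '__main__':
    gtd = GetTopicDis(src_corp_dir=["../cleanData/Hulth2003/Training/"],
                      lda_doc="../ldaData/corpus.dat",
                      docsuffix=".abstr")
    gtd.call_lda(topicnum=50, maxiter=1000)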
class FeatureGenerator:
    '''This class serves the supervised keyphrase extraction method,
    i.e., logistic regression for keyword extraction, after which the
    keywords are merged into keyphrases.
    Features: as we use POS tags to merge keywords when generating
    keyphrases, we don't use POS tags as features.
        1.TF; 2.DF; 3.POSITION.
    Note: all features should be normalized.
    '''
    def __init__(self, dir_text_file, text_suffix,\
            dir_feature_file, feature_suffix,\
            dir_manualkp_file, manualkp_suffix,\
            words_map_file):
        self.dir_text_file = dir_text_file
        self.text_suffix = text_suffix
        self.dir_feature_file = dir_feature_file
        self.feature_suffix = feature_suffix
        self.dir_manualkp_file = dir_manualkp_file
        self.manualkp_suffix = manualkp_suffix
        self.words_map_file = words_map_file
        self.featurenum = 3
        self.stemmer = PorterStemmer()
        self.wordmap = self.loadmap()
        self.doclist = self.getdoclist(self.substr_text,\
                dir_text_file)
        self.wordFilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanuallabels()
        corpfeature = self.mkdocfeatures()
        self.corpfeature = self.normalization(corpfeature,\
                method='minmax')

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        for doc in self.doclist:
            docwordlist = []
            doctaglist = []
            docwordset = set([])
            for line in open(doc):
                clean_words, tags = self.wordFilter.filterwords(\
                        line.strip('\r\n '))
                docwordlist += clean_words
                doctaglist += tags
            #docname = doc.split('/')[-1].split('.')[0]
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            for word in set(docwordlist):
                if word in self.wordmap:
                    wordid = self.wordmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf

    def getmanuallabels(self):
        '''segment each keyphrase into keywords
        '''
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,\
                self.dir_manualkp_file)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip('\r\n ').split(' '):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, \
                            len(word)-1)
                    if word not in keywordset:
                        keywordset.add(word)
            keywordset = map(lambda x: self.wordmap[x],\
                    keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # filter files by suffix
            candi_files = filter(substr_func, candi_files)
            candi_files = map(lambda x: subdir+x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        if candi_file.find(self.text_suffix) != -1:
            return True
        return False

    def substr_manualkp(self, candi_file):
        if candi_file.find(self.manualkp_suffix) != -1:
            return True
        return False

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def mkdocfeatures(self):
        '''Feature format: 1.tf; 2.df; 3.position
        '''
        corpfeature = {}
        for key in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[key]
            doctags = self.doctags[key]
            for i, word in enumerate(doctext):
                if word in self.wordmap and doctags[i] in POS:
                    if self.wordmap[word] not in docfeature:
                        docfeature[self.wordmap[word]] = \
                                [1, self.worddf[self.wordmap[word]], i]
                    else:
                        docfeature[self.wordmap[word]][0] += 1
            corpfeature[key] = docfeature
        return corpfeature

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split('/')[-1].split('.')[0]
            if dockey.find('Train') != -1:
                dir_feature_file = self.dir_feature_file[0]
            elif dockey.find('Validation') != -1:
                dir_feature_file = self.dir_feature_file[1]
            elif dockey.find('Test') != -1:
                dir_feature_file = self.dir_feature_file[2]
            else:
                dir_feature_file = self.dir_feature_file[0]
            output_feature_file = dir_feature_file\
                    + manuallabelkey + '.' + self.feature_suffix
            wfd = open(output_feature_file, 'w')
            for word in docfeature.keys():
                #print word
                #print self.manualkeywords[manuallabelkey]
                #raw_input()
                if word in self.manualkeywords[manuallabelkey]:
                    wfd.write('%s 1 %f %f %f\n' % (word,\
                            docfeature[word][0], docfeature[word][1],\
                            docfeature[word][2]))
                else:
                    wfd.write('%s 0 %f %f %f\n' % (word,\
                            docfeature[word][0], docfeature[word][1],\
                            docfeature[word][2]))
            wfd.close()

    def normalization(self, features, method):
        '''feature normalization:
        1.document frequency features are normalized over the whole corpus;
        2.word frequency and position are normalized within their
          corresponding document.
        '''
        if method == 'minmax':
            features = self.minmax(features)
        elif method == 'norm':
            features = self.norm(features)
        elif method == 'original':
            pass
        else:
            print 'Invalid method choice'
            sys.exit(0)
        return features

    def minmax(self, features):
        std_feature = {}
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        #maxdf = 3
        #mindf = 1
        for dockey in features.keys():
            docfeature = features[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            for word in docfeature.keys():
                docfeature[word][0] = 1.0*(docfeature[word][0]-mintf)\
                        /max(1, (maxtf-mintf))
                docfeature[word][1] = 1.0*(docfeature[word][1]-mindf)\
                        /(maxdf-mindf)
                docfeature[word][2] = 1.0*(docfeature[word][2]-minpos)\
                        /(maxpos-minpos)
            std_feature[dockey] = docfeature
        return std_feature

    def norm(self, features):
        meandf = np.mean(np.array(map(lambda x: x[1],\
                self.worddf.items())))
        stddf = np.std(np.array(map(lambda x: x[1],\
                self.worddf.items())))
        for dockey in features.keys():
            docfeature = features[dockey]
            meantf = np.mean(np.array(map(lambda x: x[1][0],\
                    docfeature.items())))
            stdtf = np.std(np.array(map(lambda x: x[1][0],\
                    docfeature.items())))
            meanpos = np.mean(np.array(map(lambda x: x[1][2],\
                    docfeature.items())))
            stdpos = np.std(np.array(map(lambda x: x[1][2],\
                    docfeature.items())))
            for word in docfeature.keys():
                docfeature[word][0] = (docfeature[word][0]-meantf)\
                        / stdtf
                docfeature[word][1] = (docfeature[word][1]-meandf)\
                        / stddf
                docfeature[word][2] = (docfeature[word][2]-meanpos)\
                        / stdpos
            features[dockey] = docfeature
        return features
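# Minimal usage sketch with illustrative paths; the constructor already loads
# the corpus, builds the tf/df/position features and min-max normalizes them,
# so only the output step remains. Note that feature_suffix is joined with a
# '.' internally, so it is passed without a leading dot.
if __name__ == '__main__':
    fg = FeatureGenerator(
            dir_text_file=["../cleanData/Hulth2003/Training/"],
            text_suffix=".abstr",
            dir_feature_file=["../features/Train/",
                              "../features/Validation/",
                              "../features/Test/"],
            feature_suffix="feature",
            dir_manualkp_file=["../cleanData/Hulth2003/Training/"],
            manualkp_suffix=".uncontr",
            words_map_file="../cleanData/Hulth2003/wordsmap.dict")
    fg.outputdocfeatures()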
class EvalResult:
    '''This class is used to evaluate the results of keyword
    extraction. Currently, I adopt Precision@K and F-score. In the
    keyword extraction task, we only consider words that occur in the
    text.
    '''
    def __init__(self, wordsmap_file, dir_keywords, dir_result, topk, \
            keywords_suffix, result_suffix, completekwnum_suffix):
        self.wordsmap_file = wordsmap_file
        self.dir_keywords = dir_keywords
        self.dir_result = dir_result
        self.keywords_suffix = keywords_suffix
        self.result_suffix = result_suffix
        self.topk = topk
        self.completekwnum_suffix = completekwnum_suffix
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.manualkeywords = self.getmanualkeywords()
        self.extractkeywords = self.getextractkeywords()

    def getdoclist(self, dir_file, filter_func):
        doclist = []
        candi_files = os.listdir(dir_file)
        # filter files by suffix
        candi_files = filter(filter_func, candi_files)
        candi_files = map(lambda x: dir_file+x, candi_files)
        doclist = doclist + candi_files
        return doclist

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def substr_keywords(self, candi_file):
        if candi_file.find(self.keywords_suffix) != -1:
            return True
        return False

    def substr_results(self, candi_file):
        if candi_file.find(self.result_suffix) != -1:
            return True
        return False

    def substr_kwnum(self, candi_file):
        if candi_file.find(self.completekwnum_suffix) != -1:
            return True
        return False

    def getmanualkeywords(self):
        '''Get the manual keywords that occur in the text.
        '''
        doclist = self.getdoclist(self.dir_keywords, self.substr_keywords)
        manuallabels = {}
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywords_set = set([])
            for line in open(doc):
                words = line.strip('\r\n ').split(' ')
                for word in words:
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word)-1)
                    if word not in self.wordsmap:
                        print 'Invalid keyword'
                        sys.exit(0)
                    word_id = self.wordsmap[word]
                    keywords_set.add(word_id)
            manuallabels[docname] = keywords_set
        return manuallabels

    def getextractkeywords(self):
        '''Get the extracted keywords according to the ranking value
        of the candidate keywords.
        '''
        doclist = self.getdoclist(self.dir_result, self.substr_results)
        extractkeywords = {}
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            tempwords = []
            for line in open(doc):
                biparts = line.strip('\n\r ').split(' ')
                tempwords.append([biparts[0], float(biparts[1])])
            tempwords = sorted(tempwords, key=lambda x: x[1], reverse=True)
            sortedwords = map(lambda x: x[0], tempwords)
            dockeywords = set(sortedwords[0:self.topk])
            extractkeywords[docname] = dockeywords
        return extractkeywords

    def evaluation(self, eval_choice=None):
        if eval_choice == 'F-score':
            precision, recall, f_score = self.eval_fscore()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
        elif eval_choice == 'Bpref':
            bpref = self.eval_bpref()
            print 'Bpref: %f\n' % bpref
        elif eval_choice == 'MRR':
            mrr = self.eval_mrr()
            print 'Mrr: %f\n' % mrr
        else:
            precision, recall, f_score = self.eval_fscore()
            bpref = self.eval_bpref()
            mrr = self.eval_mrr()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
            print 'Bpref: %f\n' % bpref
            print 'Mrr: %f\n' % mrr

    # Using F-score to evaluate
    def eval_fscore(self):
        total_accnum = 0
        ext_accnum = 0
        ext_num = 0
        for doc in self.manualkeywords.keys():
            manual_kw = self.manualkeywords[doc]
            ext_kw = self.extractkeywords[doc]
            total_accnum += len(manual_kw)
            ext_num += len(ext_kw)
            ext_accnum += len(manual_kw & ext_kw)
        print 'Manually annotated keyphrases: %d' % total_accnum
        print 'Extracted total keyphrases: %d' % ext_num
        print 'Extracted accurate keyphrases: %d' % ext_accnum
        precision = ext_accnum*1.0/ext_num
        recall = ext_accnum*1.0/total_accnum
        fscore = 2*precision*recall/(precision+recall)
        return precision, recall, fscore

    # Using MRR to evaluate
    def eval_mrr(self):
        pass

    # Using Bpref to evaluate
    def eval_bpref(self):
        pass
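# Minimal usage sketch with illustrative paths. The constructor loads the word
# map, the manual keywords and the ranked extraction results, so a single
# evaluation() call reports Precision@topk, Recall and F-score.
if __name__ == '__main__':
    ev = EvalResult(wordsmap_file="../cleanData/Hulth2003/wordsmap.dict",
                    dir_keywords="../cleanData/Hulth2003/Test/",
                    dir_result="../results/Test/",
                    topk=10,
                    keywords_suffix=".uncontr",
                    result_suffix=".rank",
                    completekwnum_suffix=".kwnum")
    ev.evaluation('F-score')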
class GraphGenerator:
    '''This class mainly constructs the graph for the basic random
    walk method.
    Method: graph construction based on a sliding-window strategy.
    Output: two files --> 1.graph file for each document;
                          2.word id in document to word map dictionary.
    Currently, we implement two graph representations, i.e., dense
    graph representation and sparse graph representation.
    '''
    def __init__(self, dir_text_file, dir_output_file,\
            windowsize, docsuffix, wordmap_file):
        self.mapfile_suffix = ".idmap"
        self.graphfile_suffix = ".graph"
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)
        self.dir_text_file = dir_text_file
        self.dir_output_file = dir_output_file
        self.windowsize = windowsize
        self.docsuffix = docsuffix
        self.wordmap_file = wordmap_file
        self.doclist = []
        self.getdoclist()
        self.corp_wordmap = {}
        self.readwordmap()

    def getdoclist(self):
        for rootdir in self.dir_text_file:
            candi_files = os.listdir(rootdir)
            # filter files by suffix if one is given
            if self.docsuffix:
                candi_files = filter(self.substr, candi_files)
            candi_files = map(lambda x: rootdir+x, candi_files)
            self.doclist = self.doclist + candi_files

    def substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False

    def readwordmap(self):
        for line in open(self.wordmap_file):
            biparts = line.strip("\r\n ").split(" ")
            self.corp_wordmap[biparts[0]] = int(biparts[1])

    # filter words based on the stopword list and a character rule
    def filterwords(self, textline):
        stemmed_words = []
        saved_words = []
        words = textline.split(" ")
        for word in words:
            if word == " ":
                continue
            biparts = word.split("_")
            # word processing (stopword, stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            stemmed_words.append(biparts[0])
            # guard against tokens without a "_TAG" part
            if len(biparts) == 2 and biparts[1] in POS:
                if not self.stopWords.is_stopword(biparts[0])\
                        and self.pattern.match(biparts[0]):
                    saved_words.append(biparts[0])
            # ============================================
        return stemmed_words, saved_words

    # graph construction
    # strategy: 1.filter words according to POS tags;
    #           2.construct graph based on sliding window.
    def construct(self):
        for doc in self.doclist:
            doc_prefix = doc.split('/')[-1].split('.')[0]
            output_graphfile = self.dir_output_file + doc_prefix \
                    + self.graphfile_suffix
            output_mapfile = self.dir_output_file + doc_prefix \
                    + self.mapfile_suffix
            cleaned_wordslist = []
            stemmed_wordslist = []
            for line in open(doc):
                line = line.strip('\n\r ')
                stemmed_words, cleaned_words = self.filterwords(line)
                cleaned_wordslist = cleaned_wordslist + cleaned_words
                stemmed_wordslist = stemmed_wordslist + stemmed_words
            wordsmap_indoc = self.numword_indoc(cleaned_wordslist)
            #print wordsmap_indoc
            pairids = self.mapwordspair(wordsmap_indoc)
            pairids = sorted(pairids, key=lambda x: x[0])
            dense_graph = self.slidingwindow(stemmed_wordslist, wordsmap_indoc)
            self.output_graph('dense', dense_graph, output_graphfile)
            self.output_graph('sparse', dense_graph, output_graphfile)
            self.output_map(pairids, output_mapfile)

    def mapwordspair(self, ids_indoc):
        pairids = []
        for key in ids_indoc.keys():
            pairids.append([ids_indoc[key], self.corp_wordmap[key]])
        return pairids

    def slidingwindow(self, stemmed_wordslist, wordsmap_indoc):
        dense_graph = np.array([0.0 for i in range(len(wordsmap_indoc)\
                *len(wordsmap_indoc))])
        dense_graph = dense_graph.reshape(len(wordsmap_indoc),
                len(wordsmap_indoc))
        for i, word in enumerate(stemmed_wordslist):
            if stemmed_wordslist[i] in wordsmap_indoc:
                sliding_text = stemmed_wordslist[max(0, i-self.windowsize):\
                        min(len(stemmed_wordslist), i+self.windowsize+1)]
                for j in range(len(sliding_text)):
                    if stemmed_wordslist[i] == sliding_text[j]:
                        continue
                    if sliding_text[j] in wordsmap_indoc:
                        dense_graph[wordsmap_indoc[stemmed_wordslist[i]]-1,\
                                wordsmap_indoc[sliding_text[j]]-1] += 1
        return dense_graph

    def numword_indoc(self, wordslist_indoc):
        wordsmap_indoc = {}
        word_id = 1
        for word in wordslist_indoc:
            if word not in wordsmap_indoc:
                wordsmap_indoc[word] = word_id
                word_id += 1
        return wordsmap_indoc

    def output_graph(self, choice, graphdata, graphfile):
        #print graphfile
        if choice == "dense":
            wfd = open(graphfile+'.dense', 'w')
            for i in range(len(graphdata)):
                wfd.write("%s\n" % ' '.join(map(lambda x: str(x), graphdata[i])))
        elif choice == "sparse":
            wfd = open(graphfile+'.sparse', 'w')
            for i in range(len(graphdata)):
                for j in range(len(graphdata)):
                    if graphdata[i, j] != 0:
                        wfd.write("%d %d %d\n" % (i, j, graphdata[i, j]))
        wfd.close()

    def output_map(self, mapdata, mapfile):
        #print mapfile
        wfd = open(mapfile, 'w')
        for i in range(len(mapdata)):
            wfd.write("%d\n" % mapdata[i][1])
        wfd.close()
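# Minimal usage sketch with illustrative paths. For each ".abstr" document it
# writes "<doc>.graph.dense", "<doc>.graph.sparse" and "<doc>.idmap" into
# dir_output_file, counting co-occurrences of POS-filtered words within a
# symmetric window of +/- windowsize tokens.
if __name__ == '__main__':
    gg = GraphGenerator(dir_text_file=["../cleanData/Hulth2003/Test/"],
                        dir_output_file="../graphData/Test/",
                        windowsize=5,
                        docsuffix=".abstr",
                        wordmap_file="../cleanData/Hulth2003/wordsmap.dict")
    gg.construct()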
class NodeFeatureGenerator:
    """This class provides a framework that makes feature addition and
    deletion easy. The features listed below are common features used
    in keyword/keyphrase extraction tasks.
    Features: the first three features listed below must be generated.
        1.TF; 2.DF; 3.POSITION; 4.TF-IDF; 5.lenText; 6.POS-Tagging;
    Note: all features should be normalized to 0-1.
    """
    def __init__(
        self,
        posset,
        feature_bittag,
        dir_text,
        text_suffix,
        dir_feature,
        feature_suffix,
        dir_manualkp,
        manualkp_suffix,
        wordsmap_file,
        featurenum,
        nonposfeature,
    ):
        # receiving input
        self.posset = posset
        self.feature_bittag = feature_bittag
        self.dir_text = dir_text
        self.text_suffix = text_suffix
        self.dir_feature = dir_feature
        self.feature_suffix = feature_suffix
        self.dir_manualkp = dir_manualkp
        self.manualkp_suffix = manualkp_suffix
        self.wordsmap_file = wordsmap_file
        self.featurenum = featurenum
        self.nonposfeature = nonposfeature
        # inner variable setting
        self.poslist = list(posset)
        self.featuretype = sum(feature_bittag)
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.wordfilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanualkeywords()

    def generatefeature(self, norm_method):
        # running
        self.corpfeature = self.mkfeatures()
        self.normalization(norm_method)
        self.outputdocfeatures()

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip("\n").split(" ")
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        doclist = self.getdoclist(self.substr_text, self.dir_text)
        for doc in doclist:
            docwordlist = []
            doctaglist = []
            for line in open(doc):
                clean_words, tags = self.wordfilter.filterwords(line.strip("\r\n "))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            # compute document frequency for words
            for word in set(docwordlist):
                if word in self.wordsmap:
                    wordid = self.wordsmap[word]
                    if wordid in worddf:
                        worddf[wordid] += 1
                    else:
                        worddf[wordid] = 1
        return doctext, doctags, worddf

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # filter files by suffix
            candi_files = filter(substr_func, candi_files)
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        if candi_file.find(self.text_suffix) != -1:
            return True
        return False

    def substr_manualkp(self, candi_file):
        if candi_file.find(self.manualkp_suffix) != -1:
            return True
        return False

    def getmanualkeywords(self):
        """segment each keyphrase into keywords
        """
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp, self.dir_manualkp)
        for doc in doclist:
            docname = doc.split("/")[-1].split(".")[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip("\r\n ").split(" "):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    if word not in keywordset:
                        keywordset.add(word)
            keywordset = map(lambda x: self.wordsmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def mkfeatures(self):
        corpfeature = {}
        for dockey in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[dockey]
            doctags = self.doctags[dockey]
            for i, word in enumerate(doctext):
                if word in self.wordsmap and doctags[i] in self.posset:
                    if self.wordsmap[word] not in docfeature:
                        # class
                        # wordfeature = Feature()
                        # wordfeature.tf = 1
                        # wordfeature.df = self.worddf[word]
                        # wordfeature.position = i
                        # list
                        wordfeature = [0 for j in range(self.featurenum)]
                        wordfeature[0] = 1
                        wordfeature[1] = self.worddf[self.wordsmap[word]]
                        wordfeature[2] = i
                        # word's length feature
                        if self.feature_bittag[4] == 1:
                            # wordfeature.lentext = len(word)
                            wordfeature[4] = len(word)
                        # word's pos feature
                        if self.feature_bittag[5] == 1:
                            # list.index raises ValueError on a miss, so guard
                            # the lookup instead of testing a negative index
                            try:
                                posidx = self.poslist.index(doctags[i])
                            except ValueError:
                                print "Invalid pos tags"
                                sys.exit(1)
                            wordfeature[self.nonposfeature + posidx] = 1
                        docfeature[self.wordsmap[word]] = wordfeature
                    else:
                        docfeature[self.wordsmap[word]][0] += 1
            # word's tfidf feature
            if self.feature_bittag[3] == 1:
                for wordkey in docfeature.keys():
                    docfeature[wordkey][3] = self.comptfidf(
                        docfeature[wordkey][0], docfeature[wordkey][1],
                        len(self.doctext.keys())
                    )
            corpfeature[dockey] = docfeature
        return corpfeature

    def comptfidf(self, tf, df, docnum):
        return tf * math.log((docnum * 1.0) / df)

    def normalization(self, method):
        """feature normalization:
        1.document frequency features are normalized over the whole corpus;
        2.word frequency and position are normalized within their
          corresponding document.
        """
        if method == "minmax":
            self.minmax()
        elif method == "norm":
            self.norm()
        elif method == "original":
            pass
        else:
            print "Invalid method choice"
            sys.exit(0)

    def minmax(self):
        std_feature = {}
        # words' df feature
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            if self.feature_bittag[3] == 1:
                mintfidf = min(map(lambda x: x[1][3], docfeature.items()))
                maxtfidf = max(map(lambda x: x[1][3], docfeature.items()))
            if self.feature_bittag[4] == 1:
                minlength = min(map(lambda x: x[1][4], docfeature.items()))
                maxlength = max(map(lambda x: x[1][4], docfeature.items()))
            for word in docfeature.keys():
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) / max(1, (maxtf - mintf))
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) / (maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) / (maxpos - minpos)
                if self.feature_bittag[3] == 1:
                    docfeature[word][3] = 1.0 * (docfeature[word][3] - mintfidf) / (maxtfidf - mintfidf)
                if self.feature_bittag[4] == 1:
                    docfeature[word][4] = 1.0 * (docfeature[word][4] - minlength) / (maxlength - minlength)
            std_feature[dockey] = docfeature
        self.corpfeature = std_feature

    def norm(self):
        pass

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split("/")[-1].split(".")[0]
            if dockey.find("Train") != -1:
                dir_feature = self.dir_feature[0]
            elif dockey.find("Validation") != -1:
                dir_feature = self.dir_feature[1]
            elif dockey.find("Test") != -1:
                dir_feature = self.dir_feature[2]
            else:
                dir_feature = self.dir_feature[0]
            output_feature_file = dir_feature + manuallabelkey + "." \
                    + self.feature_suffix
            wfd = open(output_feature_file, "w")
            for word in docfeature.keys():
                # print word
                # print self.manualkeywords[manuallabelkey]
                # raw_input()
                if word in self.manualkeywords[manuallabelkey]:
                    wfd.write("%s 1 %f %f %f" % (word, docfeature[word][0],
                            docfeature[word][1], docfeature[word][2]))
                else:
                    wfd.write("%s 0 %f %f %f" % (word, docfeature[word][0],
                            docfeature[word][1], docfeature[word][2]))
                if self.feature_bittag[3] == 1:
                    wfd.write(" %f" % docfeature[word][3])
                if self.feature_bittag[4] == 1:
                    wfd.write(" %f" % docfeature[word][4])
                if self.feature_bittag[5] == 1:
                    for i in range(5, self.featurenum):
                        wfd.write(" %d" % docfeature[word][i])
                wfd.write("\n")
            wfd.close()
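# Minimal usage sketch with illustrative paths and parameters. feature_bittag
# is a 0/1 mask over [TF, DF, POSITION, TF-IDF, lenText, POS]; with the POS
# block enabled, the feature vector holds nonposfeature non-POS slots followed
# by one indicator slot per tag in posset, so featurenum should equal
# nonposfeature + len(posset).
if __name__ == '__main__':
    posset = set(["NN", "NNS", "NNP", "NNPS", "JJ"])
    nfg = NodeFeatureGenerator(
            posset=posset,
            feature_bittag=[1, 1, 1, 1, 1, 1],
            dir_text=["../cleanData/Hulth2003/Training/"],
            text_suffix=".abstr",
            dir_feature=["../features/Train/",
                         "../features/Validation/",
                         "../features/Test/"],
            feature_suffix="node.feature",
            dir_manualkp=["../cleanData/Hulth2003/Training/"],
            manualkp_suffix=".uncontr",
            wordsmap_file="../cleanData/Hulth2003/wordsmap.dict",
            featurenum=5 + len(posset),
            nonposfeature=5)
    nfg.generatefeature("minmax")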
class EvalResult:
    '''This class is responsible for evaluating the extraction
    results.
    '''
    def __init__(self, words_map_file, dir_keywords_file,\
            dir_results_file, dir_wholetext_file,\
            keywords_suffix, results_suffix, wholetext_suffix,\
            kwnum_suffix, topk):
        self.words_map_file = words_map_file
        self.dir_keywords_file = dir_keywords_file
        self.dir_results_file = dir_results_file
        self.dir_wholetext_file = dir_wholetext_file
        self.keywords_suffix = keywords_suffix
        self.results_suffix = results_suffix
        self.wholetext_suffix = wholetext_suffix
        self.kwnum_suffix = kwnum_suffix
        self.topk = topk
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.doclist = self.getdoclist(self.dir_wholetext_file,\
                self.substr_wholetext)
        self.corpkeyphrase = {}
        self.getkeyphrase()
        self.manualkwnum = {}
        self.manuallabels = {}
        self.getmanuallabels()

    def getdoclist(self, dir_file, filter_func):
        doclist = []
        candi_files = os.listdir(dir_file)
        # filter files by suffix
        candi_files = filter(filter_func, candi_files)
        candi_files = map(lambda x: dir_file+x, candi_files)
        doclist = doclist + candi_files
        return doclist

    def substr_keywords(self, candi_file):
        if candi_file.find(self.keywords_suffix) != -1:
            return True
        return False

    def substr_results(self, candi_file):
        if candi_file.find(self.results_suffix) != -1:
            return True
        return False

    def substr_wholetext(self, candi_file):
        if candi_file.find(self.wholetext_suffix) != -1:
            return True
        return False

    def substr_kwnum(self, candi_file):
        if candi_file.find(self.kwnum_suffix) != -1:
            return True
        return False

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def getkeyphrase(self):
        for doc in self.doclist:
            results_file = self.dir_results_file \
                    + doc.split('/')[-1].split('.')[0]\
                    + '.' + self.results_suffix
            #print results_file
            #print doc
            docwordlist = []
            docwordtags = []
            for line in open(doc):
                line = line.strip("\n\r ")
                save_words, save_postags = self.filterwords(line)
                docwordlist = docwordlist + save_words
                docwordtags = docwordtags + save_postags
            dockeywords, sortedwords, wordsvalue = \
                    self.getkeywords(results_file)
            #print sortedwords
            kp_num, dockeyphrase = self.mergekeywords(dockeywords,\
                    docwordlist, wordsvalue, docwordtags)
            #print dockeyphrase
            #raw_input()
            dockeyphrase = sorted(dockeyphrase.items(), \
                    key=lambda x: x[1], reverse=True)
            dockeyphrase = map(lambda x: x[0], dockeyphrase)
            # the slice bound evaluates to len(dockeyphrase)+2, so all
            # ranked keyphrases are currently kept
            self.corpkeyphrase[doc.split('/')[-1].split('.')[0]] = \
                    set(dockeyphrase[0:int(10.0*len(dockeyphrase)/10+1)+1])

    def getmanuallabels(self):
        doclist = self.getdoclist(self.dir_keywords_file,\
                self.substr_kwnum)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            manual_kwnum = int(open(doc).readline().strip('\n'))
            self.manualkwnum[docname] = manual_kwnum
        doclist = self.getdoclist(self.dir_keywords_file,\
                self.substr_keywords)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keyphrase_set = set([])
            for line in open(doc):
                words_id = []
                words = line.strip('\r\n ').split(' ')
                for word in words:
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, \
                            len(word)-1)
                    if word not in self.wordsmap:
                        print 'Invalid keyword'
                        sys.exit(0)
                    word_id = self.wordsmap[word]
                    words_id.append(word_id)
                keyphrase = '_'.join(words_id)
                keyphrase_set.add(keyphrase)
            self.manuallabels[docname] = keyphrase_set

    def mergekeywords(self, dockeywords, docwordlist, wordsvalue,\
            docwordtags):
        dockeyphrase = {}
        kp_num = 0
        kp_tag = False
        kp_start = 0
        for i, word in enumerate(docwordlist):
            if not kp_tag:
                if word in dockeywords:
                    kp_start = i
                    kp_tag = True
            else:
                if word not in dockeywords or i == len(docwordlist)-1:
                    kp_end = i if word not in dockeywords else i+1
                    kp_tag = False
                    keywords_segment = docwordlist[kp_start:kp_end]
                    keywords_postags = docwordtags[kp_start:kp_end]
                    keywords_segment = self.postag_verify(keywords_segment,\
                            keywords_postags)
                    if keywords_segment:
                        keyphrase = '_'.join(keywords_segment)
                        keyphrase_val = self.getkpvalue(keywords_segment,\
                                wordsvalue)
                        if keyphrase not in dockeyphrase:
                            dockeyphrase[keyphrase] = keyphrase_val
                            kp_num += 1
        return kp_num, dockeyphrase

    def getkpvalue(self, keywords_segment, wordsvalue):
        kpvalue = 0.0
        for keyword in keywords_segment:
            kpvalue += wordsvalue[keyword]
        return kpvalue

    def getkeywords(self, keywords_file):
        tempwords = []
        for line in open(keywords_file):
            biparts = line.strip('\n\r ').split(' ')
            tempwords.append([biparts[0], float(biparts[1])])
        tempwords = sorted(tempwords, key=lambda x: x[1],\
                reverse=True)
        #print tempwords
        sortedwords = map(lambda x: x[0], tempwords)
        #print sortedwords
        #raw_input()
        wordsvalue = dict(tempwords)
        wordsnum = len(sortedwords)
        # currently every ranked word is treated as a candidate keyword
        keywordsnum = int(1.0*wordsnum)
        dockeywords = set(sortedwords[0:keywordsnum])
        return dockeywords, sortedwords, wordsvalue

    def filterwords(self, textline):
        save_words = []
        save_postags = []
        words = textline.split(" ")
        for word in words:
            biparts = word.split("_")
            if len(biparts) != 2:
                print 'Invalid word occurrence.'
                sys.exit(0)
            # word processing (stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            # ============================================
            if biparts[0] not in self.wordsmap:
                save_words.append('-1')
                save_postags.append(biparts[1])
            else:
                save_words.append(self.wordsmap[biparts[0]])
                save_postags.append(biparts[1])
        return save_words, save_postags

    # filter candidate keyphrases with invalid postag sequences
    def postag_verify(self, save_words, save_postags):
        #if len(save_words) == 1 and save_postags[0] in ADJ_POS:
        #    return None
        return save_words

    '''def postag_verify(self, save_words, save_postags):
        state = 0
        end_idx = -1
        for i, postag in enumerate(save_postags):
            if state == 0:
                if postag in ADJ_POS:
                    continue
                elif postag in NOUN_POS:
                    state = 1
            elif state == 1:
                if postag in ADJ_POS:
                    end_idx = i-1
                elif postag in NOUN_POS:
                    end_idx = i
                continue
        if end_idx == -1:
            return None
        else:
            return save_words[0:end_idx+1]
    '''

    def evaluation(self, eval_choice=None):
        if eval_choice == 'F-score':
            precision, recall, f_score = self.eval_fscore()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
        elif eval_choice == 'Bpref':
            bpref = self.eval_bpref()
            print 'Bpref: %f\n' % bpref
        elif eval_choice == 'MRR':
            mrr = self.eval_mrr()
            print 'Mrr: %f\n' % mrr
        else:
            precision, recall, f_score = self.eval_fscore()
            bpref = self.eval_bpref()
            mrr = self.eval_mrr()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
            print 'Bpref: %f\n' % bpref
            print 'Mrr: %f\n' % mrr

    # Using F-score to evaluate
    def eval_fscore(self):
        total_accnum = 0
        ext_accnum = 0
        ext_num = 0
        for doc in self.corpkeyphrase.keys():
            ext_kp = self.corpkeyphrase[doc]
            manual_kp = self.manuallabels[doc]
            #print ext_kp
            #print manual_kp
            #raw_input()
            #total_accnum += len(manual_kp)
            total_accnum += self.manualkwnum[doc]
            ext_num += len(ext_kp)
            ext_accnum += len(manual_kp & ext_kp)
        print 'Manually annotated keyphrases: %d' % total_accnum
        print 'Extracted total keyphrases: %d' % ext_num
        print 'Extracted accurate keyphrases: %d' % ext_accnum
        precision = ext_accnum*1.0/ext_num
        recall = ext_accnum*1.0/total_accnum
        fscore = 2*precision*recall/(precision+recall)
        return precision, recall, fscore

    # Using MRR to evaluate
    def eval_mrr(self):
        pass

    # Using Bpref to evaluate
    def eval_bpref(self):
        pass
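# Minimal usage sketch with illustrative paths. Keyphrases are rebuilt by
# merging adjacent extracted keywords in the whole text, then compared against
# the manual labels; results_suffix is joined with a '.' internally, so it is
# passed without a leading dot.
if __name__ == '__main__':
    ev = EvalResult(words_map_file="../cleanData/Hulth2003/wordsmap.dict",
                    dir_keywords_file="../cleanData/Hulth2003/Test/",
                    dir_results_file="../results/Test/",
                    dir_wholetext_file="../cleanData/Hulth2003/Test/",
                    keywords_suffix=".uncontr",
                    results_suffix="rank",
                    wholetext_suffix=".abstr",
                    kwnum_suffix=".kwnum",
                    topk=10)
    ev.evaluation('F-score')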
class WordmapGenerator:
    '''This class generates the word map for the specified corpus.
    Because of the keyphrase extraction task, we need to specify the
    POS sets whose words will be saved. It also needs to be general
    enough that all keyphrase extraction methods can utilize it. As we
    will use POS tags, we construct the word map from the cleaned
    text. Note that some keyphrases do not exist in the abstract, so
    we also need to index them in the dictionary.
    '''
    def __init__(self, dir_text_file, output_wordmap_file, \
            pos_sets, reg_exp, docsuffix, kpdocsuffix):
        self.dir_text_file = dir_text_file
        self.output_wordmap_file = output_wordmap_file
        self.pos_sets = pos_sets
        self.docsuffix = docsuffix
        self.kpdocsuffix = kpdocsuffix
        self.stemmer = PorterStemmer()
        self.dict_words = []
        self.pattern = re.compile(reg_exp)
        self.doclist = []
        self.doclist = self.getdoclist(1)

    def getdoclist(self, choice):
        temp_doclist = []
        for rootdir in self.dir_text_file:
            candi_files = os.listdir(rootdir)
            # filter files by suffix
            if choice == 1:
                candi_files = filter(self.doc_substr, candi_files)
            elif choice == 2:
                candi_files = filter(self.kpdoc_substr, candi_files)
            candi_files = map(lambda x: rootdir+x, candi_files)
            temp_doclist = temp_doclist + candi_files
        return temp_doclist

    def doc_substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False

    def kpdoc_substr(self, candi_file):
        if candi_file.find(self.kpdocsuffix) != -1:
            return True
        return False

    def genwordmap(self):
        temp_words = set([])
        for doc in self.doclist:
            for line in open(doc):
                words = line.strip("\n\r ").split(' ')
                for word in words:
                    biparts = word.split('_')
                    # word processing
                    # ================
                    biparts[0] = biparts[0].lower()
                    biparts[0] = self.stemmer.stem(biparts[0], 0,\
                            len(biparts[0])-1)
                    if len(biparts) == 2 and biparts[1] in self.pos_sets\
                            and self.pattern.match(biparts[0]):
                        temp_words.add(biparts[0])
                    # ================
        self.doclist = self.getdoclist(2)
        temp_keywords = set([])
        miss_keywords = 0
        for doc in self.doclist:
            for line in open(doc):
                textunits = line.strip('\n\r ').split(" ")
                for textunit in textunits:
                    # word processing
                    # ================
                    textunit = textunit.lower()
                    textunit = self.stemmer.stem(textunit, 0, len(textunit)-1)
                    # ================
                    temp_keywords.add(textunit)
                    if not textunit in temp_words:
                        temp_words.add(textunit)
                        miss_keywords += 1
        self.dict_words = sorted(temp_words)
        print "Number of unique keywords: %d, " % len(temp_keywords)
        print "number of missing keyphrase words added: %d.\n" % miss_keywords

    def output_wordmap(self):
        wfd = open(self.output_wordmap_file, 'w')
        for i, word in enumerate(self.dict_words):
            wfd.write("%s %d\n" % (word, i+1))
        wfd.close()
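# Minimal usage sketch with illustrative paths and an assumed word regexp.
# Pass 1 indexes POS-filtered words from the tagged abstracts; pass 2 adds any
# manual keyphrase words missing from the text, so every label can be mapped
# to an id.
if __name__ == '__main__':
    wg = WordmapGenerator(
            dir_text_file=["../cleanData/Hulth2003/Training/",
                           "../cleanData/Hulth2003/Test/"],
            output_wordmap_file="../cleanData/Hulth2003/wordsmap.dict",
            pos_sets=set(["NN", "NNS", "NNP", "NNPS", "JJ"]),
            reg_exp=r'^[a-zA-Z][a-zA-Z-]*$',
            docsuffix=".abstr",
            kpdocsuffix=".uncontr")
    wg.genwordmap()
    wg.output_wordmap()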