def __init__(
    self,
    posset,
    feature_bittag,
    dir_text,
    text_suffix,
    dir_feature,
    feature_suffix,
    dir_manualkp,
    manualkp_suffix,
    wordsmap_file,
    featurenum,
    nonposfeature,
):
    """Store the raw configuration, then build the derived state
    (stemmer, word map, tokenized documents, manual keyword labels).
    """
    # Store every incoming argument verbatim on the instance.
    for attrname, value in (
        ("posset", posset),
        ("feature_bittag", feature_bittag),
        ("dir_text", dir_text),
        ("text_suffix", text_suffix),
        ("dir_feature", dir_feature),
        ("feature_suffix", feature_suffix),
        ("dir_manualkp", dir_manualkp),
        ("manualkp_suffix", manualkp_suffix),
        ("wordsmap_file", wordsmap_file),
        ("featurenum", featurenum),
        ("nonposfeature", nonposfeature),
    ):
        setattr(self, attrname, value)
    # Derived state.  Order matters: loadmap()/WordFilter() must exist
    # before loaddoctext() runs, and loaddoctext() before
    # getmanualkeywords().
    self.poslist = list(posset)
    self.featuretype = sum(feature_bittag)
    self.stemmer = PorterStemmer()
    self.wordsmap = self.loadmap()
    self.wordfilter = WordFilter()
    self.doctext, self.doctags, self.worddf = self.loaddoctext()
    self.manualkeywords = self.getmanualkeywords()
def __init__(self, dir_text_file, text_suffix,
             dir_feature_file, feature_suffix,
             dir_manualkp_file, manualkp_suffix,
             words_map_file):
    """Remember the input/output locations and immediately build the
    min-max-normalized feature table for the whole corpus.
    """
    # Raw configuration.
    self.dir_text_file = dir_text_file
    self.text_suffix = text_suffix
    self.dir_feature_file = dir_feature_file
    self.feature_suffix = feature_suffix
    self.dir_manualkp_file = dir_manualkp_file
    self.manualkp_suffix = manualkp_suffix
    self.words_map_file = words_map_file
    # Three base features: tf, df, position.
    self.featurenum = 3
    # Helpers and derived tables (initialization order matters:
    # loadmap/getdoclist/WordFilter feed loaddoctext).
    self.stemmer = PorterStemmer()
    self.wordmap = self.loadmap()
    self.doclist = self.getdoclist(self.substr_text, dir_text_file)
    self.wordFilter = WordFilter()
    self.doctext, self.doctags, self.worddf = self.loaddoctext()
    self.manualkeywords = self.getmanuallabels()
    self.corpfeature = self.normalization(self.mkdocfeatures(),
                                          method='minmax')
class FeatureGenerator():
    '''This class serves for the supervised keyphrase extraction method,
    i.e., logistic regression for keyword extraction and then merge
    keywords to keyphrases.
    Features: as we use "POS tags" to merge keywords when generating
    keyphrases, we don't use "POS" tags as features,
    1.TF; 2.DF; 3.POSITION.
    Note: all features should be normalized.
    '''

    def __init__(self, dir_text_file, text_suffix,
                 dir_feature_file, feature_suffix,
                 dir_manualkp_file, manualkp_suffix,
                 words_map_file):
        self.dir_text_file = dir_text_file
        self.text_suffix = text_suffix
        self.dir_feature_file = dir_feature_file
        self.feature_suffix = feature_suffix
        self.dir_manualkp_file = dir_manualkp_file
        self.manualkp_suffix = manualkp_suffix
        self.words_map_file = words_map_file
        self.featurenum = 3
        # order matters: loadmap/getdoclist/WordFilter feed loaddoctext
        self.stemmer = PorterStemmer()
        self.wordmap = self.loadmap()
        self.doclist = self.getdoclist(self.substr_text, dir_text_file)
        self.wordFilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanuallabels()
        corpfeature = self.mkdocfeatures()
        self.corpfeature = self.normalization(corpfeature,
                                              method='minmax')

    def loaddoctext(self):
        '''Tokenize every document in self.doclist.

        Returns (doctext, doctags, worddf): cleaned word list per doc,
        tag list per doc, and document frequency per word id.
        '''
        doctext = {}
        doctags = {}
        worddf = {}
        for doc in self.doclist:
            docwordlist = []
            doctaglist = []
            docwordset = set()
            # 'with' guarantees the handle is closed (it leaked before)
            with open(doc) as fd:
                for line in fd:
                    clean_words, tags = self.wordFilter.filterwords(
                        line.strip('\r\n '))
                    docwordlist += clean_words
                    doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            # document frequency: count each word id at most once per
            # document, even if several surface words share an id
            for word in set(docwordlist):
                if word in self.wordmap:
                    wordid = self.wordmap[word]
                    if wordid not in docwordset:
                        worddf[wordid] = worddf.get(wordid, 0) + 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf

    def getmanuallabels(self):
        '''segment each keyphrase into keywords
        '''
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,
                                  self.dir_manualkp_file)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywordset = set()
            with open(doc) as fd:
                for line in fd:
                    for word in line.strip('\r\n ').split(' '):
                        word = word.lower()
                        word = self.stemmer.stem(word, 0, len(word) - 1)
                        keywordset.add(word)
            # translate stemmed keywords into word ids; a keyword absent
            # from the map raises KeyError, exactly as the original
            # map()-based version did once consumed
            manualkeywords[docname] = set(self.wordmap[w]
                                          for w in keywordset)
        return manualkeywords

    def getdoclist(self, substr_func, dir_file):
        '''Return subdir-prefixed names of all files under each
        directory in dir_file that substr_func accepts.
        '''
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # list comprehension instead of filter()/map(): on Python 3
            # those are lazy objects and 'list + map-object' raises
            doclist += [subdir + name for name in candi_files
                        if substr_func(name)]
        return doclist

    def substr_text(self, candi_file):
        # True when the text suffix occurs anywhere in the file name
        return candi_file.find(self.text_suffix) != -1

    def substr_manualkp(self, candi_file):
        # True when the manual-keyphrase suffix occurs in the file name
        return candi_file.find(self.manualkp_suffix) != -1

    def loadmap(self):
        '''Load the word->id map: one space-separated pair per line.'''
        wordsmap = {}
        with open(self.words_map_file) as fd:
            for line in fd:
                biparts = line.strip('\n').split(' ')
                wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def mkdocfeatures(self):
        '''Feature format: 1.tf; 2.df; 3.position
        '''
        corpfeature = {}
        for key in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[key]
            doctags = self.doctags[key]
            for i, word in enumerate(doctext):
                # NOTE(review): POS is a module-level collection of
                # accepted tags defined elsewhere in this module
                if word in self.wordmap and doctags[i] in POS:
                    wordid = self.wordmap[word]
                    if wordid not in docfeature:
                        # [tf, df, first position]
                        docfeature[wordid] = [1, self.worddf[wordid], i]
                    else:
                        docfeature[wordid][0] += 1
            corpfeature[key] = docfeature
        return corpfeature

    def outputdocfeatures(self):
        '''Write one file per document; line format:
        "wordid label tf df position" (label 1 = manual keyword).
        '''
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split('/')[-1].split('.')[0]
            # route the output into the train/validation/test directory
            if dockey.find('Train') != -1:
                dir_feature_file = self.dir_feature_file[0]
            elif dockey.find('Validation') != -1:
                dir_feature_file = self.dir_feature_file[1]
            elif dockey.find('Test') != -1:
                dir_feature_file = self.dir_feature_file[2]
            else:
                dir_feature_file = self.dir_feature_file[0]
            output_feature_file = dir_feature_file \
                + manuallabelkey + '.' + self.feature_suffix
            with open(output_feature_file, 'w') as wfd:
                for word in docfeature.keys():
                    # '%d' of 0/1 emits the same bytes as the two
                    # hard-coded format strings it replaces
                    label = 1 if word in self.manualkeywords[manuallabelkey] \
                        else 0
                    wfd.write('%s %d %f %f %f\n' % (word, label,
                              docfeature[word][0], docfeature[word][1],
                              docfeature[word][2]))

    def normalization(self, features, method):
        '''
        feature normalization:
        1.document frequency features are normalized in the whole
        corpus;
        2.words frequency and position are normalized in their
        corresponding document.
        '''
        if method == 'minmax':
            features = self.minmax(features)
        elif method == 'norm':
            features = self.norm(features)
        elif method == 'original':
            pass
        else:
            # print(...) is valid on both Python 2 and 3
            print('Invalid method choice')
            sys.exit(0)
        return features

    def minmax(self, features):
        '''Min-max scale tf/pos per document and df over the corpus.'''
        std_feature = {}
        dfvalues = list(self.worddf.values())
        mindf = min(dfvalues)
        maxdf = max(dfvalues)
        for dockey in features.keys():
            docfeature = features[dockey]
            tfs = [feat[0] for feat in docfeature.values()]
            poss = [feat[2] for feat in docfeature.values()]
            mintf, maxtf = min(tfs), max(tfs)
            minpos, maxpos = min(poss), max(poss)
            for word in docfeature.keys():
                # max(1, span) avoids ZeroDivisionError for a constant
                # feature (the original guarded only tf this way)
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) \
                    / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) \
                    / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) \
                    / max(1, maxpos - minpos)
            std_feature[dockey] = docfeature
        return std_feature

    def norm(self, features):
        '''Z-score tf/pos per document and df over the corpus.'''
        # materialize lists: np.array(map(...)) yields a useless 0-d
        # object array on Python 3
        dfvalues = list(self.worddf.values())
        meandf = np.mean(np.array(dfvalues))
        stddf = np.std(np.array(dfvalues))
        # guard constant features so we never divide by a zero std
        if stddf == 0:
            stddf = 1.0
        for dockey in features.keys():
            docfeature = features[dockey]
            tfs = np.array([feat[0] for feat in docfeature.values()])
            poss = np.array([feat[2] for feat in docfeature.values()])
            meantf, stdtf = np.mean(tfs), np.std(tfs)
            meanpos, stdpos = np.mean(poss), np.std(poss)
            if stdtf == 0:
                stdtf = 1.0
            if stdpos == 0:
                stdpos = 1.0
            for word in docfeature.keys():
                docfeature[word][0] = (docfeature[word][0] - meantf) \
                    / stdtf
                docfeature[word][1] = (docfeature[word][1] - meandf) \
                    / stddf
                docfeature[word][2] = (docfeature[word][2] - meanpos) \
                    / stdpos
            features[dockey] = docfeature
        return features
class NodeFeatureGenerator:
    """This class provides a framework which is easy for feature
    addition and deletion.  The following features are common features
    that have been used in keywords/keyphrase extraction task.
    Features: the first three features listed below must be generated.
    1.TF; 2.DF; 3.POSITION; 4.TF-IDF; 5.lenText; 6.POS-Tagging;
    Note: all features should be normalized to 0-1.
    """

    def __init__(
        self,
        posset,
        feature_bittag,
        dir_text,
        text_suffix,
        dir_feature,
        feature_suffix,
        dir_manualkp,
        manualkp_suffix,
        wordsmap_file,
        featurenum,
        nonposfeature,
    ):
        # receiving input
        self.posset = posset
        self.feature_bittag = feature_bittag
        self.dir_text = dir_text
        self.text_suffix = text_suffix
        self.dir_feature = dir_feature
        self.feature_suffix = feature_suffix
        self.dir_manualkp = dir_manualkp
        self.manualkp_suffix = manualkp_suffix
        self.wordsmap_file = wordsmap_file
        self.featurenum = featurenum
        self.nonposfeature = nonposfeature
        # inner variable setting (order matters: loadmap/WordFilter
        # feed loaddoctext, which feeds getmanualkeywords)
        self.poslist = list(posset)
        self.featuretype = sum(feature_bittag)
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.wordfilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanualkeywords()

    def generatefeature(self, norm_method):
        """Build, normalize, and write features for every document."""
        self.corpfeature = self.mkfeatures()
        self.normalization(norm_method)
        self.outputdocfeatures()

    def loadmap(self):
        """Load the word->id map: one space-separated pair per line."""
        wordsmap = {}
        with open(self.wordsmap_file) as fd:
            for line in fd:
                biparts = line.strip("\n").split(" ")
                wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def loaddoctext(self):
        """Tokenize every text document.

        Returns (doctext, doctags, worddf): cleaned word list per doc,
        tag list per doc, and document frequency per word id.
        """
        doctext = {}
        doctags = {}
        worddf = {}
        doclist = self.getdoclist(self.substr_text, self.dir_text)
        for doc in doclist:
            docwordlist = []
            doctaglist = []
            # 'with' guarantees the handle is closed (it leaked before)
            with open(doc) as fd:
                for line in fd:
                    clean_words, tags = self.wordfilter.filterwords(
                        line.strip("\r\n "))
                    docwordlist += clean_words
                    doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            # compute document frequency for words; count each word id
            # at most once per document even when several distinct
            # surface words map to the same id (the original
            # double-counted that case, unlike
            # FeatureGenerator.loaddoctext)
            docidset = set()
            for word in set(docwordlist):
                if word in self.wordsmap:
                    wordid = self.wordsmap[word]
                    if wordid not in docidset:
                        worddf[wordid] = worddf.get(wordid, 0) + 1
                        docidset.add(wordid)
        return doctext, doctags, worddf

    def getdoclist(self, substr_func, dir_file):
        """Return subdir-prefixed names of all files under each
        directory in dir_file that substr_func accepts.
        """
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # list comprehension instead of filter()/map(): on Python 3
            # those are lazy objects and 'list + map-object' raises
            doclist += [subdir + name for name in candi_files
                        if substr_func(name)]
        return doclist

    def substr_text(self, candi_file):
        # True when the text suffix occurs anywhere in the file name
        return candi_file.find(self.text_suffix) != -1

    def substr_manualkp(self, candi_file):
        # True when the manual-keyphrase suffix occurs in the file name
        return candi_file.find(self.manualkp_suffix) != -1

    def getmanualkeywords(self):
        """segment each keyphrase into keywords
        """
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,
                                  self.dir_manualkp)
        for doc in doclist:
            docname = doc.split("/")[-1].split(".")[0]
            keywordset = set()
            with open(doc) as fd:
                for line in fd:
                    for word in line.strip("\r\n ").split(" "):
                        word = word.lower()
                        word = self.stemmer.stem(word, 0, len(word) - 1)
                        keywordset.add(word)
            # translate stemmed keywords into word ids; a keyword absent
            # from the map raises KeyError, exactly as the original
            # map()-based version did once consumed
            manualkeywords[docname] = set(self.wordsmap[w]
                                          for w in keywordset)
        return manualkeywords

    def mkfeatures(self):
        """Build the raw per-document feature table.

        Each word id maps to a featurenum-long list:
        [tf, df, first position, (tfidf), (length), (pos one-hot...)],
        with the optional slots controlled by self.feature_bittag.
        """
        corpfeature = {}
        for dockey in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[dockey]
            doctags = self.doctags[dockey]
            for i, word in enumerate(doctext):
                if word in self.wordsmap and doctags[i] in self.posset:
                    wordid = self.wordsmap[word]
                    if wordid not in docfeature:
                        wordfeature = [0 for j in range(self.featurenum)]
                        wordfeature[0] = 1                      # tf
                        wordfeature[1] = self.worddf[wordid]    # df
                        wordfeature[2] = i                      # position
                        # word's length feature
                        if self.feature_bittag[4] == 1:
                            wordfeature[4] = len(word)
                        # word's pos feature (one-hot from nonposfeature)
                        if self.feature_bittag[5] == 1:
                            # list.index raises ValueError rather than
                            # returning a negative value, so the former
                            # 'posidx < 0' check was dead code
                            try:
                                posidx = self.poslist.index(doctags[i])
                            except ValueError:
                                print("Invalid pos tags")
                                sys.exit(1)
                            wordfeature[self.nonposfeature + posidx] = 1
                        docfeature[wordid] = wordfeature
                    else:
                        docfeature[wordid][0] += 1
            # word's tfidf feature
            if self.feature_bittag[3] == 1:
                docnum = len(self.doctext.keys())
                for wordkey in docfeature.keys():
                    docfeature[wordkey][3] = self.comptfidf(
                        docfeature[wordkey][0],
                        docfeature[wordkey][1],
                        docnum)
            corpfeature[dockey] = docfeature
        return corpfeature

    def comptfidf(self, tf, df, docnum):
        """Classic tf-idf weight: tf * log(docnum / df)."""
        return tf * math.log((docnum * 1.0) / df)

    def normalization(self, method):
        """
        feature normalization:
        1.document frequency features are normalized in the whole
        corpus;
        2.words frequency and position are normalized in their
        corresponding document.
        """
        if method == "minmax":
            self.minmax()
        elif method == "norm":
            self.norm()
        elif method == "original":
            pass
        else:
            # print(...) is valid on both Python 2 and 3
            print("Invalid method choice")
            sys.exit(0)

    def minmax(self):
        """Min-max scale self.corpfeature in place: df over the whole
        corpus, everything else within its document.
        """
        std_feature = {}
        # words' df feature is normalized over the whole corpus
        dfvalues = list(self.worddf.values())
        mindf = min(dfvalues)
        maxdf = max(dfvalues)
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            tfs = [feat[0] for feat in docfeature.values()]
            poss = [feat[2] for feat in docfeature.values()]
            mintf, maxtf = min(tfs), max(tfs)
            minpos, maxpos = min(poss), max(poss)
            if self.feature_bittag[3] == 1:
                tfidfs = [feat[3] for feat in docfeature.values()]
                mintfidf, maxtfidf = min(tfidfs), max(tfidfs)
                # tf-idf is float-valued, so guard the zero span directly
                tfidfspan = (maxtfidf - mintfidf) or 1.0
            if self.feature_bittag[4] == 1:
                lengths = [feat[4] for feat in docfeature.values()]
                minlength, maxlength = min(lengths), max(lengths)
            for word in docfeature.keys():
                # max(1, span) avoids ZeroDivisionError for a constant
                # feature (the original guarded only tf this way)
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) \
                    / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) \
                    / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) \
                    / max(1, maxpos - minpos)
                if self.feature_bittag[3] == 1:
                    docfeature[word][3] = 1.0 \
                        * (docfeature[word][3] - mintfidf) / tfidfspan
                if self.feature_bittag[4] == 1:
                    docfeature[word][4] = 1.0 \
                        * (docfeature[word][4] - minlength) \
                        / max(1, maxlength - minlength)
            std_feature[dockey] = docfeature
        self.corpfeature = std_feature

    def norm(self):
        # z-score normalization is not implemented for this generator
        pass

    def outputdocfeatures(self):
        """Write one feature file per document.

        Line format:
        "wordid label tf df pos [tfidf] [length] [pos one-hot...]".
        """
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split("/")[-1].split(".")[0]
            # route the output into the train/validation/test directory
            if dockey.find("Train") != -1:
                dir_feature = self.dir_feature[0]
            elif dockey.find("Validation") != -1:
                dir_feature = self.dir_feature[1]
            elif dockey.find("Test") != -1:
                dir_feature = self.dir_feature[2]
            else:
                dir_feature = self.dir_feature[0]
            output_feature_file = dir_feature + manuallabelkey + "." \
                + self.feature_suffix
            with open(output_feature_file, "w") as wfd:
                for word in docfeature.keys():
                    # '%d' of 0/1 emits the same bytes as the two
                    # hard-coded format strings it replaces
                    label = 1 if word in self.manualkeywords[manuallabelkey] \
                        else 0
                    wfd.write("%s %d %f %f %f" % (word, label,
                              docfeature[word][0], docfeature[word][1],
                              docfeature[word][2]))
                    if self.feature_bittag[3] == 1:
                        wfd.write(" %f" % docfeature[word][3])
                    if self.feature_bittag[4] == 1:
                        wfd.write(" %f" % docfeature[word][4])
                    if self.feature_bittag[5] == 1:
                        for i in range(5, self.featurenum):
                            wfd.write(" %d" % docfeature[word][i])
                    wfd.write("\n")