import re

from sklearn.feature_extraction.text import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer


class FeatFunctions(object):
    """Feature extraction functions for raw-text observations."""

    def __init__(self, n_features=None):
        # Default size of the hashing space:
        if not n_features:
            n_features = 100000
        # Initialize the hasher:
        self.hasher = FeatureHasher(n_features=n_features,
                                    input_type="string",
                                    non_negative=True)
        # Initialize the ngram vectorizer:
        self.vectorizer = CountVectorizer(binary=True)
        # Feature name-function dictionary:
        self.featName_function = {"url": self.url,
                                  "all_caps": self.all_caps,
                                  "ngrams": self.ngrams}

    def all_caps(self, x):
        # Fires when the whole string is upper-case letters and/or digits.
        pat = re.compile(r"^[A-Z\d]+$")
        if pat.match(x):
            return ["f_all_caps"]

    def url(self, x):
        # Fires when the text contains a URL.
        pat = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]"
                         r"|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        if pat.findall(x):
            return ["f_url"]

    def ngrams(self, x):
        # Binary bag-of-words features for the text.
        ngram_feats = self.vectorizer.fit_transform([x])
        return self.vectorizer.inverse_transform(ngram_feats)[0].tolist()

    def getObsFeatures(self, x, feat_list):
        # An observation function that extracts features. x is raw text;
        # feat_list is a list of feature functions to apply.
        str_feats = []
        for feat_fn in feat_list:
            feats = feat_fn(x)
            if feats:
                str_feats += feats
        return str_feats

    def getYXFeatures(self, y_name, y_idx, obs_feat_list):
        # Conjoin the label with each observation feature, then hash.
        xy_feat = [y_name + str(y_idx) + "_" + xfeat for xfeat in obs_feat_list]
        hashed_feats = self.hasher.transform([xy_feat])
        return hashed_feats
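
# --- Usage sketch (not part of the original module): a minimal example of
# driving FeatFunctions end to end. The sample text, label name and label
# index below are illustrative placeholders only.
if __name__ == '__main__':
    ff = FeatFunctions()
    text = 'CHECK https://example.com'
    feats = ff.getObsFeatures(text, [ff.url, ff.all_caps, ff.ngrams])
    # e.g. ['f_url', 'check', 'com', 'example', 'https']
    hashed = ff.getYXFeatures('LABEL', 0, feats)
    print hashed.shape  # (1, 100000): one sparse row in the hashed space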
import os
import cPickle
from time import time

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron

# USED_TAGS, ALPHA, N_ITER, getTotalRows, batchGenerator, FeatureExtractor
# and toBinary are defined elsewhere in this project.


def trainClassifier(batchSize, dataFolder, clfFolderName, tagsSplitSize):
    startTime = time()
    if not os.path.exists(clfFolderName):
        os.makedirs(clfFolderName)
    if not os.path.exists(clfFolderName + 'Temp'):
        os.makedirs(clfFolderName + 'Temp')
    tags = list(USED_TAGS.keys())
    totalRows = getTotalRows('data/' + dataFolder + '/TrainIds')
    hasher = FeatureHasher()
    batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
    hashInd = 1
    print 'number of tags : ' + str(len(tags))
    extractor = FeatureExtractor()

    # Pass 1: hash every training batch once and cache it on disk, so the
    # expensive feature extraction is not repeated for each tag group.
    for _, X, _ in batchGen:
        batchTime = time()
        print 'computing batch : ' + str(hashInd)
        X_batch = hasher.transform(extractor.extract(sample) for sample in X)
        print 'saving batch : ' + str(hashInd)
        with open(clfFolderName + 'Temp/' + str(hashInd) + '.pkl', 'wb') as fid:
            cPickle.dump(X_batch, fid)
        print 'batch time : ' + str(time() - batchTime)
        hashInd += 1
    with open(clfFolderName + '/hasher.pkl', 'wb') as fid:
        cPickle.dump(hasher, fid)
    with open(clfFolderName + '/extractor.pkl', 'wb') as fid:
        cPickle.dump(extractor, fid)
    print 'hashing time : ' + str(time() - startTime)

    # Pass 2: train one binary Perceptron per tag, tagsSplitSize tags at a
    # time, streaming the cached batches through partial_fit.
    tagIndDic = {}
    tagInd = 1
    loop = 1
    for currTags in [tags[i:i + tagsSplitSize]
                     for i in range(0, len(tags), tagsSplitSize)]:
        iterStartTime = time()
        print 'tags iteration : ' + str(loop)
        clfDic = {}
        for tag in currTags:
            clfDic[tag] = Perceptron(alpha=ALPHA, n_iter=N_ITER)
        batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
        batchInd = 1
        for _, _, targets_in_batch in batchGen:
            batchTime = time()
            print 'batch number : ' + str(batchInd)
            with open(clfFolderName + 'Temp/' + str(batchInd) + '.pkl', 'rb') as fp:
                X_batch = cPickle.load(fp)
            for tag in currTags:
                Y_batch_binary = toBinary(tag, targets_in_batch)
                clfDic[tag].partial_fit(X_batch, Y_batch_binary, classes=[0, 1])
            batchInd += 1
            print 'batch time : ' + str(time() - batchTime)
        # Sparsify each trained classifier and persist it to disk.
        for tag in clfDic:
            clfDic[tag].sparsify()
            tagIndDic[tag] = tagInd
            with open(clfFolderName + '/' + str(tagInd) + '.pkl', 'wb') as fid:
                cPickle.dump(clfDic[tag], fid)
            tagInd += 1
        loop += 1
        print 'iter time : ' + str(time() - iterStartTime)
        print
    print 'saving model...'
    with open(clfFolderName + '/tagIndDic.pkl', 'wb') as fid:
        cPickle.dump(tagIndDic, fid)
    print 'total time : ' + str(time() - startTime)
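
# --- Invocation sketch (illustrative only): the batch size, folder names
# and tag split size below are placeholders, not values from the project.
if __name__ == '__main__':
    trainClassifier(batchSize=10000, dataFolder='myData',
                    clfFolderName='models', tagsSplitSize=500)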
import crfutils
from sklearn.feature_extraction import FeatureHasher

# feature_extractor, fields and separator are defined elsewhere in this module.

# (continuation of a function whose definition precedes this excerpt;
#  it collects the set of labels seen in the extracted features)
    # X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)
    # doc = []
    Y = set()
    for x in X:
        for entry in x:
            Y.add(entry['y'])
            # doc.append(entry['F'])
    # return X, hf.transform(doc)
    return X, list(Y)


if __name__ == '__main__':
    # crfutils.main(feature_extractor, fields=fields, sep=separator)
    X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)
    # Apply the hashing trick to the token string features:
    hf = FeatureHasher(input_type='string', non_negative=True)
    # Flatten every token's feature strings into a single document:
    doc = []
    for x in X:
        for entry in x:
            doc += entry['F']
    # transform expects an iterable of samples, each an iterable of strings,
    # so the flat feature list is wrapped as a single sample:
    print hf.transform([doc])
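
# --- Self-contained sketch of the hashing trick used above; the feature
# strings here are invented. non_negative=True mirrors the code above but
# was removed in scikit-learn 0.21, so it is omitted in this sketch.
if __name__ == '__main__':
    hf_demo = FeatureHasher(n_features=2 ** 10, input_type='string')
    doc_demo = ['f_url', 'f_all_caps', 'w=hello', 'w=world']
    X_demo = hf_demo.transform([doc_demo])  # one row holding all four features
    print X_demo.shape  # (1, 1024)
    print X_demo.nnz    # non-zero hashed slots (at most 4, barring collisions)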