Example #1
import re

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer


class FeatFunctions(object):
    """Extracts string features from raw text and hashes label/feature pairs."""

    def __init__(self, n_features=None):

        # Default hash-space size:
        if not n_features:
            n_features = 100000

        # Initialize the hasher (non_negative was deprecated in scikit-learn 0.19
        # and removed in 0.21; use alternate_sign=False on newer versions):
        self.hasher = FeatureHasher(n_features=n_features, input_type="string", non_negative=True)

        # Initialize the n-gram vectorizer:
        self.vectorizer = CountVectorizer(binary=True)

        # Feature name-function dictionary:
        self.featName_function = {"url": self.url, "all_caps": self.all_caps, "ngrams": self.ngrams}

    def all_caps(self, x):
        pat = re.compile(r"^[A-Z\d]+$")
        groups = pat.match(x)
        if groups:
            return ["f_all_caps"]

    def url(self, x):
        pat = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        groups = pat.findall(x)
        if groups:
            return ["f_url"]

    def ngrams(self, x):
        ngram_feats = self.vectorizer.fit_transform([x])
        return self.vectorizer.inverse_transform(ngram_feats)[0].tolist()

    # An observation function: applies each feature function to raw text x.
    def getObsFeatures(self, x, feat_list):
        str_feats = []
        for feat_fn in feat_list:
            feat = feat_fn(x)
            if feat:
                str_feats += feat

        return str_feats

    def getYXFeatures(self, y_name, y_idx, obs_feat_list):
        # Conjoin the label with every observation feature, then hash:
        xy_feat = [y_name + str(y_idx) + "_" + xfeat for xfeat in obs_feat_list]
        hashed_feats = self.hasher.transform([xy_feat])
        return hashed_feats
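
A minimal sketch of how the class above might be driven; the input text, label name, and label index are made up for illustration:

if __name__ == "__main__":
    ff = FeatFunctions()
    text = "Check this http://example.com"  # hypothetical input

    # Collect string features from every registered feature function:
    feats = ff.getObsFeatures(text, list(ff.featName_function.values()))

    # Conjoin the features with a label and hash them into the fixed space:
    hashed = ff.getYXFeatures("label", 1, feats)
    print(hashed.shape)  # (1, 100000) sparse matrix
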
Example #2
def trainClassifier(batchSize, dataFolder, clfFolderName, tagsSplitSize):
    # Pass 1: hash each training batch once and cache it on disk.
    # Pass 2: train one binary Perceptron per tag over the cached batches.
    startTime = time()
    if not os.path.exists(clfFolderName):
        os.makedirs(clfFolderName)
    if not os.path.exists(clfFolderName + 'Temp'):
        os.makedirs(clfFolderName + 'Temp')
    tags = list(USED_TAGS.keys())
    totalRows = getTotalRows('data/' + dataFolder + '/TrainIds')

    hasher = FeatureHasher()
    batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
    hashInd = 1
    print 'number of tags : ' + str(len(tags))
    extractor = FeatureExtractor()
    for _, X, _ in batchGen:
        batchTime = time()
        print 'computing batch : ' + str(hashInd)
        X_batch = hasher.transform(extractor.extract(sample) for sample in X)
        print 'saving batch : ' + str(hashInd)
        with open(clfFolderName + 'Temp/' + str(hashInd) + '.pkl',
                  'wb') as fid:
            cPickle.dump(X_batch, fid)
        print 'batch time : ' + str(time() - batchTime)
        hashInd += 1
    with open(clfFolderName + '/hasher.pkl', 'wb') as fid:
        cPickle.dump(hasher, fid)
    with open(clfFolderName + '/extractor.pkl', 'wb') as fid:
        cPickle.dump(extractor, fid)
    print 'hashing time : ' + str(time() - startTime)

    tagIndDic = {}
    tagInd = 1
    loop = 1
    for currTags in [
            tags[i:i + tagsSplitSize]
            for i in range(0, len(tags), tagsSplitSize)
    ]:
        iterStartTime = time()
        print 'tags iteration : ' + str(loop)
        clfDic = {}
        for tag in currTags:
            # One binary classifier per tag (n_iter became max_iter in scikit-learn 0.21):
            clfDic[tag] = Perceptron(alpha=ALPHA, n_iter=N_ITER)
        batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
        batchInd = 1
        for _, _, targets_in_batch in batchGen:
            batchTime = time()
            print 'batch number : ' + str(batchInd)
            with open(clfFolderName + 'Temp/' + str(batchInd) + '.pkl',
                      'rb') as fp:
                X_batch = cPickle.load(fp)
            for tag in currTags:
                Y_batch_binary = toBinary(tag, targets_in_batch)
                clfDic[tag].partial_fit(X_batch,
                                        Y_batch_binary,
                                        classes=[0, 1])
            batchInd += 1
            print 'batch time : ' + str(time() - batchTime)
        for tag in clfDic:
            clfDic[tag].sparsify()
            tagIndDic[tag] = tagInd
            with open(clfFolderName + '/' + str(tagInd) + '.pkl', 'wb') as fid:
                cPickle.dump(clfDic[tag], fid)
            tagInd += 1
        loop += 1
        print 'iter time : ' + str(time() - iterStartTime)
        print
    print 'saving model...'
    with open(clfFolderName + '/tagIndDic.pkl', 'wb') as fid:
        cPickle.dump(tagIndDic, fid)

    print 'total time : ' + str(time() - startTime)
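
trainClassifier only fits and pickles the per-tag models. A minimal sketch of the matching load-and-predict step, assuming the same on-disk layout; predictTag and its arguments are hypothetical, not part of the original:

def predictTag(clfFolderName, tagInd, samples):
    import cPickle  # use the pickle module instead on Python 3
    with open(clfFolderName + '/hasher.pkl', 'rb') as fid:
        hasher = cPickle.load(fid)
    with open(clfFolderName + '/extractor.pkl', 'rb') as fid:
        extractor = cPickle.load(fid)
    with open(clfFolderName + '/' + str(tagInd) + '.pkl', 'rb') as fid:
        clf = cPickle.load(fid)
    # Hash the new samples exactly as during training, then score:
    X = hasher.transform(extractor.extract(sample) for sample in samples)
    return clf.predict(X)  # 1 = tag applies, 0 = it does not
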
Example #3
import crfutils  # helper module shipped with the CRFsuite example scripts
from sklearn.feature_extraction import FeatureHasher


def get_features_and_labels(feature_extractor, fields, sep):
    # (Hypothetical wrapper: the original snippet starts mid-function.)
    X = crfutils.get_features(feature_extractor, fields=fields, sep=sep)
    Y = set()
    for x in X:
        for entry in x:
            Y.add(entry['y'])
    return X, list(Y)


if __name__ == '__main__':
    # feature_extractor, fields and separator come from the surrounding
    # script (not shown in the original snippet):
    X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)

    # Apply the hashing trick (non_negative was removed in scikit-learn 0.21):
    hf = FeatureHasher(input_type='string', non_negative=True)

    # Flatten every token's feature strings into one growing document and hash it:
    doc = []
    for x in X:
        for entry in x:
            doc += entry['F']
        print hf.transform(doc)
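
For reference, a self-contained sketch of what the hashing step produces; the token strings are invented, and alternate_sign=False is the modern replacement for non_negative=True:

from sklearn.feature_extraction import FeatureHasher

hf = FeatureHasher(input_type='string', alternate_sign=False)
# Each inner list holds one sample's feature strings; the hasher maps every
# string to a column index, so no vocabulary is kept in memory.
X = hf.transform([['w[0]=He', 'pos[0]=PRP'], ['w[0]=ran', 'pos[0]=VBD']])
print(X.shape)  # (2, 1048576): default n_features is 2 ** 20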