Example #1
def process_records(records, fields, target, textmodel=None):
	tokenize = CountVectorizer().build_analyzer()

	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		textmodel = None
		X = X.tocsr()

	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Example #2
File: ffm.py Project: PKostya/kaggle
def to_ffm(df, outfile, ycol, num_columns = []):
    df = df.copy()
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    bs = 2**10
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    with open(outfile, 'w') as out:
        pb = progressbar.ProgressBar(maxval=(df.shape[0]+bs+1) // bs).start()
        for i in xrange((df.shape[0]+bs+1) // bs):
            pb.update(i)
            s = slice(i*bs, (i+1)*bs)
            if ycol in df.columns:
                Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns,axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = df.iloc[s][ycol].values.astype('int')
            else:
                Xh = np.asarray(df.iloc[s].drop(num_columns,axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = np.zeros((bs,))
            Xt = scipy.sparse.hstack([Xv,hasher.transform(Xh)]).tocsr()
            for j in xrange(Xt.shape[0]):
                span = slice(Xt.indptr[j], Xt.indptr[j+1])
                row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
                st = " ".join(value_pattern % (j + one_based, fe + one_based, x) for j, fe, x in row if np.isnan(x) == False)
                feat = (y[j], st)
                out.write((line_pattern % feat).encode('ascii'))
        pb.finish()
Example #3
class QClassifierImpl:
    """
    A wrapper for a question classifier
    """

    def __init__(self, train_data_path, pred_qs = None):
        """
        Constructor
        """
        logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train using all of the given data
        """
        self.extractor.load(path = self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions = self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get type for a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
Example #4
class HashSarca(Sarcalingua):
    
    def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
        
        self.featureExtractor = FeatureHasher(pow(2,nbits), input_type="pair")
        self.classifier = model
        self.outEncoder = LabelEncoder()
        self.drop_outs = set((   u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
                    u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
                    u"uncyclopedia", u"wikipedia"))
        
    def extractFeatures(self, clean_text):
        return self.featureExtractor.transform( (token_pattern.finditer(clean_text),) )
        
    def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):
        
        def prepare(raw_text):
            tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
            if random.random() < 0.5:   # we delete the drop-outs half the time
                tokens = [tok for tok in tokens if tok not in self.drop_outs]
            try:
                alpha = 1./len(tokens)  #1./(1+log(len(tokens)))
                return ((tok.lower(), alpha) for tok in tokens)
            except ZeroDivisionError:
                return tuple()
        
        for chunk in chunkIterator:
            X = self.featureExtractor.transform(imap(prepare, chunk.text))
            y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
            
            yield X,y
            gc.collect()
Example #5
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
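The `lengths` array returned above groups the rows of X and y back into their original sentences. A standalone sketch of that grouping, with invented lengths:

import numpy as np

# lengths = [3, 2] would mean rows 0-2 of X/y form sequence 1 and rows 3-4 form sequence 2
lengths = np.array([3, 2], dtype=np.int32)
starts = np.concatenate([[0], np.cumsum(lengths)[:-1]])
for start, n in zip(starts, lengths):
    print(list(range(start, start + n)))   # row indices of one sequence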
def hash(data, labels, new_dimension):
    print "start hashing trick..."
    # convert features as dict
    dictList = list()
    if hasattr(data, "indices"):
        #ind = data.indices
        #dat = data.data
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            zipped = zip(indices, item)
            row = dict()
            for index,value in zipped:
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)

        a = 234
    else:
        indices = map(str, range(len(data[0])))
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension) # , input_type='dict'
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end-start)
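The helper above feeds FeatureHasher through its default dict input: each dense row becomes a {column-index: value} dict and only the non-zero entries are hashed. A minimal standalone sketch of that step (toy data, hypothetical sizes):

from sklearn.feature_extraction import FeatureHasher

rows = [[1.0, 0.0, 3.0],
        [0.0, 2.0, 5.0]]
dicts = [{str(i): v for i, v in enumerate(row) if v != 0} for row in rows]

hasher = FeatureHasher(n_features=4)          # input_type='dict' is the default
reduced = hasher.transform(dicts).toarray()   # shape (2, 4)
print(reduced)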
Example #7
def io():
    hv = FeatureHasher()

    target = []
    train_int = []
    train_label = []

    for iline in dio.io():
        iline = iline.strip().split(',')
        t = int(iline[0])
        int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
        label_fs = [k for k in iline[14:]]
        #label_fs = ",".join(iline[14:])
#        print int_fs, label_fs

        target.append(t)
        train_int.append(int_fs)
        train_label.append({k:1 for k in label_fs if k})

#    print train_int
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_int = imp.fit_transform(train_int)
#    print train_int
    scaler = preprocessing.StandardScaler().fit(train_int)
    train_int = scaler.transform(train_int)
#    print train_int
    train_int = csr_matrix(train_int)
#    print train_label
    train_label = hv.transform(train_label)
    train = hstack((train_int, train_label))
#    print train_label
#    print train
    return target, train
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Example #10
 def hash(mat, num_features):
   """
   hashing trick
   """
   hasher = FeatureHasher(n_features=num_features, non_negative=True)
   X = hasher.transform(mat)
   X = X.toarray()
   return X
Example #11
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""

    global _model

    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2**16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)
    gen = (d.iteritems() for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example #14
File: file.py Project: PKostya/kaggle
def dump_libffm_format(X, y, f):
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    Xt = hasher.transform(X)
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    for i in xrange(Xt.shape[0]):
        span = slice(Xt.indptr[i], Xt.indptr[i+1])
        row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
        s = " ".join(value_pattern % (j + one_based, fe, x) for j, fe, x in row)
        feat = (y[i], s)
        f.write((line_pattern % feat).encode('ascii'))
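Both libffm writers in these examples rely on the CSR layout of the hashed matrix: Xt.indptr[i]:Xt.indptr[i+1] delimits row i's entries in Xt.indices and Xt.data. A standalone sketch of that pattern, with made-up feature strings:

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=16, input_type='string')
Xt = hasher.transform([["user_1", "ad_7"], ["user_2"]])

for i in range(Xt.shape[0]):
    span = slice(Xt.indptr[i], Xt.indptr[i + 1])
    pairs = zip(Xt.indices[span], Xt.data[span])
    print(i, " ".join("%d:%g" % (col, val) for col, val in pairs))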
Example #15
def encode_titles(titles, num_features=2**14):
  '''
  Encode the titles, given as strings, as numerical values using
  the 'hashing trick'.
  The size of the feature vector can be specified via the
  num_features parameter.
  '''
  myHasher = FeatureHasher(input_type='string',
                           n_features= num_features,
                           non_negative=True)
  featureMatrix = myHasher.transform(titles)
  return featureMatrix, myHasher
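One caveat worth noting here: with input_type='string', FeatureHasher expects each sample to be an iterable of strings, so a bare title string is hashed character by character. A hedged sketch of feeding pre-tokenized titles instead (whitespace tokenization is an assumption, not part of the example above):

from sklearn.feature_extraction import FeatureHasher

titles = ["cheap flights to paris", "paris hotel deals"]
hasher = FeatureHasher(n_features=2 ** 14, input_type='string')
X = hasher.transform(title.split() for title in titles)
print(X.shape)   # (2, 16384)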
Example #16
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id == True:
            temp_features.append(("id_"+str(arm_id), 1))
        all_features.append(temp_features)

    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)
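For reference, input_type='pair' as used above takes one list of (feature_name, value) tuples per sample. A minimal sketch with made-up arm ids and two numeric features per arm:

from sklearn.feature_extraction import FeatureHasher

samples = [
    [("0", 0.5), ("1", 1.2), ("id_7", 1)],
    [("0", 0.1), ("1", 0.9), ("id_9", 1)],
]
hasher = FeatureHasher(n_features=32, input_type='pair')
X = hasher.transform(samples)
print(X.shape)   # (2, 32)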
    def predictUserScore(self, body, tags, fgen, users):
        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        # document features
        featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]
        # additional features
        featureVector.append(("Length", 1))
        featureVector.append(("Score", 1))
        featureVector.append(("Accepted", 1))
        featureVector.append(("OwnerRep", 1))

        X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])
        scores = [score for index, score in enumerate(self.cf.decision_function(X)[0]) if int(self.cf.classes_[index]) in users]
        return scores
Example #18
def hash_array(feature_dict, feature_num):
	# print feature_dict[0]
	if feature_num == 1:
		x_new = np.asarray(feature_dict)
		x_h = x_new.reshape(len(feature_dict), 1)
	else:
		hasher = FeatureHasher(n_features=feature_num, non_negative=True, input_type='dict')
		X_new = hasher.fit_transform(feature_dict)
		x_h = X_new.toarray()
		# vec = DictVectorizer()
		# x_h = vec.fit_transform(feature_dict).toarray()
		# print x_h.shape, type(x_h)
	return x_h
Example #19
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the tab-separated parts are usually tokens (and maybe things like
    part-of-speech tags) rather than feature vectors, a function must be
    supplied that does the actual feature extraction. This function has access
    to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick")
    is used to map symbolic input feature names to columns, so this function
    does not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
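The docstring above fixes the contract for the features argument; a hypothetical callable satisfying it (the feature strings are invented for illustration) could look like the sketch below, and would then be passed as load_conll(f, word_features):

def word_features(tokens, i):
    """Yield string features for tokens[i], with access to the whole sequence."""
    yield "word=" + tokens[i].lower()
    yield "suffix3=" + tokens[i][-3:]
    if i == 0:
        yield "BOS"
    else:
        yield "prev=" + tokens[i - 1].lower()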
    def predictUsers(self, body, tags, fgen, n = 3):
        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        # document features
        featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]
        # additional features
        featureVector.append(("Length", 1))
        featureVector.append(("Score", 1))
        featureVector.append(("Accepted", 1))
        featureVector.append(("OwnerRep", 1))

        X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])
        userIds = [int(self.cf.classes_[index]) for index, score in sorted(enumerate(self.cf.decision_function(X)[0]), key=lambda x:x[1], reverse=True)][:n]
        # print(userIds)
        # print(self.cf.predict(X))

        return [Users.get(Users.id == userId) for userId in userIds]
    def initialize(self):
        if self.model_class == 'scikit':
            self.model = SGDRegressor(loss='squared_loss', alpha=0.1, n_iter=10, shuffle=True, eta0=0.0001)
            self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64, non_negative=False, input_type='dict')

        elif self.model_class == 'lookup':
            self.model = {}
Example #22
File: models.py Project: mrshu/diaqres
    def train(self, corpus, classes=None, chunk_size=100000):
        self.vectorizer = FeatureHasher(non_negative=True,
                                        n_features=len(classes)*2*self.window,
                                        input_type='pair')
        self.clf = MultinomialNB()
        i = 0
        j = 0
        X = []
        Y = []
        for x, y in corpus:
            if x[self.window][1] in self.input_classes:
                X.append(x)
                Y.append(y)
                i += 1
            if i < chunk_size:
                continue

            j += 1
            click.echo("Running iteration {}".format(j))

            X = self.vectorizer.transform(X)
            self.clf.partial_fit(X, Y, classes)
            X = []
            Y = []
            i = 0
Example #23
 def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
     
     self.featureExtractor = FeatureHasher(pow(2,nbits), input_type="pair")
     self.classifier = model
     self.outEncoder = LabelEncoder()
     self.drop_outs = set((   u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
                 u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
                 u"uncyclopedia", u"wikipedia"))
Example #24
class Model:
    def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):

        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
##            for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            xHash = self.FH.transform(x) #hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction
    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch
    def score(self, target, prediction):
        return llfun(target, prediction)
Example #25
    def process(self):

        header = self.inputFile.readline()

        ids = []
        self.features = []

        count = 0
        for line in self.inputFile:
            count += 1
            fields = line.split(',')

            id = fields[0]
            names = {}
            name = Kaggle_Grupo.Utils.StringNormalize(fields[1])

            for i in name.split(' '):
                names[i] = 1

            ids.append(id)
            self.features.append(names)




        featureHasher = FeatureHasher(n_features=2**12, dtype=np.uint16)

        self.features = featureHasher.transform(self.features)
        self.features = self.features.toarray()

        self.features = self.encode(width=24)


        headerFields  = ["Cliente_ID"]

        for i in range(self.features.shape[1]):
            headerFields.append('ClientName_{}'.format(i))

        headerFields = "\t".join(headerFields)


        self.outputFile.write(headerFields+'\n')

        for i in range(self.features.shape[0]):
            self.outputFile.write('{}\t{}\n'.format(ids[i], ('\t'.join(self.features[i].astype('str')).replace('False', '0').replace('True', '1'))))
def test_feature_hasher_pairs_with_string_values():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
                                       {"baz": u"abc", "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 1], x1_nz)
    assert_equal([1, 1, 4], x2_nz)

    raw_X = (iter(d.items()) for d in [{"bax": "abc"},
                                       {"bax": "abc"}])
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = np.abs(x1[x1 != 0])
    x2_nz = np.abs(x2[x2 != 0])
    assert_equal([1], x1_nz)
    assert_equal([1], x2_nz)
    assert_equal(x1, x2)
Example #27
def gen_cinput(origindata, pooldata = [],threshold = 5):
	origin_feas = gen_feature_data(origindata)
	pool_feas = gen_feature_data(pooldata)

	feas_X = []
	label_Y = []
	s  = set()
	for seq in origin_feas:
		feas_X.extend([item["F"] for item in seq])
		for item in seq:
			s.update(item["F"])
		label_Y.extend([item["L"] for item in seq])

	assert len(feas_X) == len(label_Y)
	print "original  data  data  num   :   "+str(len(feas_X))

	
	feas_X_2 = []
	label_Y_2 = []
	for seq_id, seq in enumerate(pool_feas):
		for token_id, token in enumerate(seq):
			if pooldata[seq_id][2][token_id] == 1:
				feas_X_2.append(token["F"])
				s.update(token["F"])
				label_Y_2.append(token["L"])



	print "pool data  data  num   :   "+str(len(feas_X_2))


	print "original feature num   ................ "+str(len(s))

	X = feas_X + feas_X_2
	X = featurefilter(X, threshold)
	print X[:2]


	Y = label_Y + label_Y_2
	h = FeatureHasher(input_type = "string", non_negative = True)

	X = h.transform(X)

	return X ,Y, h 
def test_feature_hasher_strings():
    raw_X = [[u"foo", "bar", "baz", "foo"], [u"bar", "baz", "quux"]]  # note: duplicate

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, sum(len(set(x)) for x in raw_X))
    def learn(self, fgen, postLimit=None):
        Parent = Posts.alias()
        query = Posts.select().join(Parent, on=(Posts.parentid == Parent.id)).where((Posts.posttypeid == 2) & (Parent.forevaluation == 0))
        if postLimit is not None:
            query = query.limit(postLimit)
        count = query.count()
        print("Learning {0} questions".format(count))

        allClasses = numpy.array([user.id for user in Users.select()])

        maxUserRep = float(Users.select(peewee.fn.Max(Users.reputation)).scalar())

        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        featureMatrix = []
        classList = []
        for i, answer in enumerate(query):
            if answer.owneruserid is None:
                continue
            print("Generating feature vector for id {0}".format(answer.id))
            # document features
            # featureVector = fgen.getDocumentFeatures(answer.parentid.title + answer.parentid.body + answer.body, tagIds)
            featureVector = fgen.getAnswerFeatures(answer)

            featureVector = [(str(dim), value) for dim, value in featureVector]
            # additional features
            maxScore = Posts.select(peewee.fn.Max(Posts.score)).where(Posts.parentid == answer.parentid).scalar()
            maxLength = max(len(post.body) for post in Posts.select().where(Posts.parentid == answer.parentid))
            featureVector.append(("Length", (len(answer.body)/float(maxLength))))
            featureVector.append(("Score", 1 if maxScore == 0 else (answer.score/float(maxScore))))
            featureVector.append(("Accepted", 1 if answer.id == answer.parentid.acceptedanswerid else 0))
            featureVector.append(("OwnerRep", answer.owneruserid.reputation/maxUserRep))

            featureMatrix.append(featureVector)
            classList.append(answer.owneruserid.id)
            if len(featureMatrix) == self.batchSize or i == count-1:
                print("Partial fitting classifier".format(answer.id))
                X = featureHasher.transform(featureMatrix)
                Y = numpy.array(classList)
                self.cf.partial_fit(X, Y, classes=allClasses)
                allClasses = None
                featureMatrix = []
                classList = []
Example #30
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")], ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
Example #31

parser = argparse.ArgumentParser("get windows object vectors for files")
parser.add_argument("--malware_paths",
                    default=None,
                    help="Path to malware training files")
parser.add_argument("--benignware_paths",
                    default=None,
                    help="Path to benignware training files")
parser.add_argument("--scan_file_path", default=None, help="File to scan")
parser.add_argument("--evaluate",
                    default=False,
                    action="store_true",
                    help="Perform cross-validation")

args = parser.parse_args()

hasher = FeatureHasher(20000)
if args.malware_paths and args.benignware_paths and not args.evaluate:
    train_detector(args.benignware_paths, args.malware_paths, hasher)
elif args.scan_file_path:
    scan_file(args.scan_file_path)
elif args.malware_paths and args.benignware_paths and args.evaluate:
    X, y = get_training_data(args.benignware_paths, args.malware_paths, hasher)
    cv_evaluate(X, y, hasher)
else:
    print "[*] You did not specify a path to scan," \
        " nor did you specify paths to malicious and benign training files" \
        " please specify one of these to use the detector.\n"
    parser.print_help()
class ActorCriticLearner(WhenLearner):
    def __init__(self,
                 gamma=0.9,
                 lr=1e-3,
                 state_size=1000,
                 action_size=1000,
                 hidden_size=200):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.lr = lr

        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size

        self.state_hasher = FeatureHasher(n_features=self.state_size)
        self.action_hasher = FeatureHasher(n_features=self.action_size)
        self.value_net = ACValueNet(self.state_size, self.hidden_size)
        self.action_net = ACActionNet(self.action_size, self.hidden_size,
                                      self.hidden_size)

        params = (list(self.value_net.parameters()) +
                  list(self.action_net.parameters()))
        self.optimizer = torch.optim.Adam(params, lr=self.lr)

    def gen_state_vector(self, state: dict) -> np.ndarray:
        state = {str(a): state[a] for a in state}
        return self.state_hasher.transform([state]).toarray()

    def gen_action_vectors(self,
                           actions: Collection[Activation]) -> np.ndarray:

        action_dicts = []
        for action in actions:
            act_d = {}
            name = action.get_rule_name()
            act_d['rulename'] = name
            bindings = action.get_rule_bindings()
            for a, v in bindings.items():
                if isinstance(v, bool):
                    act_d[str(a)] = str(v)
                else:
                    act_d[str(a)] = v
            action_dicts.append(act_d)

        return self.action_hasher.transform(action_dicts).toarray()

    def eval_all(self, state: dict,
                 actions: Collection[Activation]) -> Collection[float]:
        pass

    def eval(self, state: dict, action: Activation) -> float:
        if state is None:
            return 0

        state_x = torch.from_numpy(self.gen_state_vector(state)).float().to(
            self.device)
        action_x = torch.from_numpy(self.gen_action_vectors(
            [action])).float().to(self.device)

        with torch.no_grad():
            state_val, state_hidden = self.value_net(state_x)
            action_val = self.action_net(action_x, state_hidden)
            return action_val[0].cpu().item()

    def update(
        self,
        state: dict,
        action: Activation,
        reward: float,
        next_state: dict,
        next_actions: Collection[Activation],
    ) -> None:
        return

        sa = self.generate_vector(state, action)
        if len(next_actions) == 0:
            next_sa = None
        else:
            next_sa = np.stack((self.generate_vector(next_state,
                                                     next_actions[i])
                                for i in range(len(next_actions))))

        # print("REWARD")
        # print(reward)
        # print("NEXT SAs")
        # print(next_sa.shape)
        # print()

        self.replay_memory.push(
            torch.from_numpy(sa).float().to(self.device),
            torch.tensor([reward]).to(self.device),
            torch.from_numpy(next_sa).float().to(self.device))

        self.train()
 def __init__(self, verbose, min_label_count=1, inference=False):
     self.fh = FeatureHasher(dtype='float32')
     self.verbose = verbose
     self.inference = inference
     self.min_label_count = min_label_count
pipeline = Pipeline([
    # Extract the subject & body
    ('HeadlineBodyFeatures', HeadlineBodyFeaturesExtractor()),

    # Use FeatureUnion to combine the features from subject and body
    (
        'union',
        FeatureUnion(
            transformer_list=[

                #Pipeline for pulling punctuation feature from articles
                #Using FeatureHasher for both headline and the body
                ('punct_stats_headline',
                 Pipeline([
                     ('selector', ItemSelector(key='headline')),
                     ('stats', Punct_Stats()),
                     ('vect', FeatureHasher(10)),
                 ])),
                ('punct_stats_body',
                 Pipeline([
                     ('selector', ItemSelector(key='article_body')),
                     ('stats', Punct_Stats()),
                     ('vect', FeatureHasher(10)),
                 ])),
            ], )),

    # Use a Bernoulli Naive Bayes classifier as the baseline model
    ('clf', BernoulliNB()),
])

# Fitting the pipeline to the training texts and labels
pipeline.fit(train_texts, train_labels)
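A self-contained sketch of the same hashing-inside-a-Pipeline idea, with a stand-in transformer in place of the project's Punct_Stats/ItemSelector classes (illustrative only):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import Pipeline

class PunctCounts(BaseEstimator, TransformerMixin):
    """Map each text to a dict of punctuation counts (stand-in for Punct_Stats)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [{ch: text.count(ch) for ch in "!?."} for text in X]

pipe = Pipeline([('stats', PunctCounts()),
                 ('vect', FeatureHasher(10))])
X = pipe.fit_transform(["Really?!", "Fine."])
print(X.shape)   # (2, 10)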
Example #35
def make_classification_data(num_examples=100,
                             train_test_ratio=0.5,
                             num_features=10,
                             use_feature_hashing=False,
                             feature_bins=4,
                             num_labels=2,
                             empty_labels=False,
                             feature_prefix='f',
                             class_weights=None,
                             non_negative=False,
                             one_string_feature=False,
                             num_string_values=4,
                             random_state=1234567890):

    # use sklearn's make_classification to generate the data for us
    num_numeric_features = (num_features -
                            1 if one_string_feature else num_features)
    X, y = make_classification(n_samples=num_examples,
                               n_features=num_numeric_features,
                               n_informative=num_numeric_features,
                               n_redundant=0,
                               n_classes=num_labels,
                               weights=class_weights,
                               random_state=random_state)

    # if we were told to only generate non-negative features, then
    # we can simply take the absolute values of the generated features
    if non_negative:
        X = abs(X)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a string feature that has four possible values
    # 'a', 'b', 'c' and 'd' and add it to X at the end
    if one_string_feature:
        prng = RandomState(random_state)
        random_indices = prng.random_integers(0, num_string_values - 1,
                                              num_examples)
        possible_values = [chr(x) for x in range(97, 97 + num_string_values)]
        string_feature_values = [possible_values[i] for i in random_indices]
        string_feature_column = np.array(string_feature_values,
                                         dtype=object).reshape(100, 1)
        X = np.append(X, string_feature_column, 1)

    # create a list of dictionaries as the features
    feature_names = [
        '{}{:02d}'.format(feature_prefix, n)
        for n in range(1, num_features + 1)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # are we told to generate empty labels
    train_labels = None if empty_labels else train_y
    test_labels = None if empty_labels else test_y

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(
        n_features=feature_bins) if use_feature_hashing else None)
    train_fs = FeatureSet('classification_train',
                          train_ids,
                          labels=train_labels,
                          features=train_features,
                          vectorizer=vectorizer)
    if train_test_ratio < 1.0:
        test_fs = FeatureSet('classification_test',
                             test_ids,
                             labels=test_labels,
                             features=test_features,
                             vectorizer=vectorizer)
    else:
        test_fs = None

    return (train_fs, test_fs)
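A hedged usage sketch for the helper above (assumes SKLL's FeatureSet and this function are importable); with use_feature_hashing=True, both returned FeatureSets share a FeatureHasher(n_features=feature_bins) vectorizer:

train_fs, test_fs = make_classification_data(num_examples=200,
                                              train_test_ratio=0.5,
                                              use_feature_hashing=True,
                                              feature_bins=8)
print(train_fs)   # FeatureSet holding the 100 hashed training examples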
Example #36
        # 'clf__eta0':(0.0001, 0.00001,0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (1000, 5000, 8000, 10000),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, cv=4)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    for number_hashing_features in list_number_hashing_features:
        print("Number of hashing features : %d" % number_hashing_features)
        data = original_data
        hasher = FeatureHasher(n_features=number_hashing_features,
                               input_type='string')

        for column_name in list_hash_columns:
            data = hashing_data(data, column_name, hasher,
                                number_hashing_features)

        data = extract_time_stamp_feature(data)
        X_data, Y_data = split_X_and_Y(data)

        grid_search.fit(X_data, Y_data['click'])

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        print(grid_search.grid_scores_)
        for param_name in sorted(parameters.keys()):
Example #37
def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    X = FeatureHasher().transform([{"foo": 0}])
    assert X.data.shape == (0,)
Example #38
class RegCBLearner(Learner):
    """A learner using the RegCB algorithm by Foster et al.
        and the online bin search implementation by Bietti et al. 

    References:
        Foster, Dylan, Alekh Agarwal, Miroslav Dudík, Haipeng Luo, and Robert Schapire.
        "Practical contextual bandits with regression oracles." In International 
        Conference on Machine Learning, pp. 1539-1548. PMLR, 2018.

        Bietti, Alberto, Alekh Agarwal, and John Langford.
        "A contextual bandit bake-off." arXiv preprint 
        arXiv:1802.04064 (2018).
    """

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information
        """
        return f"RegCB"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information
        """
        dict = {'beta': self._beta, 'alpha': self._alpha, 'interactions': self._interactions}
        return dict

    def __init__(self, *, beta: float, alpha: float, learning_rate:float=0.1, interactions: Sequence[str] = ['a', 'ax']) -> None:
        """Instantiate a RegCBLearner.

        Args:
            beta : square-loss tolerance
            alpha: confidence bounds precision
            interactions: the set of interactions the learner will use. x refers to context and a refers to actions, 
                e.g. xaa would mean interactions between context, actions and actions. 
        """

        PackageChecker.sklearn("RegCBLearner")
        from sklearn.feature_extraction import FeatureHasher
        from sklearn.preprocessing import PolynomialFeatures

        self._beta  = beta
        self._alpha = alpha
        self._iter  = 0

        self._core_model = []

        self._times         = [0,0,0,0]
        self._interactions  = interactions
        self._terms         = []
        self._learning_rate = learning_rate

        for term in self._interactions:
            term = term.lower()
            x_num = term.count('x')
            a_num = term.count('a')

            if x_num + a_num != len(term):
                raise Exception("Letters other than x and a were passed for parameter interactions. Please remove other letters/characters.")

            self._terms.append((x_num, a_num))

        max_x_term = max(max(term[0] for term in self._terms),1)
        max_a_term = max(max(term[1] for term in self._terms),1)

        self._x_p = PolynomialFeatures(degree=max_x_term, include_bias=False, interaction_only=False)
        self._a_p = PolynomialFeatures(degree=max_a_term, include_bias=False, interaction_only=False)
        self._h   = FeatureHasher(input_type='pair')

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """

        import numpy as np
        from scipy import sparse

        if self._iter == 0:
            if isinstance(context,dict) or isinstance(actions[0],dict):
                self._core_model = sparse.csr_matrix(self._featurize(context, actions[0]).shape)
            else:
                self._core_model = np.zeros(self._featurize(context, actions[0]).shape)

        if self._iter == 200:
            self._times = [0,0,0,0]

        if (self._iter < 200):
            return [1/len(actions)] * len(actions)

        else:
            maxScore  = -float('inf')
            maxAction = None

            for action in actions:
                features = self._featurize(context,action)
                score = self._bin_search(features, len(actions))

                if score > maxScore:
                    maxAction = action
                    maxScore  = score

            return [int(action == maxAction) for action in actions]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """

        start = time.time()
        features = self._featurize(context, action)
        self._core_model = self._update_model(self._core_model, features, reward, 1)
        self._times[2] += time.time()-start

        self._iter += 1

        # if (self._iter-200-1) % 50 == 0 and self._iter > 200:
        #     print(f'avg phi time: {round(self._times[0]/(self._iter-200),2)}')
        #     print(f'avg bin time: {round(self._times[1]/(self._iter-200),2)}')
        #     print(f'avg lrn time: {round(self._times[2]/(self._iter-200),2)}')

    def _bin_search(self, features, K_t) -> float:

        start = time.time()

        y_u = 2
        w   = 1

        f_u_a_w = self._update_model(self._core_model, features, y_u, w)
        f_x_t_a = self._predict_model(self._core_model, features)
        s_u_a   = (self._predict_model(f_u_a_w, features) - f_x_t_a) / w

        obj = lambda w: w*(f_x_t_a-y_u)**2 - w*(f_x_t_a+s_u_a*w-y_u)**2

        lower_search_bound = 0
        upper_search_bound = (f_x_t_a-y_u)/(-s_u_a)
        width_search_bound = upper_search_bound - lower_search_bound

        constraint = self._alpha * math.log(K_t)

        w_old = lower_search_bound
        w_now = lower_search_bound + 1/2*width_search_bound
        o     = obj(w_now)

        while abs(w_now-w_old) > width_search_bound*(1/2)**30 or o >= constraint:
            w_diff = abs(w_now-w_old)
            w_old  = w_now
            if o < constraint:
                w_now += w_diff/2
            else:
                w_now -= w_diff/2
            o = obj(w_now)

        self._times[1] += time.time() - start

        return f_x_t_a + s_u_a*w_now

    def _featurize(self, context, action):
        import numpy as np #type: ignore

        start = time.time()

        is_sparse = isinstance(context, dict) or isinstance(action, dict)

        if isinstance(context, dict):
            context_values = list(context.values())
            context_names  = list([ f"x{k}" for k in context.keys() ])
        else:
            context_values = (context or [1])
            context_names  = [''] if not is_sparse else [ f"x{i}" for i in range(len(context_values)) ]

        if isinstance(action, dict):
            action_names  = list([ f"a{k}" for k in action.keys() ])
            action_values = list(action.values())
        else:
            action_values = action
            action_names  = [''] if not is_sparse else [ f"a{i}" for i in range(len(action_values)) ]

        x_terms_by_degree = self._terms_by_degree(len(context_values), self._x_p.fit_transform([context_values])[0])
        a_terms_by_degree = self._terms_by_degree(len(action_values) , self._a_p.fit_transform([action_values])[0])
        features          = self._interaction_terms(x_terms_by_degree, a_terms_by_degree, [1])

        if is_sparse:
            x_names_by_degree = self._terms_by_degree(len(context_values), self._x_p.get_feature_names(context_names))
            a_names_by_degree = self._terms_by_degree(len(action_values), self._a_p.get_feature_names(action_names))
            names             = self._interaction_terms(x_names_by_degree, a_names_by_degree, [''])

        final_features = np.array(features) if not is_sparse else self._h.fit_transform([list(zip(names,features))])

        self._times[0] += time.time() - start

        return final_features

    def _terms_by_degree(self, base_term_count:int, terms:Sequence[Any], with_bias:bool = False) -> Dict[int,Sequence[Any]]:
        terms_by_degree = {} 

        index  = 0 if not with_bias else 1
        degree = 1

        while index != len(terms):
            degree_terms_count = int((base_term_count**degree + base_term_count)/2)
            terms_by_degree[degree] = terms[index:degree_terms_count]

            index  += degree_terms_count
            degree += 1

        return terms_by_degree

    def _interaction_terms(self, x_terms_by_degree, a_terms_by_degree, default):

        import numpy as np

        interaction_terms = []

        for term in self._terms:
            x_for_degree = x_terms_by_degree.get(term[0], default)
            a_for_degree = a_terms_by_degree.get(term[1], default)

            if not isinstance(x_for_degree[0],str):
                outer = np.outer(x_for_degree, a_for_degree)
            else:
                outer = np.char.array(x_for_degree)[:,None] + np.char.array(a_for_degree)

            interaction_terms += outer.T.reshape((1,-1)).squeeze().tolist()

        return interaction_terms

    def _predict_model(self, model, features):
        import numpy as np
        import scipy.sparse as sp

        if sp.issparse(model):
            return model.multiply(features).data.sum()
        else:
            return np.dot(model, features)

    def _update_model(self, model, features, value, importance):
        error = self._predict_model(model, features) - value
        return model - self._learning_rate*features*error*importance
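For intuition, a standalone sketch (dense toy vectors, not the class's exact code path) of the context/action interaction expansion that _featurize performs for interactions like 'a' and 'xa':

import numpy as np

x = np.array([1.0, 2.0])        # context features
a = np.array([0.5, 1.0, 3.0])   # action features

features_a  = a                          # 'a'  : action terms alone
features_xa = np.outer(x, a).ravel()     # 'xa' : every context*action product
features = np.concatenate([features_a, features_xa])
print(features)   # length 3 + 2*3 = 9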
Example #39
class QClassifierImpl:
    """
    A wrapper for a question classifier
    """
    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='qclassifier.log',
            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train using all of the given data
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions=self.extractor.questions)
        assert (len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get type for a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
Example #40
def load_data_v1(data_path):
    attr_name = [
        'taxi_id', 'point', 'duration', 'time', 'duration', 'distance'
    ]
    # Training set data
    train = pd.read_csv(os.path.join(data_path, 'train.txt'), header=None)
    train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6]]
    dataset = train.values[:, [0, 1, 2, 3, 4, 5]]
    print(train_set[0])

    # Test set data
    test = pd.read_csv(os.path.join(data_path, 'test.txt'), header=None)
    test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6]]
    print(test_set[0])

    # Store the test-set data, minus the last column, in a list of dicts, keyed by taxi ID
    samples = list()
    for sample in dataset:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            sample_dict[attr_name[index]] = attr
        samples.append(sample_dict)

    h = FeatureHasher(n_features=2048)
    h.fit(samples)

    # Convert the training set into x, y lists
    x_train = list()
    y_train = list()
    for sample in train_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_train.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_train.append(sample_dict)

    # Convert the test set into x, y lists
    x_test = list()
    y_test = list()
    for sample in test_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_test.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_test.append(sample_dict)

    x_train = h.transform(x_train).toarray()
    x_test = h.transform(x_test).toarray()
    print(x_train[0])
    print(x_test[0])
    print(x_train.shape)
    print(x_test.shape)

    y_train = np.asarray(y_train, dtype='int16')
    y_test = np.asarray(y_test, dtype='int16')

    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test, nb_classes)
    print(y_train.shape)
    print(y_test.shape)

    # return x_train, y_train, x_dev, y_dev, x_test
    return x_train, y_train, x_test, y_test, x_test
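One detail about the h.fit(samples) call above: FeatureHasher is stateless, so fit learns nothing and is kept only for estimator-API compatibility. A quick sketch with toy dicts showing that transforming without fitting yields the same matrix:

from sklearn.feature_extraction import FeatureHasher

d = [{'taxi_id': '7', 'distance': 3.2}]
h_unfitted = FeatureHasher(n_features=2048)
h_fitted = FeatureHasher(n_features=2048).fit(d)
assert (h_unfitted.transform(d) != h_fitted.transform(d)).nnz == 0   # identical matrices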
import apsw
c = apsw.Connection("../data/imdb.sqlite")
movie_data = c.cursor().execute("select * from movie_data").fetchall()
c.close()
del c
del apsw

X = [x.split(',') for (x, y) in movie_data]
y = [y for (x, y) in movie_data]
del movie_data

from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.neural_network import BernoulliRBM
thePipe = Pipeline([("hash", FeatureHasher(input_type="string")),
                    ('RBM', BernoulliRBM()), ('XGB', XGBRegressor())])

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

paramGrid = {
    'XGB__max_depth': [3],
    'XGB__n_estimators': [100],
    'RBM__n_components': [20],
    "hash__n_features": [100000]
}

theScorer = make_scorer(mean_squared_error, greater_is_better=False)

clf = GridSearchCV(thePipe,
Example #42
File: test.py Project: pay64k/_tools
print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
print()
#
# print("DictVectorizer")
# t0 = time()
# vectorizer = DictVectorizer()
# vectorizer.fit_transform(token_freqs(d) for d in raw_data)
# duration = time() - t0
# print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
# print("Found %d unique terms" % len(vectorizer.get_feature_names()))
# print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
Example #43
def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])
Example #44
    for pos in xrange(0, len(seq), size):
        yield seq[pos:pos + size]


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]

dataset = fetch_20newsgroups(subset='train', categories=categories)
classif_data = zip(dataset.data, dataset.target)
classes = np.array(list(set(dataset.target)))

hasher = FeatureHasher()
classifier = SGDClassifier()

for i, chunk in enumerate(chunker(classif_data, 100)):
    messages, topics = zip(*chunk)
    X = hasher.transform(token_freqs(msg) for msg in messages)
    y = np.array(topics)
    classifier.partial_fit(X, topics, classes=classes)
    if i % 100 == 0:
        # dump model to be able to monitor quality and later
        # analyse convergence externally
        joblib.dump(classifier, 'model_%04d.pkl' % i)
Example #45
def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    hasher = FeatureHasher()
    hasher.set_params(n_features=np.inf)
    with pytest.raises(TypeError):
        hasher.fit()
Example #46
def make_sparse_data(use_feature_hashing=False):
    """
    Function to create sparse data with two features always zero
    in the training set and a different one always zero in the
    test set
    """
    # Create training data
    X, y = make_classification(n_samples=500,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)

    # we need the features to be non-negative since we will be
    # using naive Bayes later
    X = np.abs(X)

    # make sure that none of the features are zero
    X[np.where(X == 0)] += 1

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # create a list of dictionaries as the features
    # with f1 and f5 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = [0] + row.tolist() + [0]
        features.append(dict(zip(feature_names, row)))

    # use a FeatureHasher if we are asked to do feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_sparse',
                          ids,
                          features=features,
                          labels=y,
                          vectorizer=vectorizer)

    # now create the test set with f4 always 0 but nothing else
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    # create a list of dictionaries as the features
    # with f4 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = row.tolist()
        row = row[:3] + [0] + row[3:]
        features.append(dict(zip(feature_names, row)))

    test_fs = FeatureSet('test_sparse',
                         ids,
                         features=features,
                         labels=y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
Example #47
0
File: cab_mlp.py  Project: guangxush/Trec
def save_epoch(nn_model, epoch):
    if not os.path.exists('models/'):
        os.makedirs('models/')
    nn_model.save_weights('models/weights_epoch_%d.h5' % epoch, overwrite=True)


def load_epoch(nn_model, epoch):
    assert os.path.exists('models/weights_epoch_%d.h5' %
                          epoch), 'Weights at epoch %d not found' % epoch
    nn_model.load_weights('models/weights_epoch_%d.h5' % epoch)


seed = 7
np.random.seed(seed)
h = FeatureHasher(n_features=2048)
vec = DictVectorizer()
le = preprocessing.LabelEncoder()
nb_epoch = 500
batch_size = 2048

attr_name = [
    'taxiID', 'point', 'time', 'dst', 'direc', 'distance', 'wth', 'FX'
]
train = pd.read_csv("train.txt", header=None)
train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(train_set[0])

test = pd.read_csv("test.txt")
test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(test_set[0])
Example #48
0
class CountMinSketch(object):
    """
    A class for counting hashable items using the Count-min Sketch strategy.
    It fulfills a similar purpose to `collections.Counter`.

    The Count-min Sketch is a randomized data structure that uses a constant
    amount of memory and has constant insertion and lookup times at the cost
    of an arbitrarily small overestimation of the counts.

    It has two parameters:
     - `m`, the size of the hash tables; larger implies smaller overestimation
     - `d`, the number of hash tables; larger implies a lower probability of
           overestimation.

    An example usage:

        from countminsketch import CountMinSketch
        sketch = CountMinSketch(1000, 10)  # m=1000, d=10
        sketch.add("oh yeah")
        sketch.add(tuple())
        sketch.add(1, value=123)
        print sketch["oh yeah"]       # prints 1
        print sketch[tuple()]         # prints 1
        print sketch[1]               # prints 123
        print sketch["non-existent"]  # prints 0

    Note that this class can be used to count *any* hashable type, so it's
    possible to "count apples" and then "ask for oranges". Validation is up to
    the user.
    """
    def __init__(self, m, samplesize, rs):
        """m: size of the hashed space, samplesize: dimensionality of the
        input samples (stored but not used by the current code),
        rs: offset used to seed the hashing / random projection.
        """
        if not m:
            raise ValueError("Table size (m) must be non-zero")

        self.n = 0
        self.m=m
        self.samplesize=samplesize
        self.rs=rs
        self.fh=FeatureHasher(self.m) #,alternate_sign=False

        row=[]
        col=[]
        data=[]
        #print indices
        for i in xrange(self.m):
            numpy.random.seed(i+(self.rs*10000))
            v=numpy.random.normal(0,1,self.m)
            v=numpy.multiply(sqrt(self.m),v)
            row.extend([idx for idx in xrange(self.m)])
            col.extend([i for idx in xrange(self.m)])
            data.extend(v)
        self.tables=csr_matrix ((data,(row,col)), shape=(self.m,self.m))
        #self.tables = numpy.matlib.zeros(shape=(m,samplesize))
        #self.tables=numpy.random.normal(size=(m,samplesize))
#        for _ in xrange(d):
#            table = array.array("d", (0.0 for _ in xrange(m)))
#            self.tables.append(table)
    def _old_hash(self, x):
        #x=x.reshape((x.shape[0],))
        #print x
        #hv=np.zeros((self.m,1))
        #print hv
        #print x.nonzero()[0]
        dict_feat={}
        for ind in x.nonzero()[0]:
            #print ind
            #print x[ind,0]
            dict_feat[str(ind+(self.rs*10000))]= x[ind,0]
            #md5 = hashlib.md5(str(hash(ind)))
            #md5.update(str((self.rs*10000)))
            #print int(md5.hexdigest(), 16) % self.m
            #hv[int(md5.hexdigest(), 16) % self.m]+= x[ind,0]
        #print dict_feat
        hashed_features = self.fh.transform([dict_feat]).todense().T
        #print hashed_features
        return hashed_features
    def _hash(self, x):
        #x=x.reshape((x.shape[0],))
        #print x
        hv = numpy.zeros((self.m, 1))
        #print hv
        #print x.nonzero()[0]
        for ind in x.nonzero()[0]:
            #print ind
            #print x[ind,0]
            md5 = hashlib.md5(str(hash(ind)))
            md5.update(str((self.rs*10000)))
            #print int(md5.hexdigest(), 16) % self.m
            hv[int(md5.hexdigest(), 16) % self.m]+= x[ind,0]
        return hv
    def transform(self, vector):
        #print "example size", vector.shape
        #print "transformation size", self.tables.shape
        #tables=csr_matrix ((self.m,self.samplesize))

        #num_cores = multiprocessing.cpu_count()
        indices=vector.nonzero()[0]
        #TODO hash the vector in a reduced space
        hv = self._hash(vector)
        #print hv


        # results = Parallel(n_jobs=num_cores)(delayed(processInput)(i,self.m,self.rs) for i in indices)
        # parrow = []
        # parcol = []
        # pardata = []
        # for (row,col,v) in results:
        #     parrow.extend(row)
        #     parcol.extend(col)
        #     pardata.extend(v)



        transformation= numpy.tanh(self.tables*hv)
        #print transformation.shape
        #assert(parrow==row)
        #assert(parcol==col)
        #assert(pardata==data)

        return transformation
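# Note: the docstring above describes the classic Count-min Sketch (table
# width m, d hash tables), while the class body actually builds a hashed
# random projection. For reference, a minimal sketch of the counting
# structure the docstring describes, assuming md5-based hashing in the same
# spirit as _hash() above:
import array
import hashlib


class SimpleCountMinSketch(object):
    def __init__(self, m, d):
        if not m or not d:
            raise ValueError("Table size (m) and amount of hash functions (d)"
                             " must be non-zero")
        self.m = m
        self.d = d
        self.n = 0  # total of all counts added so far
        self.tables = [array.array("l", (0 for _ in range(m)))
                       for _ in range(d)]

    def _hash(self, x):
        # derive d hash values for x by repeatedly updating an md5 digest
        md5 = hashlib.md5(str(hash(x)).encode("utf8"))
        for i in range(self.d):
            md5.update(str(i).encode("utf8"))
            yield int(md5.hexdigest(), 16) % self.m

    def add(self, x, value=1):
        self.n += value
        for table, i in zip(self.tables, self._hash(x)):
            table[i] += value

    def query(self, x):
        # never underestimates; overestimates by a small, bounded amount
        return min(table[i] for table, i in zip(self.tables, self._hash(x)))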
Example #49
0
warm_file = 'f:\\data\\avazu_ctr\\start.csv'
seed = int(3217)

#%%
###############################################################################
# Main
###############################################################################
chunk_size = int(4096)
header = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
          'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
          'device_ip', 'device_model', 'device_type', 'device_conn_type',
          'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

#preprocessing
preproc = Pipeline([('fh',
                     FeatureHasher(n_features=2**27,
                                   input_type='string',
                                   non_negative=False))])


#
def clean_data(data):
    y_train = data['click']  ##for Vowpal Wabbit
    data['app'] = data['app_id'].values + data['app_domain'].values + data[
        'app_category'].values
    data['site'] = data['site_id'].values + data['site_domain'].values + data[
        'site_category'].values
    data['device'] = data['device_id'].values + data[
        'device_ip'].values + data['device_model'].values + (
            data['device_type'].values.astype(str)) + (
                data['device_conn_type'].values.astype(str))
    data['type'] = data['device_type'].values + data['device_conn_type'].values
Example #50
0
def main():

    path = r'/Users/jlittler/Documents/Developer/python/mlenv/datasets/kaggle-avazu'
    train = pd.read_csv(os.path.join(path, 'train-10k.csv'))

    msk = np.random.rand(len(train)) < 0.8
    features = [
        3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
    ]

    # create a simple baseline method
    X_train = train[msk].iloc[:, features]
    X_test = train[~msk].iloc[:, features]
    y_train = train[msk].iloc[:, 1]
    y_test = train[~msk].iloc[:, 1]

    print('{:.2f}'.format(
        log_loss(y_test,
                 np.ones(len(y_test)) * y_train.mean())))

    # method 1 - encoding to ordinal values
    X_train_ordinal = X_train.values
    X_test_ordinal = X_test.values

    les = []
    l = LogisticRegression()
    r = RandomForestClassifier(n_estimators=25, max_depth=10)

    for i in range(X_train_ordinal.shape[1]):
        le = LabelEncoder()
        le.fit(train.iloc[:, features].iloc[:, i])
        les.append(le)
        X_train_ordinal[:, i] = le.transform(X_train_ordinal[:, i])
        X_test_ordinal[:, i] = le.transform(X_test_ordinal[:, i])

    l.fit(X_train_ordinal, y_train)
    y_pred = l.predict_proba(X_test_ordinal)
    print('{:.2f}'.format(log_loss(y_test, y_pred)))

    r.fit(X_train_ordinal, y_train)
    y_pred = r.predict_proba(X_test_ordinal)
    print('{:.2f}'.format(log_loss(y_test, y_pred)))

    # method 2 - one hot encoding
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(X_train_ordinal)
    X_train_onehot = enc.transform(X_train_ordinal)
    X_test_onehot = enc.transform(X_test_ordinal)

    l.fit(X_train_onehot, y_train)
    y_pred = l.predict_proba(X_test_onehot)
    print('{:.2f}'.format(log_loss(y_test, y_pred)))

    r.fit(X_train_onehot, y_train)
    y_pred = r.predict_proba(X_test_onehot)
    print('{:.2f}'.format(log_loss(y_test, y_pred)))

    # method 3 - group rare values
    X_train_rare = copy.copy(X_train)
    X_test_rare = copy.copy(X_test)
    X_train_rare["test"] = 0
    X_test_rare["test"] = 1

    temp_df = pd.concat([X_train_rare, X_test_rare], axis=0)
    names = list(X_train_rare.columns.values)

    for i in names:
        temp_df.loc[temp_df[i].value_counts()[temp_df[i]].values < 20,
                    i] = 'RARE_VALUE'

    for i in range(temp_df.shape[1]):
        temp_df.iloc[:, i] = temp_df.iloc[:, i].astype('str')

    X_train_rare = temp_df[temp_df['test'] == '0'].iloc[:, :-1].values
    X_test_rare = temp_df[temp_df['test'] == '1'].iloc[:, :-1].values

    for i in range(X_train_rare.shape[1]):
        le = LabelEncoder()
        le.fit(temp_df.iloc[:, :-1].iloc[:, i])
        les.append(le)
        X_train_rare[:, i] = le.transform(X_train_rare[:, i])
        X_test_rare[:, i] = le.transform(X_test_rare[:, i])

    enc.fit(X_train_rare)
    X_train_rare = enc.transform(X_train_rare)
    X_test_rare = enc.transform(X_test_rare)

    l.fit(X_train_rare, y_train)
    y_pred = l.predict_proba(X_test_rare)
    print(log_loss(y_test, y_pred))

    r.fit(X_train_rare, y_train)
    y_pred = r.predict_proba(X_test_rare)
    print(log_loss(y_test, y_pred))

    print(X_train_rare.shape)

    # method 4 - feature hashing
    X_train_hash = copy.copy(X_train)
    X_test_hash = copy.copy(X_test)

    for i in range(X_train_hash.shape[1]):
        X_train_hash.iloc[:, i] = X_train_hash.iloc[:, i].astype('str')

    for i in range(X_test_hash.shape[1]):
        X_test_hash.iloc[:, i] = X_test_hash.iloc[:, i].astype('str')

    h = FeatureHasher(n_features=100, input_type='string')
    X_train_hash = h.transform(X_train_hash.values)
    X_test_hash = h.transform(X_test_hash.values)

    l.fit(X_train_hash, y_train)
    y_pred = l.predict_proba(X_test_hash)
    print(log_loss(y_test, y_pred))

    r.fit(X_train_hash, y_train)
    y_pred = r.predict_proba(X_test_hash)
    print(log_loss(y_test, y_pred))
Example #51
0
         steps=[("imputer", SimpleImputer(strategy='mean')
                 ), ("scaler",
                     StandardScaler(with_mean=True, with_std=True))]),
     make_column_selector(dtype_include=['float', 'int'])),
    ("category",
     Pipeline(steps=[("imputer",
                      SimpleImputer(strategy='constant', fill_value='missing')
                      ), ("encoder", OneHotEncoder(handle_unknown="ignore"))]),
     make_column_selector(dtype_include='category')),
    (
        "high_cardinality",
        Pipeline(steps=[(
            "imputer",
            SimpleImputer(
                strategy='constant', fill_value='missing', missing_values=None)
        ), ("hasher", FeatureHasher(n_features=10, input_type='string'))]),
        make_column_selector(dtype_include='object'),
    )
],
                                 remainder='passthrough')

#xd = preprocessor.fit_transform(X, y)

# Classification Pipeline
classifier = Pipeline(steps=[('poly', PolynomialFeatures()), (
    'reductor', PCA()), ('selector', SelectFromModel(ExtraTreesClassifier())
                         ), ('estimator', RandomForestClassifier())])

# Main Pipeline
pipe = Pipeline(steps=[('preprocessor',
                        preprocessor), ('classifier', classifier)])
Example #52
0
def test():

    from sklearn.feature_extraction import FeatureHasher
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics

    feat = 7000
    # hasher for the dictionaries where we do not know the number of features
    h = FeatureHasher(n_features=feat)

    start_time = time()

    # hash of the list of the feature dictionaries of the each test directory
    TX = h.transform(pickle.load(open(test_dict_filename, 'rb'))).toarray()

    # appending the vector corresponding to the already found list of features for each file in the test directory
    TX = np.concatenate(
        (TX, np.array(pickle.load(open(test_feature_list_filename, 'rb')))),
        axis=1)

    # loading the categories for the test sets
    Ty = np.array(pickle.load(open(test_predict_filename, 'rb')))

    # load the saved model
    clf = pickle.load(open('modeldyn_parameters.sav', 'rb'))

    # predict the values for test data
    prediction_values = clf.predict(TX)

    # function to see if a class corresponds to benign binaries or malware
    f = lambda x: 1 if x > 0 else 0

    def fromiter(x):
        return np.fromiter((f(xi) for xi in x), x.dtype)

    # lump all malware predictions/categories into one
    prediction_values = fromiter(prediction_values)
    Ty = fromiter(Ty)

    # print statistics from the data
    print("features:", feat)
    print("accuracy:", metrics.accuracy_score(prediction_values, Ty))
    print("f1 score:", metrics.f1_score(prediction_values, Ty,
                                        average='micro'))
    print("precision score:",
          metrics.precision_score(prediction_values, Ty, average='micro'))
    print("recall score:",
          metrics.recall_score(prediction_values, Ty, average='micro'))
    print("f1 score (macro):",
          metrics.f1_score(prediction_values, Ty, average='macro'))
    print("precision score (macro):",
          metrics.precision_score(prediction_values, Ty, average='macro'))
    print("recall score (macro):",
          metrics.recall_score(prediction_values, Ty, average='macro'))

    # finding the number of wrong predictions
    mismatch = 0
    tot = prediction_values.shape[0]
    for i in range(tot):
        mismatch += 1 if prediction_values[i] != Ty[i] else 0

    print("mismatches:", mismatch)

    # printing the whole prediction array
    print("prediction is", prediction_values.tolist())

    # printing the whole category array
    print("y is", Ty.tolist())

    end_time = time()

    print('Testing complete in ' + str(end_time - start_time) + ' seconds')
Example #53
0
# In[5]:

#Feature Hashing
from sklearn.feature_extraction import FeatureHasher
X_train_hash = X_train.copy()
X_val_hash = X_val.copy()
X_test_hash = X_test.copy()
for i in range(X_train_hash.shape[1]):
    X_train_hash.iloc[:, i] = X_train_hash.iloc[:, i].astype('str')
for i in range(X_val_hash.shape[1]):
    X_val_hash.iloc[:, i] = X_val_hash.iloc[:, i].astype('str')
for i in range(X_test_hash.shape[1]):
    X_test_hash.iloc[:, i] = X_test_hash.iloc[:, i].astype('str')

# apply feature hashing to the string-encoded frames
h = FeatureHasher(n_features=10000, input_type="string")
X_train_hash = h.transform(X_train_hash.values)
X_val_hash = h.transform(X_val_hash.values)
X_test_hash = h.transform(X_test_hash.values)

# # Modeling

# In[9]:

# Import necessary packages
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import keras
import random as rn
Example #54
0
File: features.py  Project: ithjz/ember
def process_raw_features(self, raw_obj):
    # hash the raw string features into a fixed 128-dimensional vector
    exports_hashed = FeatureHasher(
        128, input_type="string").transform([raw_obj]).toarray()[0]
    return exports_hashed.astype(np.float32)
Example #55
0
    train_x[c] = le.transform(train_x[c])
    test_x[c] = le.transform(test_x[c])

# -----------------------------------
# feature hashing
# -----------------------------------
# load the data
train_x, test_x = load_data()
# -----------------------------------
from sklearn.feature_extraction import FeatureHasher

# loop over the categorical variables and apply feature hashing
for c in cat_cols:
    # FeatureHasher is used somewhat differently from the other encoders

    fh = FeatureHasher(n_features=5, input_type='string')
    # convert the column to strings before applying FeatureHasher
    hash_train = fh.transform(train_x[[c]].astype(str).values)
    hash_test = fh.transform(test_x[[c]].astype(str).values)
    # convert the hashed output to DataFrames
    hash_train = pd.DataFrame(hash_train.todense(), columns=[f'{c}_{i}' for i in range(5)])
    hash_test = pd.DataFrame(hash_test.todense(), columns=[f'{c}_{i}' for i in range(5)])
    # join back onto the original data frames
    train_x = pd.concat([train_x, hash_train], axis=1)
    test_x = pd.concat([test_x, hash_test], axis=1)

# drop the original categorical columns
train_x.drop(cat_cols, axis=1, inplace=True)
test_x.drop(cat_cols, axis=1, inplace=True)

# -----------------------------------
Example #56
0
from sklearn.preprocessing import OneHotEncoder
one = OneHotEncoder()

one.fit(X)
train = one.transform(X)

print('Train Data Set Has Got {} Rows and {} Columns'.format(
    train.shape[0], train.shape[1]))
# Train Data Set Has Got 300000 Rows and 316461 Columns

from sklearn.feature_extraction import FeatureHasher
X_train_hash = X.copy()
for c in X.columns:
    X_train_hash[c] = X[c].astype('str')
hashing = FeatureHasher(input_type='string')
train = hashing.transform(X_train_hash.values)
print('Train Data Set Has Got {} Rows and {} Columns'.format(
    train.shape[0], train.shape[1]))

X_train_stat = X.copy()
for c in X_train_stat.columns:
    if (X_train_stat[c].dtype == 'object'):
        X_train_stat[c] = X_train_stat[c].astype('category')
        counts = X_train_stat[c].value_counts()
        counts = counts.sort_index()
        counts = counts.fillna(0)
        counts += np.random.rand(len(counts)) / 1000
        X_train_stat[c].cat.categories = counts
print(X_train_stat.head(3))
Example #57
0
# (reconstructed signature -- the original function name is not shown in this snippet)
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1 - epsilon, pred)
    ll = sum(act * sp.log(pred) +
             sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll


# add two columns for hour and weekday
def dayhour(timestr):
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]


fh = FeatureHasher(n_features=2**20, input_type="string")

# Train classifier
clf = LassoLars()
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(
        pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)

# Create a submission file
Example #58
0
def predict_task(current_task, filename, data):
    start_time = time.time()

    # remove password
    current_task.update_state(state='PROGRESS',
                              meta="Removing password on " + filename)
    print("Predict: " + filename)
    input_pdf = io.BytesIO(base64.b64decode(data))
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file_name = temp_file.name
    with pikepdf.open(input_pdf) as pdf:
        pdf.save(temp_file)

    # extract features
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(temp_file_name, False, stats, current_task)
    temp_csv_file = tempfile.NamedTemporaryFile(delete=False)
    temp_csv_file_name = temp_csv_file.name
    pdfUtil.save_pdf_pages_tocsv(filename, pages, temp_csv_file_name)

    # prepare data
    markup_data = pd.read_csv(temp_csv_file_name)
    markup_data["HasCentLine"] = markup_data["HasCentLine"].astype(int)
    markup_data["HasComboLine"] = markup_data["HasComboLine"].astype(int)
    markup_data["IsMarkupField"] = markup_data["IsMarkupField"].astype(int)
    x = markup_data.drop(labels=[
        'FileName', 'PageNum', 'LineLeft', 'LineRight', 'LineTop',
        'LineBottom', 'Prefix', 'Suffix', 'FieldCode', 'FieldLeft',
        'FieldRight', 'FieldTop', 'FieldBottom', "IsMarkupField"
    ],
                         axis=1)
    transformer = ColumnTransformer(
        [("hash", FeatureHasher(n_features=2,
                                input_type='string'), 'TopElement')],
        remainder="passthrough")
    transformed_x = transformer.fit_transform(x)

    # Get the model's prediction
    current_task.update_state(state='PROGRESS',
                              meta="Get the model's prediction ")
    pdf_model = pickle.load(open("/app/ml_model/markup.pkl", "rb"))
    markup_data['IsMarkupField'] = pdf_model.predict_proba(transformed_x)[:, 1]

    # markup the PDF
    temp_output_file = tempfile.NamedTemporaryFile(delete=False)
    temp_output_file_name = temp_output_file.name
    pdfUtil.markup_pdf(markup_data, temp_file_name, temp_output_file_name)

    # return marked up PDF
    return_data = io.BytesIO()
    with open(temp_output_file_name, 'rb') as fo:
        return_data.write(fo.read())
    return_data.seek(0)

    # clean up
    temp_file.close()
    temp_csv_file.close()
    temp_output_file.close()
    os.remove(temp_file_name)
    os.remove(temp_csv_file_name)
    os.remove(temp_output_file_name)

    total_time = "total time spent: " + str(time.time() - start_time)
    current_task.update_state(state='PROGRESS', meta=total_time)
    print(total_time)

    return {
        'data': base64.b64encode(return_data.read()),
        'attachment_filename': filename,
        'mimetype': 'application/pdf'
    }
Example #59
0
def main(rawdata, rawtarget=pd.DataFrame(), train_test_flag='train'):
    raw_target = rawtarget
    raw_data = rawdata
    raw_data['date_recorded'] = pd.to_datetime(
        raw_data['date_recorded']).apply(lambda x:
                                         (datetime.datetime.today() - x).days)
    numeric_cols = [
        c for c in raw_data.columns
        if raw_data[c].dtype in ['int64', 'float64']
        and c not in ['region_code', 'district_code']
    ]
    cat_cols = [c for c in raw_data.columns if c not in numeric_cols]

    # sns.pairplot(raw_data.merge(raw_target)[numeric_cols+['status_group']].iloc[:,1:], hue="status_group",diag_kind='hist',plot_kws= {'alpha': 0.5})
    # plt.show()

    # Categorical columns cardinality
    print("\n# Unique values in each categorical column:\n",
          raw_data[cat_cols].nunique(axis=0))

    # No. of unknown categories
    (raw_data[cat_cols] == 'unknown').sum()
    raw_data[cat_cols] = raw_data[cat_cols].replace('unknown', np.nan)

    # Deleting unneeded columns
    to_be_del = [
        'waterpoint_type_group', 'source_type', 'quantity_group',
        'quality_group', 'payment_type', 'management_group',
        'extraction_type_class', 'extraction_type_group', 'scheme_name',
        'recorded_by', 'region', 'scheme_management'
    ]

    raw_data = raw_data.drop(to_be_del, axis=1)

    # % of missing values per column
    print("\nMissing value % \n", (raw_data.isna().sum() * 100 /
                                   len(raw_data)).sort_values(ascending=False))
    # (raw_data.isna().sum()*100/len(raw_data)).sort_values(ascending=False).plot(kind='bar')
    # plt.xticks(rotation=45)
    # plt.show()

    # Columns without missing values are hash encoded in bulk
    # Rest of the columns are individually hash encoded
    # This is done to preserve nan's across encoding in order to perform imputation later.

    print("Encoding categorical features..")

    ohc = ['public_meeting', 'permit', 'source_class']
    hashc_ind = [
        'payment', 'installer', 'funder', 'public_meeting', 'permit',
        'water_quality', 'quantity', 'management', 'subvillage',
        'source_class', 'source'
    ]
    hashc0 = ['district_code', 'region_code', 'ward', 'wpt_name',
              'lga']  # 1024 bit encoding
    hashc1 = ['extraction_type', 'waterpoint_type']  # 32 bit encoding
    hashc2 = ['basin']  # 8 bit encoding

    # One hot encoding on binary categorical data
    oh = []
    for oc in ohc:
        ohe = OneHotEncoder(drop='first')
        enc = enc_with_na(raw_data[[oc]], ohe, 1)
        oh.append(enc)
    oh = np.hstack(oh).astype(np.int8)

    # Hash encoding on the rest

    # Individual hashing
    #n_feats_ind = [4, 128, 128, 6, 2, 8, 1024, 8]
    n_feats_ind = [4, 16, 16, 6, 2, 8, 64, 8]
    n_feats_ind = [4, 8, 8, 4, 2, 4, 32, 4]
    hashed_ind = []
    for hc, n in zip(hashc_ind, n_feats_ind):
        h = FeatureHasher(n_features=n,
                          input_type='string',
                          alternate_sign=False)
        enc = enc_with_na(raw_data[[hc]], h, n)
        hashed_ind.append(enc)
    hashed_ind = np.hstack(hashed_ind).astype(np.int8)

    # Collective hashing
    hash_cols_list = [hashc0, hashc1, hashc2]
    #n_feats = [1024, 32, 8]
    n_feats = [64, 16, 8]
    n_feats = [32, 8, 4]
    hashed = []
    for hc, n in zip(hash_cols_list, n_feats):
        h = FeatureHasher(n_features=n,
                          input_type='string',
                          alternate_sign=False)
        enc = enc_with_na(raw_data[hc], h, n)
        hashed.append(enc)
    hashed = np.hstack(hashed).astype(np.int8)

    print("Encoding complete..")

    print("Preparing to write data to disk..")
    raw_data_encoded = pd.concat([
        raw_data.drop(ohc + hashc_ind + hashc0 + hashc1 + hashc2, axis=1),
        pd.DataFrame(np.hstack([oh, hashed_ind, hashed]))
    ],
                                 axis=1)
    raw_data_encoded.to_csv(os.path.join(DATA_DIR, train_test_flag +
                                         '_data_encoded.csv'),
                            header=True,
                            index=False)
    print("Written encoded data to disk..")
orltest=pd.read_csv('D://frad_test.csv')
orldata=orldata.append(orltest)
del orltest
feature=orldata.columns.values.tolist()
orldata.astype(object)
orldata.dtypes.value_counts()

sample=orldata.iloc[0:100,:]



from sklearn.feature_extraction import FeatureHasher

bin_columns_name=['pkgname','ver','adunitshowid','mediashowid','apptype','city','reqrealip','idfamd5','openudidmd5','model','make','osv']
for i in bin_columns_name:
    fh = FeatureHasher(n_features=5, input_type='string')
    orldata[i]=orldata[i].astype('str')
    hashed_features = fh.fit_transform(orldata[i])
    hashed_features = hashed_features.toarray()
    hashed_features=pd.DataFrame(hashed_features)
    hashed_features.columns=[i+'0',i+'1',i+'2',i+'3',i+'4']
    orldata=orldata.join(hashed_features)
    orldata=orldata.drop(columns=i)
    
oh_columns=['os','lan']
orldata_oh=pd.get_dummies(orldata[oh_columns].astype('object'))
orldata_oh=orldata_oh.reset_index(drop=True)
orldata=orldata.join(orldata_oh)
#
#
orldata=orldata.drop(columns=oh_columns)