def process_records(records, fields, target, textmodel=None):
    tokenize = CountVectorizer().build_analyzer()

    input = None
    X = None
    y_labels = []

    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))

        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())

        if strs:
            if input is None:
                input = StringIO.StringIO()
            print >> input, " ".join(tokenize(" ".join(strs)))
        if nums:
            if X is None:
                X = sp.lil_matrix((len(records), len(nums)))
            X[i] = np.array(nums, dtype=np.float64)

    if input is not None:
        if X is not None:
            X_2 = X.tocsr()
        else:
            X_2 = None

        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(input)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(input)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                input.seek(0)
                X = hasher.transform(tokenize(line.strip()) for line in input)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            X = corpus2csc(textmodel[corpus], num_terms).transpose()

        if X_2 is not None:
            # print >> sys.stderr, "X SHAPE:", X.shape
            # print >> sys.stderr, "X_2 SHAPE:", X_2.shape
            X = sp.hstack([X, X_2], format='csr')

    elif X is not None:
        textmodel = None
        X = X.tocsr()

    print >> sys.stderr, "X SHAPE:", X.shape

    return X, y_labels, textmodel
def to_ffm(df, outfile, ycol, num_columns=[]):
    df = df.copy()
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    bs = 2**10
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    with open(outfile, 'w') as out:
        pb = progressbar.ProgressBar(maxval=(df.shape[0]+bs+1) // bs).start()
        for i in xrange((df.shape[0]+bs+1) // bs):
            pb.update(i)
            s = slice(i*bs, (i+1)*bs)
            if ycol in df.columns:
                Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = df.iloc[s][ycol].values.astype('int')
            else:
                Xh = np.asarray(df.iloc[s].drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = np.zeros((bs,))
            Xt = scipy.sparse.hstack([Xv, hasher.transform(Xh)]).tocsr()
            for j in xrange(Xt.shape[0]):
                span = slice(Xt.indptr[j], Xt.indptr[j+1])
                row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
                st = " ".join(value_pattern % (j + one_based, fe + one_based, x)
                              for j, fe, x in row if np.isnan(x) == False)
                feat = (y[j], st)
                out.write((line_pattern % feat).encode('ascii'))
        pb.finish()
class QClassifierImpl:
    """
    A wrapper for question classifier
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='qclassifier.log',
                            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions=self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get type for a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
class HashSarca(Sarcalingua):

    def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
        self.featureExtractor = FeatureHasher(pow(2, nbits), input_type="pair")
        self.classifier = model
        self.outEncoder = LabelEncoder()
        self.drop_outs = set((
            u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
            u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
            u"uncyclopedia", u"wikipedia"))

    def extractFeatures(self, clean_text):
        return self.featureExtractor.transform((token_pattern.finditer(clean_text),))

    def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):

        def prepare(raw_text):
            tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
            if random.random() < 0.5:   # we delete the drop-outs half the time
                tokens = [tok for tok in tokens if tok not in self.drop_outs]
            try:
                alpha = 1. / len(tokens)  # 1./(1+log(len(tokens)))
                return ((tok.lower(), alpha) for tok in tokens)
            except ZeroDivisionError:
                return tuple()

        for chunk in chunkIterator:
            X = self.featureExtractor.transform(imap(prepare, chunk.text))
            y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
            yield X, y
            gc.collect()
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
def hash(data, labels, new_dimension):
    print "start hashing trick..."

    # convert features as dict
    dictList = list()
    if hasattr(data, "indices"):
        # ind = data.indices
        # dat = data.data
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            zipped = zip(indices, item)
            row = dict()
            for index, value in zipped:
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)
        a = 234
    else:
        indices = map(str, range(len(data[0])))
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # , input_type='dict'
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
def io():
    hv = FeatureHasher()
    target = []
    train_int = []
    train_label = []
    for iline in dio.io():
        iline = iline.strip().split(',')
        t = int(iline[0])
        int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
        label_fs = [k for k in iline[14:]]
        # label_fs = ",".join(iline[14:])
        # print int_fs, label_fs
        target.append(t)
        train_int.append(int_fs)
        train_label.append({k: 1 for k in label_fs if k})
    # print train_int
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_int = imp.fit_transform(train_int)
    # print train_int
    scaler = preprocessing.StandardScaler().fit(train_int)
    train_int = scaler.transform(train_int)
    # print train_int
    train_int = csr_matrix(train_int)
    # print train_label
    train_label = hv.transform(train_label)
    train = hstack((train_int, train_label))
    # print train_label
    # print train
    return target, train
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
                                     {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
def hash(mat, num_features):
    """ hashing trick """
    hasher = FeatureHasher(n_features=num_features, non_negative=True)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
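# Hedged usage sketch for the wrapper above (data and shapes invented for illustration):
# with FeatureHasher's default input_type='dict', `mat` is expected to be an iterable of
# feature dicts. Note that non_negative was deprecated in scikit-learn 0.19 and removed
# in 0.21, so this wrapper assumes an older release.
if __name__ == "__main__":
    sample_rows = [{"colour": 1.0, "size": 2.0}, {"colour": 3.0}]
    dense = hash(sample_rows, num_features=8)
    print(dense.shape)  # (2, 8): one row per input dict, num_features columns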
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""
    global _model
    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2**16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
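# Hedged sketch (not part of the original module): `_features` above is assumed to yield
# string feature names for token i, with a little local context, in the style of simple
# CoNLL baseline taggers. A minimal stand-in could look like this.
def _features(tokens, i):
    word = tokens[i]
    yield "word=" + word.lower()
    yield "isupper=" + str(word[:1].isupper())
    yield "prev=" + (tokens[i - 1].lower() if i > 0 else "<s>")
    yield "next=" + (tokens[i + 1].lower() if i + 1 < len(tokens) else "</s>")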
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)
    gen = (d.iteritems() for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
def dump_libffm_format(X, y, f):
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    Xt = hasher.transform(X)
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    for i in xrange(Xt.shape[0]):
        span = slice(Xt.indptr[i], Xt.indptr[i+1])
        row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
        s = " ".join(value_pattern % (j + one_based, fe, x) for j, fe, x in row)
        feat = (y[i], s)
        f.write((line_pattern % feat).encode('ascii'))
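# Hedged usage sketch (names and data invented): X is assumed to be a sequence of string
# token lists (matching input_type='string'), y a parallel label sequence, and f a binary
# file object, since each formatted line is encoded to ASCII before being written.
if __name__ == "__main__":
    X_demo = [["user_123", "ad_7"], ["user_456", "ad_9"]]
    y_demo = [1, 0]
    with open("demo.ffm", "wb") as fout:
        dump_libffm_format(X_demo, y_demo, fout)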
def encode_titles(titles, num_features=2**14):
    '''
    Encode the titles formatted as a string as numerical values using the
    'hashing trick'. The size of the feature vector can be specified using
    the num_features parameter.
    '''
    myHasher = FeatureHasher(input_type='string',
                             n_features=num_features,
                             non_negative=True)
    featureMatrix = myHasher.transform(titles)
    return featureMatrix, myHasher
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id == True:
            temp_features.append(("id_" + str(arm_id), 1))
        all_features.append(temp_features)
    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)
def predictUserScore(self, body, tags, fgen, users):
    featureHasher = FeatureHasher(n_features=fgen.getMaxDimSize() + 4, input_type='pair')

    # document features
    featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]

    # additional features
    featureVector.append(("Length", 1))
    featureVector.append(("Score", 1))
    featureVector.append(("Accepted", 1))
    featureVector.append(("OwnerRep", 1))

    X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])

    scores = [score for index, score in enumerate(self.cf.decision_function(X)[0])
              if int(self.cf.classes_[index]) in users]
    return scores
def hash_array(feature_dict, feature_num):
    # print feature_dict[0]
    if feature_num == 1:
        x_new = np.asarray(feature_dict)
        x_h = x_new.reshape(len(feature_dict), 1)
    else:
        hasher = FeatureHasher(n_features=feature_num, non_negative=True, input_type='dict')
        X_new = hasher.fit_transform(feature_dict)
        x_h = X_new.toarray()
    # vec = DictVectorizer()
    # x_h = vec.fit_transform(feature_dict).toarray()
    # print x_h.shape, type(x_h)
    return x_h
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the tab-separated parts are usually tokens (and maybe things like
    part of speech tags) rather than feature vectors, a function must be
    supplied that does the actual feature extraction. This function has
    access to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick")
    is used to map symbolic input feature names to columns, so this function
    does not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
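# Hedged usage sketch. The feature callable below is only an assumption about the
# expected interface (a token list plus an index, returning an iterable of strings);
# the file name in the commented call is invented.
def simple_features(tokens, i):
    yield "word=" + tokens[i].lower()
    yield "suffix3=" + tokens[i][-3:].lower()

# X, y, lengths = load_conll("ner_train.conll", simple_features)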
def predictUsers(self, body, tags, fgen, n=3):
    featureHasher = FeatureHasher(n_features=fgen.getMaxDimSize() + 4, input_type='pair')

    # document features
    featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]

    # additional features
    featureVector.append(("Length", 1))
    featureVector.append(("Score", 1))
    featureVector.append(("Accepted", 1))
    featureVector.append(("OwnerRep", 1))

    X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])

    userIds = [int(self.cf.classes_[index])
               for index, score in sorted(enumerate(self.cf.decision_function(X)[0]),
                                          key=lambda x: x[1], reverse=True)][:n]
    # print(userIds)
    # print(self.cf.predict(X))
    return [Users.get(Users.id == userId) for userId in userIds]
def initialize(self):
    if self.model_class == 'scikit':
        self.model = SGDRegressor(loss='squared_loss', alpha=0.1,
                                  n_iter=10, shuffle=True, eta0=0.0001)
        self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64,
                                                 non_negative=False, input_type='dict')
    elif self.model_class == 'lookup':
        self.model = {}
def train(self, corpus, classes=None, chunk_size=100000):
    self.vectorizer = FeatureHasher(non_negative=True,
                                    n_features=len(classes) * 2 * self.window,
                                    input_type='pair')
    self.clf = MultinomialNB()
    i = 0
    j = 0
    X = []
    Y = []
    for x, y in corpus:
        if x[self.window][1] in self.input_classes:
            X.append(x)
            Y.append(y)
            i += 1
        if i < chunk_size:
            continue
        j += 1
        click.echo("Running iteration {}".format(j))
        X = self.vectorizer.transform(X)
        self.clf.partial_fit(X, Y, classes)
        X = []
        Y = []
        i = 0
class Model:

    def __init__(self, numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        # Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate,
                                        n_iter=numEpochs, shuffle=mustShuffle)

    def train(self, gen, v=False):
        i = 0
        for x, y in gen:  # For each batch
            xHash = self.FH.transform(x)  # hash trick
            y = np.array(y)
            ## for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0, 1])
            i += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", i)

    def test(self, gen, v=False):
        # init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        # Get prediction for each batch
        i = 0
        for x, y in gen:
            xHash = self.FH.transform(x)  # hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T  # Keep column corresponding to probability of class 1
            # Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", i)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction

    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch

    def score(self, target, prediction):
        return llfun(target, prediction)
def process(self):
    header = self.inputFile.readline()
    ids = []
    self.features = []
    count = 0
    for line in self.inputFile:
        count += 1
        fields = line.split(',')
        id = fields[0]
        names = {}
        name = Kaggle_Grupo.Utils.StringNormalize(fields[1])
        for i in name.split(' '):
            names[i] = 1
        ids.append(id)
        self.features.append(names)

    featureHasher = FeatureHasher(n_features=2**12, dtype=np.uint16)
    self.features = featureHasher.transform(self.features)
    self.features = self.features.toarray()
    self.features = self.encode(width=24)

    headerFields = ["Cliente_ID"]
    for i in range(self.features.shape[1]):
        headerFields.append('ClientName_{}'.format(i))
    headerFields = "\t".join(headerFields)
    self.outputFile.write(headerFields + '\n')

    for i in range(self.features.shape[0]):
        self.outputFile.write('{}\t{}\n'.format(
            ids[i],
            '\t'.join(self.features[i].astype('str')).replace('False', '0').replace('True', '1')))
def test_feature_hasher_pairs_with_string_values():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
                                       {"baz": u"abc", "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 1], x1_nz)
    assert_equal([1, 1, 4], x2_nz)

    raw_X = (iter(d.items()) for d in [{"bax": "abc"}, {"bax": "abc"}])
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = np.abs(x1[x1 != 0])
    x2_nz = np.abs(x2[x2 != 0])
    assert_equal([1], x1_nz)
    assert_equal([1], x2_nz)
    assert_equal(x1, x2)
def gen_cinput(origindata, pooldata=[], threshold=5):
    origin_feas = gen_feature_data(origindata)
    pool_feas = gen_feature_data(pooldata)
    feas_X = []
    label_Y = []
    s = set()
    for seq in origin_feas:
        feas_X.extend([item["F"] for item in seq])
        for item in seq:
            s.update(item["F"])
        label_Y.extend([item["L"] for item in seq])
    assert len(feas_X) == len(label_Y)
    print "original data data num : " + str(len(feas_X))

    feas_X_2 = []
    label_Y_2 = []
    for seq_id, seq in enumerate(pool_feas):
        for token_id, token in enumerate(seq):
            if pooldata[seq_id][2][token_id] == 1:
                feas_X_2.append(token["F"])
                s.update(token["F"])
                label_Y_2.append(token["L"])
    print "pool data data num : " + str(len(feas_X_2))
    print "original feature num ................ " + str(len(s))

    X = feas_X + feas_X_2
    X = featurefilter(X, threshold)
    print X[:2]
    Y = label_Y + label_Y_2
    h = FeatureHasher(input_type="string", non_negative=True)
    X = h.transform(X)
    return X, Y, h
def test_feature_hasher_strings():
    raw_X = [[u"foo", "bar", "baz", "foo"],  # note: duplicate
             [u"bar", "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, sum(len(set(x)) for x in raw_X))
def learn(self, fgen, postLimit=None): Parent = Posts.alias() query = Posts.select().join(Parent, on=(Posts.parentid == Parent.id)).where(Posts.posttypeid == 2 & Parent.forevaluation == 0) if postLimit is not None: query = query.limit(postLimit) count = query.count() print("Learning {0} questions".format(count)) allClasses = numpy.array([user.id for user in Users.select()]) maxUserRep = float(Users.select(peewee.fn.Max(Users.reputation)).scalar()) featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair') featureMatrix = [] classList = [] for i, answer in enumerate(query): if answer.owneruserid is None: continue print("Generating feature vector for id {0}".format(answer.id)) # docment features # featureVector = fgen.getDocumentFeatures(answer.parentid.title + answer.parentid.body + answer.body, tagIds) featureVector = fgen.getAnswerFeatures(answer) featureVector = [(str(dim), value) for dim, value in featureVector] # additional features maxScore = Posts.select(peewee.fn.Max(Posts.score)).where(Posts.parentid == answer.parentid).scalar() maxLength = max(len(post.body) for post in Posts.select().where(Posts.parentid == answer.parentid)) featureVector.append(("Length", (len(answer.body)/float(maxLength)))) featureVector.append(("Score", 1 if maxScore == 0 else (answer.score/float(maxScore)))) featureVector.append(("Accepted", 1 if answer.id == answer.parentid.acceptedanswerid else 0)) featureVector.append(("OwnerRep", answer.owneruserid.reputation/maxUserRep)) featureMatrix.append(featureVector) classList.append(answer.owneruserid.id) if len(featureMatrix) == self.batchSize or i == count-1: print("Partial fitting classifier".format(answer.id)) X = featureHasher.transform(featureMatrix) Y = numpy.array(classList) self.cf.partial_fit(X, Y, classes=allClasses) allClasses = None featureMatrix = [] classList = []
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
parser = argparse.ArgumentParser("get windows object vectors for files")
parser.add_argument("--malware_paths", default=None,
                    help="Path to malware training files")
parser.add_argument("--benignware_paths", default=None,
                    help="Path to benignware training files")
parser.add_argument("--scan_file_path", default=None,
                    help="File to scan")
parser.add_argument("--evaluate", default=False, action="store_true",
                    help="Perform cross-validation")

args = parser.parse_args()
hasher = FeatureHasher(20000)

if args.malware_paths and args.benignware_paths and not args.evaluate:
    train_detector(args.benignware_paths, args.malware_paths, hasher)
elif args.scan_file_path:
    scan_file(args.scan_file_path)
elif args.malware_paths and args.benignware_paths and args.evaluate:
    X, y = get_training_data(args.benignware_paths, args.malware_paths, hasher)
    cv_evaluate(X, y, hasher)
else:
    print "[*] You did not specify a path to scan," \
          " nor did you specify paths to malicious and benign training files" \
          " please specify one of these to use the detector.\n"
    parser.print_help()
class ActorCriticLearner(WhenLearner): def __init__(self, gamma=0.9, lr=1e-3, state_size=1000, action_size=1000, hidden_size=200): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.gamma = gamma self.lr = lr self.state_size = state_size self.action_size = action_size self.hidden_size = hidden_size self.state_hasher = FeatureHasher(n_features=self.state_size) self.action_hasher = FeatureHasher(n_features=self.action_size) self.value_net = ACValueNet(self.state_size, self.hidden_size) self.action_net = ACActionNet(self.action_size, self.hidden_size, self.hidden_size) params = (list(self.value_net.parameters()) + list(self.action_net.parameters())) self.optimizer = torch.optim.Adam(params, lr=self.lr) def gen_state_vector(self, state: dict) -> np.ndarray: state = {str(a): state[a] for a in state} return self.state_hasher.transform([state]).toarray() def gen_action_vectors(self, actions: Collection[Activation]) -> np.ndarray: action_dicts = [] for action in actions: act_d = {} name = action.get_rule_name() act_d['rulename'] = name bindings = action.get_rule_bindings() for a, v in bindings.items(): if isinstance(v, bool): act_d[str(a)] = str(v) else: act_d[str(a)] = v action_dicts.append(act_d) return self.action_hasher.transform(action_dicts).toarray() def eval_all(self, state: dict, actions: Collection[Activation]) -> Collection[float]: pass def eval(self, state: dict, action: Activation) -> float: if state is None: return 0 state_x = torch.from_numpy(self.gen_state_vector(state)).float().to( self.device) action_x = torch.from_numpy(self.gen_action_vectors( [action])).float().to(self.device) with torch.no_grad(): state_val, state_hidden = self.value_net(state_x) action_val = self.action_net(action_x, state_hidden) return action_val[0].cpu().item() def update( self, state: dict, action: Activation, reward: float, next_state: dict, next_actions: Collection[Activation], ) -> None: return sa = self.generate_vector(state, action) if len(next_actions) == 0: next_sa = None else: next_sa = np.stack((self.generate_vector(next_state, next_actions[i]) for i in range(len(next_actions)))) # print("REWARD") # print(reward) # print("NEXT SAs") # print(next_sa.shape) # print() self.replay_memory.push( torch.from_numpy(sa).float().to(self.device), torch.tensor([reward]).to(self.device), torch.from_numpy(next_sa).float().to(self.device)) self.train()
def __init__(self, verbose, min_label_count=1, inference=False):
    self.fh = FeatureHasher(dtype='float32')
    self.verbose = verbose
    self.inference = inference
    self.min_label_count = min_label_count
    # Extract the subject & body
    ('HeadlineBodyFeatures', HeadlineBodyFeaturesExtractor()),

    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling punctuation feature from articles
            # Using FeatureHasher for both headline and the body
            ('punct_stats_headline', Pipeline([
                ('selector', ItemSelector(key='headline')),
                ('stats', Punct_Stats()),
                ('vect', FeatureHasher(10)),
            ])),

            ('punct_stats_body', Pipeline([
                ('selector', ItemSelector(key='article_body')),
                ('stats', Punct_Stats()),
                ('vect', FeatureHasher(10)),
            ])),
        ],
    )),

    # Use a Bernoulli Naive Bayes classifier as the Baseline Model
    ('clf', BernoulliNB()),
])

# Fitting the pipeline to the training text and labels
pipeline.fit(train_texts, train_labels)
def make_classification_data(num_examples=100, train_test_ratio=0.5, num_features=10, use_feature_hashing=False, feature_bins=4, num_labels=2, empty_labels=False, feature_prefix='f', class_weights=None, non_negative=False, one_string_feature=False, num_string_values=4, random_state=1234567890): # use sklearn's make_classification to generate the data for us num_numeric_features = (num_features - 1 if one_string_feature else num_features) X, y = make_classification(n_samples=num_examples, n_features=num_numeric_features, n_informative=num_numeric_features, n_redundant=0, n_classes=num_labels, weights=class_weights, random_state=random_state) # if we were told to only generate non-negative features, then # we can simply take the absolute values of the generated features if non_negative: X = abs(X) # since we want to use SKLL's FeatureSet class, we need to # create a list of IDs ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)] # create a string feature that has four possible values # 'a', 'b', 'c' and 'd' and add it to X at the end if one_string_feature: prng = RandomState(random_state) random_indices = prng.random_integers(0, num_string_values - 1, num_examples) possible_values = [chr(x) for x in range(97, 97 + num_string_values)] string_feature_values = [possible_values[i] for i in random_indices] string_feature_column = np.array(string_feature_values, dtype=object).reshape(100, 1) X = np.append(X, string_feature_column, 1) # create a list of dictionaries as the features feature_names = [ '{}{:02d}'.format(feature_prefix, n) for n in range(1, num_features + 1) ] features = [dict(zip(feature_names, row)) for row in X] # split everything into training and testing portions num_train_examples = int(round(train_test_ratio * num_examples)) train_features, test_features = (features[:num_train_examples], features[num_train_examples:]) train_y, test_y = y[:num_train_examples], y[num_train_examples:] train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:] # are we told to generate empty labels train_labels = None if empty_labels else train_y test_labels = None if empty_labels else test_y # create a FeatureHasher if we are asked to use feature hashing # with the specified number of feature bins vectorizer = (FeatureHasher( n_features=feature_bins) if use_feature_hashing else None) train_fs = FeatureSet('classification_train', train_ids, labels=train_labels, features=train_features, vectorizer=vectorizer) if train_test_ratio < 1.0: test_fs = FeatureSet('classification_test', test_ids, labels=test_labels, features=test_features, vectorizer=vectorizer) else: test_fs = None return (train_fs, test_fs)
    # 'clf__eta0': (0.0001, 0.00001, 0.000001),
    # 'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (1000, 5000, 8000, 10000),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, cv=4)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

for number_hashing_features in list_number_hashing_features:
    print("Number of hashing features : %d" % number_hashing_features)
    data = original_data
    hasher = FeatureHasher(n_features=number_hashing_features, input_type='string')
    for column_name in list_hash_columns:
        data = hashing_data(data, column_name, hasher, number_hashing_features)
    data = extract_time_stamp_feature(data)
    X_data, Y_data = split_X_and_Y(data)

    grid_search.fit(X_data, Y_data['click'])

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    print(grid_search.grid_scores_)
    for param_name in sorted(parameters.keys()):
def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    X = FeatureHasher().transform([{"foo": 0}])
    assert X.data.shape == (0,)
class RegCBLearner(Learner): """A learner using the RegCB algorithm by Foster et al. and the online bin search implementation by Bietti et al. References: Foster, Dylan, Alekh Agarwal, Miroslav Dudík, Haipeng Luo, and Robert Schapire. "Practical contextual bandits with regression oracles." In International Conference on Machine Learning, pp. 1539-1548. PMLR, 2018. Bietti, Alberto, Alekh Agarwal, and John Langford. "A contextual bandit bake-off." arXiv preprint arXiv:1802.04064 (2018). """ @property def family(self) -> str: """The family of the learner. See the base class for more information """ return f"RegCB" @property def params(self) -> Dict[str, Any]: """The parameters of the learner. See the base class for more information """ dict = {'beta': self._beta, 'alpha': self._alpha, 'interactions': self._interactions} return dict def __init__(self, *, beta: float, alpha: float, learning_rate:float=0.1, interactions: Sequence[str] = ['a', 'ax']) -> None: """Instantiate a RegCBLearner. Args: beta : square-loss tolerance alpha: confidence bounds precision interactions: the set of interactions the learner will use. x refers to context and a refers to actions, e.g. xaa would mean interactions between context, actions and actions. """ PackageChecker.sklearn("RegCBLearner") from sklearn.feature_extraction import FeatureHasher from sklearn.preprocessing import PolynomialFeatures self._beta = beta self._alpha = alpha self._iter = 0 self._core_model = [] self._times = [0,0,0,0] self._interactions = interactions self._terms = [] self._learning_rate = learning_rate for term in self._interactions: term = term.lower() x_num = term.count('x') a_num = term.count('a') if x_num + a_num != len(term): raise Exception("Letters other than x and a were passed for parameter interactions. Please remove other letters/characters.") self._terms.append((x_num, a_num)) max_x_term = max(max(term[0] for term in self._terms),1) max_a_term = max(max(term[1] for term in self._terms),1) self._x_p = PolynomialFeatures(degree=max_x_term, include_bias=False, interaction_only=False) self._a_p = PolynomialFeatures(degree=max_a_term, include_bias=False, interaction_only=False) self._h = FeatureHasher(input_type='pair') def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]: """Determine a PMF with which to select the given actions. Args: key: The key identifying the interaction we are choosing for. context: The context we're currently in. See the base class for more information. actions: The actions to choose from. See the base class for more information. Returns: The probability of taking each action. See the base class for more information. """ import numpy as np from scipy import sparse if self._iter == 0: if isinstance(context,dict) or isinstance(actions[0],dict): self._core_model = sparse.csr_matrix(self._featurize(context, actions[0]).shape) else: self._core_model = np.zeros(self._featurize(context, actions[0]).shape) if self._iter == 200: self._times = [0,0,0,0] if (self._iter < 200): return [1/len(actions)] * len(actions) else: maxScore = -float('inf') maxAction = None for action in actions: features = self._featurize(context,action) score = self._bin_search(features, len(actions)) if score > maxScore: maxAction = action maxScore = score return [int(action == maxAction) for action in actions] def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None: """Learn from the given interaction. 
Args: key: The key identifying the interaction this observed reward came from. context: The context we're learning about. See the base class for more information. action: The action that was selected in the context. See the base class for more information. reward: The reward that was gained from the action. See the base class for more information. probability: The probability that the given action was taken. """ start = time.time() features = self._featurize(context, action) self._core_model = self._update_model(self._core_model, features, reward, 1) self._times[2] += time.time()-start self._iter += 1 # if (self._iter-200-1) % 50 == 0 and self._iter > 200: # print(f'avg phi time: {round(self._times[0]/(self._iter-200),2)}') # print(f'avg bin time: {round(self._times[1]/(self._iter-200),2)}') # print(f'avg lrn time: {round(self._times[2]/(self._iter-200),2)}') def _bin_search(self, features, K_t) -> float: start = time.time() y_u = 2 w = 1 f_u_a_w = self._update_model(self._core_model, features, y_u, w) f_x_t_a = self._predict_model(self._core_model, features) s_u_a = (self._predict_model(f_u_a_w, features) - f_x_t_a) / w obj = lambda w: w*(f_x_t_a-y_u)**2 - w*(f_x_t_a+s_u_a*w-y_u)**2 lower_search_bound = 0 upper_search_bound = (f_x_t_a-y_u)/(-s_u_a) width_search_bound = upper_search_bound - lower_search_bound constraint = self._alpha * math.log(K_t) w_old = lower_search_bound w_now = lower_search_bound + 1/2*width_search_bound o = obj(w_now) while abs(w_now-w_old) > width_search_bound*(1/2)**30 or o >= constraint: w_diff = abs(w_now-w_old) w_old = w_now if o < constraint: w_now += w_diff/2 else: w_now -= w_diff/2 o = obj(w_now) self._times[1] += time.time() - start return f_x_t_a + s_u_a*w_now def _featurize(self, context, action): import numpy as np #type: ignore start = time.time() is_sparse = isinstance(context, dict) or isinstance(action, dict) if isinstance(context, dict): context_values = list(context.values()) context_names = list([ f"x{k}" for k in context.keys() ]) else: context_values = (context or [1]) context_names = [''] if not is_sparse else [ f"x{i}" for i in range(len(context_values)) ] if isinstance(action, dict): action_names = list([ f"a{k}" for k in action.keys() ]) action_values = list(action.values()) else: action_values = action action_names = [''] if not is_sparse else [ f"a{i}" for i in range(len(action_values)) ] x_terms_by_degree = self._terms_by_degree(len(context_values), self._x_p.fit_transform([context_values])[0]) a_terms_by_degree = self._terms_by_degree(len(action_values) , self._a_p.fit_transform([action_values])[0]) features = self._interaction_terms(x_terms_by_degree, a_terms_by_degree, [1]) if is_sparse: x_names_by_degree = self._terms_by_degree(len(context_values), self._x_p.get_feature_names(context_names)) a_names_by_degree = self._terms_by_degree(len(context_values), self._a_p.get_feature_names(action_names)) names = self._interaction_terms(x_names_by_degree, a_names_by_degree, ['']) final_features = np.array(features) if not is_sparse else self._h.fit_transform([list(zip(names,features))]) self._times[0] += time.time() - start return final_features def _terms_by_degree(self, base_term_count:int, terms:Sequence[Any], with_bias:bool = False) -> Dict[int,Sequence[Any]]: terms_by_degree = {} index = 0 if not with_bias else 1 degree = 1 while index != len(terms): degree_terms_count = int((base_term_count**degree + base_term_count)/2) terms_by_degree[degree] = terms[index:degree_terms_count] index += degree_terms_count degree += 1 return terms_by_degree 
def _interaction_terms(self, x_terms_by_degree, a_terms_by_degree, default): import numpy as np interaction_terms = [] for term in self._terms: x_for_degree = x_terms_by_degree.get(term[0], default) a_for_degree = a_terms_by_degree.get(term[1], default) if not isinstance(x_for_degree[0],str): outer = np.outer(x_for_degree, a_for_degree) else: outer = np.char.array(x_for_degree)[:,None] + np.char.array(a_for_degree) interaction_terms += outer.T.reshape((1,-1)).squeeze().tolist() return interaction_terms def _predict_model(self, model, features): import numpy as np import scipy.sparse as sp if sp.issparse(model): return model.multiply(features).data.sum() else: return np.dot(model, features) def _update_model(self, model, features, value, importance): error = self._predict_model(model, features) - value return model - self._learning_rate*features*error*importance
def load_data_v1(data_path):
    attr_name = ['taxi_id', 'point', 'duration', 'time', 'duration', 'distance']

    # training set data
    train = pd.read_csv(os.path.join(data_path, 'train.txt'), header=None)
    train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6]]
    dataset = train.values[:, [0, 1, 2, 3, 4, 5]]
    print(train_set[0])

    # test set data
    test = pd.read_csv(os.path.join(data_path, 'test.txt'), header=None)
    test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6]]
    print(test_set[0])

    # store the data, minus the last column, in a list of dicts keyed by taxi ID
    samples = list()
    for sample in dataset:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            sample_dict[attr_name[index]] = attr
        samples.append(sample_dict)

    h = FeatureHasher(n_features=2048)
    h.fit(samples)

    # convert the training data into x and y lists
    x_train = list()
    y_train = list()
    for sample in train_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_train.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_train.append(sample_dict)

    # convert the test data into x and y lists
    x_test = list()
    y_test = list()
    for sample in test_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_test.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_test.append(sample_dict)

    x_train = h.transform(x_train).toarray()
    x_test = h.transform(x_test).toarray()
    print(x_train[0])
    print(x_test[0])
    print(x_train.shape)
    print(x_test.shape)

    y_train = np.asarray(y_train, dtype='int16')
    y_test = np.asarray(y_test, dtype='int16')
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test, nb_classes)
    print(y_train.shape)
    print(y_test.shape)

    # return x_train, y_train, x_dev, y_dev, x_test
    return x_train, y_train, x_test, y_test, x_test
import apsw

c = apsw.Connection("../data/imdb.sqlite")
movie_data = c.cursor().execute("select * from movie_data").fetchall()
c.close()
del c
del apsw

X = [x.split(',') for (x, y) in movie_data]
y = [y for (x, y) in movie_data]
del movie_data

from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.neural_network import BernoulliRBM

thePipe = Pipeline([("hash", FeatureHasher(input_type="string")),
                    ('RBM', BernoulliRBM()),
                    ('XGB', XGBRegressor())])

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

paramGrid = {
    'XGB__max_depth': [3],
    'XGB__n_estimators': [100],
    'RBM__n_components': [20],
    "hash__n_features": [100000]
}

theScorer = make_scorer(mean_squared_error, greater_is_better=False)

clf = GridSearchCV(thePipe,
print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
print()

#
# print("DictVectorizer")
# t0 = time()
# vectorizer = DictVectorizer()
# vectorizer.fit_transform(token_freqs(d) for d in raw_data)
# duration = time() - t0
# print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
# print("Found %d unique terms" % len(vectorizer.get_feature_names()))
# print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
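# Hedged sketch of the two helpers the benchmark above relies on (they are defined
# elsewhere in the original script): `tokens` presumably yields lowercased word tokens
# and `token_freqs` aggregates them into a {token: count} dict for the dict-input hasher.
import re
from collections import defaultdict


def tokens(doc):
    return (tok.lower() for tok in re.findall(r"\w+", doc))


def token_freqs(doc):
    freq = defaultdict(int)
    for tok in tokens(doc):
        freq[tok] += 1
    return freq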
def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])
    for pos in xrange(0, len(seq), size):
        yield seq[pos:pos + size]


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]

dataset = fetch_20newsgroups(subset='train', categories=categories)
classif_data = zip(dataset.data, dataset.target)
classes = np.array(list(set(dataset.target)))

hasher = FeatureHasher()
classifier = SGDClassifier()

for i, chunk in enumerate(chunker(classif_data, 100)):
    messages, topics = zip(*chunk)
    X = hasher.transform(token_freqs(msg) for msg in messages)
    y = np.array(topics)
    classifier.partial_fit(X, topics, classes=classes)
    if i % 100 == 0:
        # dump model to be able to monitor quality and later
        # analyse convergence externally
        joblib.dump(classifier, 'model_%04d.pkl' % i)
def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    hasher = FeatureHasher()
    hasher.set_params(n_features=np.inf)
    with pytest.raises(TypeError):
        hasher.fit()
def make_sparse_data(use_feature_hashing=False):
    """
    Function to create sparse data with two features always zero
    in the training set and a different one always zero in the test set
    """
    # Create training data
    X, y = make_classification(n_samples=500, n_features=3,
                               n_informative=3, n_redundant=0,
                               n_classes=2, random_state=1234567890)

    # we need features to be non-negative since we will be
    # using naive bayes later
    X = np.abs(X)

    # make sure that none of the features are zero
    X[np.where(X == 0)] += 1

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # create a list of dictionaries as the features
    # with f1 and f5 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = [0] + row.tolist() + [0]
        features.append(dict(zip(feature_names, row)))

    # use a FeatureHasher if we are asked to do feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_sparse', ids,
                          features=features, labels=y,
                          vectorizer=vectorizer)

    # now create the test set with f4 always 0 but nothing else
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=2, random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1

    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    # create a list of dictionaries as the features
    # with f4 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = row.tolist()
        row = row[:3] + [0] + row[3:]
        features.append(dict(zip(feature_names, row)))

    test_fs = FeatureSet('test_sparse', ids,
                         features=features, labels=y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
def save_epoch(nn_model, epoch):
    if not os.path.exists('models/'):
        os.makedirs('models/')
    nn_model.save_weights('models/weights_epoch_%d.h5' % epoch, overwrite=True)


def load_epoch(nn_model, epoch):
    assert os.path.exists('models/weights_epoch_%d.h5' % epoch), 'Weights at epoch %d not found' % epoch
    nn_model.load_weights('models/weights_epoch_%d.h5' % epoch)


seed = 7
np.random.seed(seed)

h = FeatureHasher(n_features=2048)
vec = DictVectorizer()
le = preprocessing.LabelEncoder()

nb_epoch = 500
batch_size = 2048

attr_name = ['taxiID', 'point', 'time', 'dst', 'direc', 'distance', 'wth', 'FX']

train = pd.read_csv("train.txt", header=None)
train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(train_set[0])

test = pd.read_csv("test.txt")
test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(test_set[0])
class CountMinSketch(object): """ A class for counting hashable items using the Count-min Sketch strategy. It fulfills a similar purpose than `itertools.Counter`. The Count-min Sketch is a randomized data structure that uses a constant amount of memory and has constant insertion and lookup times at the cost of an arbitrarily small overestimation of the counts. It has two parameters: - `m` the size of the hash tables, larger implies smaller overestimation - `d` the amount of hash tables, larger implies lower probability of overestimation. An example usage: from countminsketch import CountMinSketch sketch = CountMinSketch(1000, 10) # m=1000, d=10 sketch.add("oh yeah") sketch.add(tuple()) sketch.add(1, value=123) print sketch["oh yeah"] # prints 1 print sketch[tuple()] # prints 1 print sketch[1] # prints 123 print sketch["non-existent"] # prints 0 Note that this class can be used to count *any* hashable type, so it's possible to "count apples" and then "ask for oranges". Validation is up to the user. """ def __init__(self, m, samplesize,rs): """ sizes is an array of hash dimensions. """ if not m: raise ValueError("Table size (m) and amount of hash functions (d)" " must be non-zero") self.n = 0 self.m=m self.samplesize=samplesize self.rs=rs self.fh=FeatureHasher(self.m) #,alternate_sign=False row=[] col=[] data=[] #print indices for i in xrange(self.m): numpy.random.seed(i+(self.rs*10000)) v=numpy.random.normal(0,1,self.m) v=numpy.multiply(sqrt(self.m),v) row.extend([idx for idx in xrange(self.m)]) col.extend([i for idx in xrange(self.m)]) data.extend(v) self.tables=csr_matrix ((data,(row,col)), shape=(self.m,self.m)) #self.tables = numpy.matlib.zeros(shape=(m,samplesize)) #self.tables=numpy.random.normal(size=(m,samplesize)) # for _ in xrange(d): # table = array.array("d", (0.0 for _ in xrange(m))) # self.tables.append(table) def _old_hash(self, x): #x=x.reshape((x.shape[0],)) #print x #hv=np.zeros((self.m,1)) #print hv #print x.nonzero()[0] dict_feat={} for ind in x.nonzero()[0]: #print ind #print x[ind,0] dict_feat[str(ind+(self.rs*10000))]= x[ind,0] #md5 = hashlib.md5(str(hash(ind))) #md5.update(str((self.rs*10000))) #print int(md5.hexdigest(), 16) % self.m #hv[int(md5.hexdigest(), 16) % self.m]+= x[ind,0] #print dict_feat hashed_features = self.fh.transform([dict_feat]).todense().T #print hashed_features return hashed_features def _hash(self, x): #x=x.reshape((x.shape[0],)) #print x hv=np.zeros((self.m,1)) #print hv #print x.nonzero()[0] for ind in x.nonzero()[0]: #print ind #print x[ind,0] md5 = hashlib.md5(str(hash(ind))) md5.update(str((self.rs*10000))) #print int(md5.hexdigest(), 16) % self.m hv[int(md5.hexdigest(), 16) % self.m]+= x[ind,0] return hv def transform(self, vector): #print "example size", vector.shape #print "transformation size", self.tables.shape #tables=csr_matrix ((self.m,self.samplesize)) #num_cores = multiprocessing.cpu_count() indices=vector.nonzero()[0] #TODO hash the vector in a reduced space hv = self._hash(vector) #print hv # results = Parallel(n_jobs=num_cores)(delayed(processInput)(i,self.m,self.rs) for i in indices) # parrow = [] # parcol = [] # pardata = [] # for (row,col,v) in results: # parrow.extend(row) # parcol.extend(col) # pardata.extend(v) transformation= numpy.tanh(self.tables*hv) #print transformation.shape #assert(parrow==row) #assert(parcol==col) #assert(pardata==data) return transformation
warm_file = 'f:\\data\\avazu_ctr\\start.csv'
seed = int(3217)

#%%
###############################################################################
# Main
###############################################################################
chunk_size = int(4096)
header = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
          'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
          'device_ip', 'device_model', 'device_type', 'device_conn_type',
          'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

# preprocessing
preproc = Pipeline([('fh', FeatureHasher(n_features=2**27, input_type='string',
                                         non_negative=False))])
#


def clean_data(data):
    y_train = data['click']
    ## for Vowpal Wabbit
    data['app'] = data['app_id'].values + data['app_domain'].values + data['app_category'].values
    data['site'] = data['site_id'].values + data['site_domain'].values + data['site_category'].values
    data['device'] = (data['device_id'].values + data['device_ip'].values +
                      data['device_model'].values +
                      data['device_type'].values.astype(str) +
                      data['device_conn_type'].values.astype(str))
    data['type'] = data['device_type'].values + data['device_conn_type'].values
def main(): path = r'/Users/jlittler/Documents/Developer/python/mlenv/datasets/kaggle-avazu' train = pd.read_csv(os.path.join(path, 'train-10k.csv')) msk = np.random.rand(len(train)) < 0.8 features = [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ] # create a simple baseline method X_train = train[msk].iloc[:, features] X_test = train[~msk].iloc[:, features] y_train = train[msk].iloc[:, 1] y_test = train[~msk].iloc[:, 1] print('{:.2f}'.format( log_loss(y_test, np.ones(len(y_test)) * y_train.mean()))) # method 1 - encoding to ordinal values X_train_ordinal = X_train.values X_test_ordinal = X_test.values les = [] l = LogisticRegression() r = RandomForestClassifier(n_estimators=25, max_depth=10) for i in range(X_train_ordinal.shape[1]): le = LabelEncoder() le.fit(train.iloc[:, features].iloc[:, i]) les.append(le) X_train_ordinal[:, i] = le.transform(X_train_ordinal[:, i]) X_test_ordinal[:, i] = le.transform(X_test_ordinal[:, i]) l.fit(X_train_ordinal, y_train) y_pred = l.predict_proba(X_test_ordinal) print('{:.2f}'.format(log_loss(y_test, y_pred))) r.fit(X_train_ordinal, y_train) y_pred = r.predict_proba(X_test_ordinal) print('{:.2f}'.format(log_loss(y_test, y_pred))) # method 2 - one hot encoding enc = OneHotEncoder(handle_unknown='ignore') enc.fit(X_train_ordinal) X_train_onehot = enc.transform(X_train_ordinal) X_test_onehot = enc.transform(X_test_ordinal) l.fit(X_train_onehot, y_train) y_pred = l.predict_proba(X_test_onehot) print('{:.2f}'.format(log_loss(y_test, y_pred))) r.fit(X_train_onehot, y_train) y_pred = r.predict_proba(X_test_onehot) print('{:.2f}'.format(log_loss(y_test, y_pred))) # method 3 - group rare values X_train_rare = copy.copy(X_train) X_test_rare = copy.copy(X_test) X_train_rare["test"] = 0 X_test_rare["test"] = 1 temp_df = pd.concat([X_train_rare, X_test_rare], axis=0) names = list(X_train_rare.columns.values) for i in names: temp_df.loc[temp_df[i].value_counts()[temp_df[i]].values < 20, i] = 'RARE_VALUE' for i in range(temp_df.shape[1]): temp_df.iloc[:, i] = temp_df.iloc[:, i].astype('str') X_train_rare = temp_df[temp_df['test'] == '0'].iloc[:, :-1].values X_test_rare = temp_df[temp_df['test'] == '1'].iloc[:, :-1].values for i in range(X_train_rare.shape[1]): le = LabelEncoder() le.fit(temp_df.iloc[:, :-1].iloc[:, i]) les.append(le) X_train_rare[:, i] = le.transform(X_train_rare[:, i]) X_test_rare[:, i] = le.transform(X_test_rare[:, i]) enc.fit(X_train_rare) X_train_rare = enc.transform(X_train_rare) X_test_rare = enc.transform(X_test_rare) l.fit(X_train_rare, y_train) y_pred = l.predict_proba(X_test_rare) print(log_loss(y_test, y_pred)) r.fit(X_train_rare, y_train) y_pred = r.predict_proba(X_test_rare) print(log_loss(y_test, y_pred)) print(X_train_rare.shape) # method 4 - feature hashing X_train_hash = copy.copy(X_train) X_test_hash = copy.copy(X_test) for i in range(X_train_hash.shape[1]): X_train_hash.iloc[:, i] = X_train_hash.iloc[:, i].astype('str') for i in range(X_test_hash.shape[1]): X_test_hash.iloc[:, i] = X_test_hash.iloc[:, i].astype('str') h = FeatureHasher(n_features=100, input_type='string') X_train_hash = h.transform(X_train_hash.values) X_test_hash = h.transform(X_test_hash.values) l.fit(X_train_hash, y_train) y_pred = l.predict_proba(X_test_hash) print(log_loss(y_test, y_pred)) r.fit(X_train_hash, y_train) y_pred = r.predict_proba(X_test_hash) print(log_loss(y_test, y_pred))
         steps=[("imputer", SimpleImputer(strategy='mean')),
                ("scaler", StandardScaler(with_mean=True, with_std=True))]),
     make_column_selector(dtype_include=['float', 'int'])),
    ("category",
     Pipeline(steps=[("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
                     ("encoder", OneHotEncoder(handle_unknown="ignore"))]),
     make_column_selector(dtype_include='category')),
    ("high_cardinality",
     Pipeline(steps=[("imputer", SimpleImputer(strategy='constant', fill_value='missing',
                                               missing_values=None)),
                     ("hasher", FeatureHasher(n_features=10, input_type='string'))]),
     make_column_selector(dtype_include='object'),
     )
], remainder='passthrough')

# xd = preprocessor.fit_transform(X, y)

# Classification Pipeline
classifier = Pipeline(steps=[('poly', PolynomialFeatures()),
                             ('reductor', PCA()),
                             ('selector', SelectFromModel(ExtraTreesClassifier())),
                             ('estimator', RandomForestClassifier())])

# Main Pipeline
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', classifier)])
def test():
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics

    feat = 7000
    # hasher for the dictionaries where we do not know the number of features
    h = FeatureHasher(n_features=feat)
    start_time = time()

    # hash of the list of feature dictionaries for each test directory
    TX = h.transform(pickle.load(open(test_dict_filename, 'rb'))).toarray()
    # append the vector of already-extracted features for each file in the test directory
    TX = np.concatenate(
        (TX, np.array(pickle.load(open(test_feature_list_filename, 'rb')))), axis=1)
    # load the categories for the test sets
    Ty = np.array(pickle.load(open(test_predict_filename, 'rb')))

    # load the saved model
    clf = pickle.load(open('modeldyn_parameters.sav', 'rb'))
    # predict the values for test data
    prediction_values = clf.predict(TX)

    # function to see if a class corresponds to benign binaries or malware
    f = lambda x: 1 if x > 0 else 0

    def fromiter(x):
        return np.fromiter((f(xi) for xi in x), x.dtype)

    # lump all malware predictions/categories into one
    prediction_values = fromiter(prediction_values)
    Ty = fromiter(Ty)

    # print statistics from the data
    print("features:", feat)
    print("accuracy:", metrics.accuracy_score(prediction_values, Ty))
    print("f1 score:", metrics.f1_score(prediction_values, Ty, average='micro'))
    print("precision score:", metrics.precision_score(prediction_values, Ty, average='micro'))
    print("recall score:", metrics.recall_score(prediction_values, Ty, average='micro'))
    print("f1 score (macro):", metrics.f1_score(prediction_values, Ty, average='macro'))
    print("precision score (macro):", metrics.precision_score(prediction_values, Ty, average='macro'))
    print("recall score (macro):", metrics.recall_score(prediction_values, Ty, average='macro'))

    # count the number of wrong predictions
    mismatch = 0
    tot = prediction_values.shape[0]
    for i in range(tot):
        mismatch += 1 if prediction_values[i] != Ty[i] else 0
    print("mismatches:", mismatch)

    # print the whole prediction array
    print("prediction is", prediction_values.tolist())
    # print the whole category array
    print("y is", Ty.tolist())

    end_time = time()
    print('Testing complete in ' + str(end_time - start_time) + ' seconds')
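# Sketch of the input the hasher above expects: since input_type defaults to
# 'dict', the pickled object is assumed to be a list with one feature:count
# dict per sample (the API-call names below are made up for illustration).
from sklearn.feature_extraction import FeatureHasher

sample_dicts = [{'CreateFileW': 3, 'RegOpenKeyExW': 1},
                {'VirtualAlloc': 7, 'CreateFileW': 2}]
h = FeatureHasher(n_features=7000)
print(h.transform(sample_dicts).toarray().shape)  # (2, 7000)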
# In[5]:

# Feature Hashing
from sklearn.feature_extraction import FeatureHasher

X_train_hash = X_train.copy()
X_val_hash = X_val.copy()
X_test_hash = X_test.copy()

for i in range(X_train_hash.shape[1]):
    X_train_hash.iloc[:, i] = X_train_hash.iloc[:, i].astype('str')
for i in range(X_val_hash.shape[1]):
    X_val_hash.iloc[:, i] = X_val_hash.iloc[:, i].astype('str')
for i in range(X_test_hash.shape[1]):
    X_test_hash.iloc[:, i] = X_test_hash.iloc[:, i].astype('str')

# encode by hashing
h = FeatureHasher(n_features=10000, input_type="string")
X_train_hash = h.transform(X_train_hash.values)
X_val_hash = h.transform(X_val_hash.values)
X_test_hash = h.transform(X_test_hash.values)

# # Modeling

# In[9]:

# Import necessary packages
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import keras
import random as rn
def process_raw_features(self, raw_obj):
    # hash the variable-length list of strings into a fixed-length 128-dim vector
    exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
    return exports_hashed.astype(np.float32)
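# Hypothetical call to the method above: raw_obj is assumed to be the list of
# exported symbol names for one binary; hashing turns the variable-length list
# into a fixed 128-dimensional vector.
import numpy as np
from sklearn.feature_extraction import FeatureHasher

raw_obj = ['CreateProcessA', 'LoadLibraryW', 'GetProcAddress']
vec = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
print(vec.shape, vec.astype(np.float32).dtype)  # (128,) float32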
train_x[c] = le.transform(train_x[c])
test_x[c] = le.transform(test_x[c])

# -----------------------------------
# feature hashing
# -----------------------------------
# load the data
train_x, test_x = load_data()
# -----------------------------------
from sklearn.feature_extraction import FeatureHasher

# loop over the categorical variables and apply feature hashing
for c in cat_cols:
    # FeatureHasher is used a little differently from the other encoders
    fh = FeatureHasher(n_features=5, input_type='string')
    # convert the variable to strings before applying FeatureHasher
    hash_train = fh.transform(train_x[[c]].astype(str).values)
    hash_test = fh.transform(test_x[[c]].astype(str).values)
    # convert the results to DataFrames
    hash_train = pd.DataFrame(hash_train.todense(), columns=[f'{c}_{i}' for i in range(5)])
    hash_test = pd.DataFrame(hash_test.todense(), columns=[f'{c}_{i}' for i in range(5)])
    # join back onto the original DataFrames
    train_x = pd.concat([train_x, hash_train], axis=1)
    test_x = pd.concat([test_x, hash_test], axis=1)

# drop the original categorical variables
train_x.drop(cat_cols, axis=1, inplace=True)
test_x.drop(cat_cols, axis=1, inplace=True)
# -----------------------------------
from sklearn.preprocessing import OneHotEncoder

one = OneHotEncoder()
one.fit(X)
train = one.transform(X)
print('Train Data Set Has Got {} Rows and {} Columns'.format(train.shape[0], train.shape[1]))
# Train Data Set Has Got 300000 Rows and 316461 Columns

from sklearn.feature_extraction import FeatureHasher

X_train_hash = X.copy()
for c in X.columns:
    X_train_hash[c] = X[c].astype('str')
hashing = FeatureHasher(input_type='string')
train = hashing.transform(X_train_hash.values)
print('Train Data Set Has Got {} Rows and {} Columns'.format(train.shape[0], train.shape[1]))

# frequency-based encoding: replace each category label with its (noisy) count
X_train_stat = X.copy()
for c in X_train_stat.columns:
    if X_train_stat[c].dtype == 'object':
        X_train_stat[c] = X_train_stat[c].astype('category')
        counts = X_train_stat[c].value_counts()
        counts = counts.sort_index()
        counts = counts.fillna(0)
        counts += np.random.rand(len(counts)) / 1000
        X_train_stat[c].cat.categories = counts
print(X_train_stat.head(3))
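# Note on the hashed width above: FeatureHasher defaults to n_features=2**20,
# so the hashed matrix has 1,048,576 columns no matter how many distinct
# category values the data contains.
from sklearn.feature_extraction import FeatureHasher

print(FeatureHasher().n_features, 2 ** 20)  # 1048576 1048576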
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1 - epsilon, pred)
    ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll


# add two columns for hour and weekday
def dayhour(timestr):
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]


fh = FeatureHasher(n_features=2 ** 20, input_type="string")

# Train classifier
clf = LassoLars()
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)

# Create a submission file
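# Toy check of the same clip-and-average log-loss formula with NumPy against
# sklearn.metrics.log_loss (values chosen arbitrarily).
import numpy as np
from sklearn.metrics import log_loss

act = np.array([1, 0, 1, 1])
pred = np.clip(np.array([0.9, 0.2, 0.7, 0.6]), 1e-15, 1 - 1e-15)
print(-np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred)))  # ~0.299
print(log_loss(act, pred))                                          # same value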
def predict_task(current_task, filename, data):
    start_time = time.time()

    # remove password
    current_task.update_state(state='PROGRESS', meta="Removing password on " + filename)
    print("Predict: " + filename)
    input_pdf = io.BytesIO(base64.b64decode(data))
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file_name = temp_file.name
    with pikepdf.open(input_pdf) as pdf:
        pdf.save(temp_file)

    # extract features
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(temp_file_name, False, stats, current_task)
    temp_csv_file = tempfile.NamedTemporaryFile(delete=False)
    temp_csv_file_name = temp_csv_file.name
    pdfUtil.save_pdf_pages_tocsv(filename, pages, temp_csv_file_name)

    # prepare data
    markup_data = pd.read_csv(temp_csv_file_name)
    markup_data["HasCentLine"] = markup_data["HasCentLine"].astype(int)
    markup_data["HasComboLine"] = markup_data["HasComboLine"].astype(int)
    markup_data["IsMarkupField"] = markup_data["IsMarkupField"].astype(int)
    x = markup_data.drop(labels=[
        'FileName', 'PageNum', 'LineLeft', 'LineRight', 'LineTop', 'LineBottom',
        'Prefix', 'Suffix', 'FieldCode', 'FieldLeft', 'FieldRight', 'FieldTop',
        'FieldBottom', "IsMarkupField"
    ], axis=1)
    transformer = ColumnTransformer(
        [("hash", FeatureHasher(n_features=2, input_type='string'), 'TopElement')],
        remainder="passthrough")
    transformed_x = transformer.fit_transform(x)

    # get the model's prediction
    current_task.update_state(state='PROGRESS', meta="Get the model's prediction ")
    pdf_model = pickle.load(open("/app/ml_model/markup.pkl", "rb"))
    markup_data['IsMarkupField'] = pdf_model.predict_proba(transformed_x)[:, 1]

    # mark up the PDF
    temp_output_file = tempfile.NamedTemporaryFile(delete=False)
    temp_output_file_name = temp_output_file.name
    pdfUtil.markup_pdf(markup_data, temp_file_name, temp_output_file_name)

    # return the marked-up PDF
    return_data = io.BytesIO()
    with open(temp_output_file_name, 'rb') as fo:
        return_data.write(fo.read())
    return_data.seek(0)

    # clean up
    temp_file.close()
    temp_csv_file.close()
    temp_output_file.close()
    os.remove(temp_file_name)
    os.remove(temp_csv_file_name)
    os.remove(temp_output_file_name)

    total_time = "total time spent: " + str(time.time() - start_time)
    current_task.update_state(state='PROGRESS', meta=total_time)
    print(total_time)

    return {
        'data': base64.b64encode(return_data.read()),
        'attachment_filename': filename,
        'mimetype': 'application/pdf'
    }
def main(rawdata, rawtarget=pd.DataFrame(), train_test_flag='train'):
    raw_target = rawtarget
    raw_data = rawdata
    raw_data['date_recorded'] = pd.to_datetime(raw_data['date_recorded']).apply(
        lambda x: (datetime.datetime.today() - x).days)

    numeric_cols = [
        c for c in raw_data.columns
        if raw_data[c].dtype in ['int64', 'float64'] and c not in ['region_code', 'district_code']
    ]
    cat_cols = [c for c in raw_data.columns if c not in numeric_cols]

    # sns.pairplot(raw_data.merge(raw_target)[numeric_cols + ['status_group']].iloc[:, 1:],
    #              hue="status_group", diag_kind='hist', plot_kws={'alpha': 0.5})
    # plt.show()

    # Categorical columns cardinality
    print("\n# Unique values in each categorical column:\n", raw_data[cat_cols].nunique(axis=0))

    # No. of unknown categories
    (raw_data[cat_cols] == 'unknown').sum()
    raw_data[cat_cols] = raw_data[cat_cols].replace('unknown', np.nan)

    # Deleting unneeded columns
    to_be_del = [
        'waterpoint_type_group', 'source_type', 'quantity_group', 'quality_group',
        'payment_type', 'management_group', 'extraction_type_class',
        'extraction_type_group', 'scheme_name', 'recorded_by', 'region',
        'scheme_management'
    ]
    raw_data = raw_data.drop(to_be_del, axis=1)

    # % of missing values per column
    print("\nMissing value % \n",
          (raw_data.isna().sum() * 100 / len(raw_data)).sort_values(ascending=False))
    # (raw_data.isna().sum() * 100 / len(raw_data)).sort_values(ascending=False).plot(kind='bar')
    # plt.xticks(rotation=45)
    # plt.show()

    # Columns without missing values are hash encoded in bulk.
    # The rest of the columns are individually hash encoded.
    # This is done to preserve NaNs across encoding in order to perform imputation later.
    print("Encoding categorical features..")
    ohc = ['public_meeting', 'permit', 'source_class']
    hashc_ind = [
        'payment', 'installer', 'funder', 'public_meeting', 'permit', 'water_quality',
        'quantity', 'management', 'subvillage', 'source_class', 'source'
    ]
    hashc0 = ['district_code', 'region_code', 'ward', 'wpt_name', 'lga']  # 1024 bit encoding
    hashc1 = ['extraction_type', 'waterpoint_type']  # 32 bit encoding
    hashc2 = ['basin']  # 8 bit encoding

    # One hot encoding on binary categorical data
    oh = []
    for oc in ohc:
        ohe = OneHotEncoder(drop='first')
        enc = enc_with_na(raw_data[[oc]], ohe, 1)
        oh.append(enc)
    oh = np.hstack(oh).astype(np.int8)

    # Hash encoding on the rest
    # Individual hashing
    # n_feats_ind = [4, 128, 128, 6, 2, 8, 1024, 8]
    n_feats_ind = [4, 16, 16, 6, 2, 8, 64, 8]
    n_feats_ind = [4, 8, 8, 4, 2, 4, 32, 4]
    hashed_ind = []
    for hc, n in zip(hashc_ind, n_feats_ind):
        h = FeatureHasher(n_features=n, input_type='string', alternate_sign=False)
        enc = enc_with_na(raw_data[[hc]], h, n)
        hashed_ind.append(enc)
    hashed_ind = np.hstack(hashed_ind).astype(np.int8)

    # Collective hashing
    hash_cols_list = [hashc0, hashc1, hashc2]
    # n_feats = [1024, 32, 8]
    n_feats = [64, 16, 8]
    n_feats = [32, 8, 4]
    hashed = []
    for hc, n in zip(hash_cols_list, n_feats):
        h = FeatureHasher(n_features=n, input_type='string', alternate_sign=False)
        enc = enc_with_na(raw_data[hc], h, n)
        hashed.append(enc)
    hashed = np.hstack(hashed).astype(np.int8)
    print("Encoding complete..")

    print("Preparing to write data to disk..")
    raw_data_encoded = pd.concat([
        raw_data.drop(ohc + hashc_ind + hashc0 + hashc1 + hashc2, axis=1),
        pd.DataFrame(np.hstack([oh, hashed_ind, hashed]))
    ], axis=1)
    raw_data_encoded.to_csv(os.path.join(DATA_DIR, train_test_flag + '_data_encoded.csv'),
                            header=True, index=False)
    print("Written encoded data to disk..")
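# Small illustration of the alternate_sign=False setting used above: hashed
# values stay non-negative (plain counts per bucket) instead of carrying the
# default alternating signs.
from sklearn.feature_extraction import FeatureHasher

rows = [['gravity'], ['handpump'], ['gravity']]
h = FeatureHasher(n_features=4, input_type='string', alternate_sign=False)
print(h.transform(rows).toarray())  # every entry is >= 0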
orltest = pd.read_csv('D://frad_test.csv')
orldata = orldata.append(orltest)
del orltest

feature = orldata.columns.values.tolist()
orldata.astype(object)
orldata.dtypes.value_counts()
sample = orldata.iloc[0:100, :]

from sklearn.feature_extraction import FeatureHasher

bin_columns_name = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype', 'city',
                    'reqrealip', 'idfamd5', 'openudidmd5', 'model', 'make', 'osv']
for i in bin_columns_name:
    fh = FeatureHasher(n_features=5, input_type='string')
    orldata[i] = orldata[i].astype('str')
    # pass a 2D array of strings so each row is hashed as a whole value
    hashed_features = fh.fit_transform(orldata[[i]].values)
    hashed_features = hashed_features.toarray()
    hashed_features = pd.DataFrame(hashed_features)
    hashed_features.columns = [i + '0', i + '1', i + '2', i + '3', i + '4']
    orldata = orldata.join(hashed_features)
    orldata = orldata.drop(columns=i)

oh_columns = ['os', 'lan']
orldata_oh = pd.get_dummies(orldata[oh_columns].astype('object'))
orldata_oh = orldata_oh.reset_index(drop=True)
orldata = orldata.join(orldata_oh)
# orldata = orldata.drop(columns=oh_columns)