def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
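# The out-of-core snippets above and below rely on stream_docs/get_minibatch
# helpers that are not shown here. A minimal sketch of what they typically look
# like; the CSV layout ("review","sentiment" columns with a header row) is an
# assumption for illustration only:
import csv

def stream_docs(path):
    """Yield one (text, label) pair at a time from a CSV file."""
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file)  # skip the header line
        for row in csv.reader(csv_file):
            text, label = row[0], int(row[1])
            yield text, label

def get_minibatch(doc_stream, size):
    """Collect up to `size` documents; return (None, None) once the stream is exhausted."""
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y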
def do_training(): global X_train, X_test, feature_names, ch2 print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features) X_train = vectorizer.transform(data_train_data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25, stop_words='english') X_train = vectorizer.fit_transform(data_train_data) duration = time() - t0 #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print("Extracting features from the test data using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test_data) duration = time() - t0 #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if True:#opts.select_chi2: print("Extracting %d best features by a chi-squared test" % 20000) t0 = time() ch2 = SelectKBest(chi2, k=20000) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) results = [] #for penalty in ["l2", "l1"]: penalty = 'l2' print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3) results.append(benchmark(clf)) joblib.dump(vectorizer, 'vectorizer.pkl', compress=9) joblib.dump(ch2, 'feature_selector.pkl', compress=9) joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        # fetch only the documents in the current batch, not the full corpus
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
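# big_kmeans relies on `batches` and `fetch` helpers defined elsewhere. A
# minimal sketch of `batches`, assuming it only needs to slice an iterable of
# document identifiers into fixed-size chunks (the name and behaviour are
# assumptions, not the project's actual implementation):
def batches(iterable, batch_size):
    """Yield successive lists of at most batch_size items from iterable."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch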
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    return clf
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []

    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True,
                               stop_words='english')

    # List of topics extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names

    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select features using a chi-square test
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)

    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents."""
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        # fetch only the documents in the current batch, not the full corpus
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier, hashing=False, chi_squared=False): from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import HashingVectorizer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.linear_model import RidgeClassifier from sklearn.svm import LinearSVC from sklearn.linear_model import SGDClassifier from sklearn.linear_model import Perceptron from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.utils.extmath import density y_train = y_VariableList if hashing == True: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=2 ** 16) X_train = vectorizer.transform(x_VariableList) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') X_train = vectorizer.fit_transform(x_VariableList) X_test = vectorizer.transform(testSetList) if chi_squared == True: print("Extracting best features by a chi-squared test") ch2 = SelectKBest(chi2, k=2 * 16) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) classifierObject = "" print "Using :", classifier if classifier == "LinearSVC": classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3) elif classifier == "PassiveAggressiveClassifier": classifierObject = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge', n_iter=50, n_jobs=1, random_state=None, shuffle=True, verbose=0, warm_start=False) elif classifier == "RidgeClassifier": classifierObject = RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, solver='lsqr', tol=0.01) elif classifier == "Perceptron": classifierObject = Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True, n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True, verbose=0, warm_start=False) elif classifier == "SGDClassifier": classifierObject = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False) classifierObject.fit(X_train, y_train) pred = classifierObject.predict(X_test) return pred[0]
class ReviewTrainer(TrainerModel): def __init__(self): pass #get rids of stopwords def preprocess(self, l): res = {} sw = stopwords.words('english') clean = ' '.join([w for w in l['text'].split() if w not in sw]) res[l['review_id']] = {'text' : clean, 'label' : l['votes']['useful']} return res #the labels are already given for this review def group_labels(self, fname): pass #vectorizes data and selects K best feats. def prepare_data(self, x, y): self.hv = HashingVectorizer(strip_accents='ascii', non_negative=True) self.feats = self.hv.transform(x) self.labels = np.array(y) self.ch2 = SelectKBest(chi2, k=K_FEAT) self.feats = self.ch2.fit_transform(self.feats, self.labels) def get_error(self, pred, y): return super(ReviewTrainer, self).get_error(pred,y) #optimizes for hyper-parameter alpha def _cross_validate(self): grid = dict(alpha=10.0 ** np.arange(-4,1)) return super(ReviewTrainer, self)._cross_validate_base( Ridge(), grid) #builds examples to feed trainer #MUST RUN BEFORE train def build_examples(self, data, labels=None): feats = [] labels = [] ex = {} for k,v in data.items(): feats.append(v['text']) labels.append(v['label']) ex['feats'] = feats ex['labels'] = labels return ex #fits model using optimal parameters def train(self): self.clf = self._cross_validate() self.clf.fit(self.feats, self.labels) #predicts Y given X def predict(self, data): data = self.hv.transform(data) data = self.ch2.transform(data) pred = self.clf.predict(data) return pred
def vectorize(concepts):
    """
    This vectorizes a list or a string of concepts;
    the regular `vectorize` method is meant to vectorize text documents;
    it is trained for that kind of data and thus is inappropriate for
    concepts. So instead we just use a simple hashing vectorizer.
    """
    h = HashingVectorizer(input='content', stop_words='english',
                          norm=None, tokenizer=Tokenizer())
    if type(concepts) is str:
        # Extract and return the vector for the single document.
        return h.transform([concepts]).toarray()[0]
    else:
        return h.transform(concepts)
class App(object):
    def __init__(self):
        self.hv = HashingVectorizer(norm=None, non_negative=True)
        if os.path.isfile(MODEL):
            self.clf = load(MODEL)
        else:
            self.clf = linear_model.SGDClassifier(warm_start=True)
            zeros = self.vector('seed')
            self.clf.partial_fit(zeros, np.unique([GOOD]), classes=(GOOD, BAD))

    def feed(self):
        for doc in stories:
            self.pred_X = self.vector(doc)
            self.pred_y = self.clf.predict(self.pred_X)
            if self.pred_y == GOOD:
                yield doc

    def save_model(self):
        dump(self.clf, MODEL)

    def train(self, doc, y):
        X = self.vector(doc)
        self.clf.partial_fit(X, [y])

    def vector(self, doc):
        clean = preprocess(doc)
        return self.hv.transform([clean])

    def score(self, y):
        print self.clf.score(self.pred_X, [y])
class FeatureExtractor(object):
    def __init__(self, csv_filename, batch_size=1000):
        self.vectorizer = HashingVectorizer(decode_error='ignore',
                                            n_features=2 ** 18,
                                            non_negative=True)
        self.train_pd = pandas.read_csv(csv_filename)
        self.index = 0
        self.batch_size = batch_size

    def nextBatch(self):
        """ Return a generator for a batch_size number of train (X, y) pairs """
        train_length = len(self.train_pd)
        while self.index < train_length:
            end_index = min(self.index + self.batch_size, train_length)
            print "Reading ", self.index, ": ", end_index
            X_train = list()
            y_train = list()
            for i in range(self.index, end_index):
                filename = "data/" + self.train_pd["file"][i]
                text = open(filename, "rb").readlines()
                X_train.append("\n".join(text))
                y_train.append(int(self.train_pd["sponsored"][i]))
            self.index = end_index
            yield (self.vectorizer.transform(X_train), y_train)

    def dumpBatch(self, batch, filename):
        with open(filename, "wb") as f:
            cPickle.dump(batch, f)

    def dump(self, filename):
        with open(filename, "wb") as f:
            cPickle.dump(self, f)
def main():
    '''
    >>> main() # stuff happens
    '''
    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features=2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    estimator = AffinityPropagation()

    for chunk in chunks:
        logging.info('hashing chunk')
        chunk_vector = hasher.transform([str(i.seq) for i in chunk])
        logging.info('clustering')
        estimator.fit(chunk_vector)
        logging.info('got %s clusters' % len(set(estimator.labels_)))
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize,
                             stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
def test_same_output(self):
    X, X_rdd = self.make_text_rdd()
    local = HashingVectorizer()
    dist = SparkHashingVectorizer()
    result_local = local.transform(X).toarray()
    result_dist = dist.transform(X_rdd).toarray()
    assert_array_equal(result_local, result_dist)
def test_same_output(self):
    X, X_rdd = self.generate_text_dataset()
    local = HashingVectorizer()
    dist = SparkHashingVectorizer()
    result_local = local.transform(X)
    result_dist = sp.vstack(dist.transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())
def predict(line, tagger):
    tok_cn = lambda (x): crfseg.cut_zh(x, tagger)
    hasher = HashingVectorizer(n_features=2**16, tokenizer=tok_cn,
                               non_negative=True, norm=None, binary=False)
    x_test = hasher.transform([line])
    return clf_global.predict_proba(x_test)
def vectorize(docs):
    """
    Vectorizes a list of documents.

    Args:
        | docs (list) -- the documents to vectorize.
        | docs (str) -- a single document to vectorize.

    Returns:
        | scipy sparse matrix (CSR/Compressed Sparse Row format)
    """
    h = HashingVectorizer(input='content', stop_words='english',
                          norm=None, tokenizer=Tokenizer())
    if type(docs) is str:
        # Extract and return the vector for the single document.
        return h.transform([docs]).toarray()[0]
    else:
        return h.transform(docs)
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
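# Because HashingVectorizer is stateless, fit_transform and transform produce
# the same kind of matrix here. A small usage sketch for Featurizer; the
# example strings are made up for illustration:
if __name__ == '__main__':
    feat = Featurizer()
    X_train = feat.train_feature(["the cat sat", "the dog barked"])
    X_test = feat.test_feature(["a cat barked"])
    # both matrices have 2**20 hashed columns, the HashingVectorizer default
    print(X_train.shape, X_test.shape)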
def main(output=RESULTS): # change ROOT ID in config.py to your computer's path so that is writes to correct file # load and puts data and desired numpy format movies = load_balanced_movies(MOVIES_DATA, False) # True is for debugging data = pd.DataFrame(movies) pd.options.mode.chained_assignment = None # default='warn' ignore summaries = data[['summary']] summaries['summary'] = summaries['summary'].str.replace('[^\w\s]','').str.lower() ## cleans out puncutation and characters Y = np.array(data[['year']]) Y = np.ravel(Y) X = np.array(summaries['summary']) # standard CountVectorizer for bag of words # vectorizer = CountVectorizer() # X = vectorizer.fit_transform(X) # print "Old Shape Dim" # print X.shape # uses random projections to reduce dimensionality # transformer = random_projection.SparseRandomProjection() # X_new = transformer.fit_transform(X) # print "New Shape Dim" # print X_new.shape # perform vectorization and dim reduction using Hashing Vectorizer (counts # of times a word appears) vectorizer = HashingVectorizer(stop_words='english', n_features=80000) # uses 80,000 word instances as k X = vectorizer.transform(X) # instantiate scaling of data for preprocessing X = StandardScaler(with_mean=False).fit_transform(X) # splits training and test data equally xtrain, xtest, ytrain, ytest = train_test_split(X, Y) names = ["SGDClassifier", "Linear SVC", "SVC Kernel RBF", "PerceptronL1", "PerceptronL2", "Nearest Neighbors", "Ridge Classifier"] # classifiers = [ SGDClassifier(loss="hinge", penalty="l2"), LinearSVC(), SVC(kernel="rbf"), Perceptron(penalty='l1'), Perceptron(penalty='l2', n_iter=25), KNeighborsClassifier(), RidgeClassifier(), ] print "Calculating accuracies" # fits chosen classifier on training data for name, clf in zip(names, classifiers): print name clf.fit(xtrain, ytrain) print "Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)) # Predict and score accuracy with open(output, "a+") as outputFile: # write results to file score = 100 * clf.score(xtest, ytest) outputFile.write("Ran classifier {} ".format(name) + '\n' " Achieved accuracy {} ".format(score) )
class svm_text(SVC): # svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2, # tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1) def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10, coef0=2, n_features=10000000, ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), probability=False, class_weight=None): self.conn = None self.is_tfidf = tfidf if tfidf: self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1], max_features=n_features, strip_accents='unicode', ngram_range=ngram_range, analyzer='word', norm='l2') else: self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True, n_features=n_features, strip_accents='unicode', ngram_range=ngram_range, analyzer='word', norm='l2') self.param_set = {'C': str(C), 'kernel': str(kernel), 'gamma': str(gamma), 'degree': str(degree), 'coef0': str(coef0), 'n_features': str(n_features)} if class_weight == 'auto': class_weight = {} for item in train_data.target: if class_weight.get(item): class_weight.update({item: class_weight[item] + 1.0}) else: class_weight.update({item: 1.0}) for key in class_weight: class_weight.update({key: 1.0 / class_weight[key]}) self.class_weight_dict = class_weight super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma, shrinking=True, probability=probability, degree=degree, coef0=coef0, tol=0.001, cache_size=20000, class_weight=class_weight, verbose=False, max_iter=-1) if self.is_tfidf: train_x = self.vectorizer.fit_transform(train_data.data) else: train_x = self.vectorizer.transform(train_data.data) self.fit(train_x, train_data.target) def test_data(self, test_data): test_x = self.vectorizer.transform(test_data.data) predicted_values = self.predict(test_x) test_y = test_data.target self.score = metrics.f1_score(test_y, predicted_values) self.accuracy = metrics.accuracy_score(test_y, predicted_values) def guess_text(self, text_text): text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),]) return self.predict(text_x)
def extractFeatures():
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    return X_train, X_test
def feature_extraction(feature, target_name, df):
    vect = HashingVectorizer(decode_error='ignore', ngram_range=(1, 2),
                             n_features=2**18, binary=True, norm="l2")
    le = preprocessing.LabelEncoder()

    # for multiple features replace this with
    # http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
    df[feature] = df[feature].fillna('')
    titles = vect.transform(df[feature])

    X = titles
    # y = le.fit_transform(df[target_name])
    y = df[target_name]
    return X, y
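# A small usage sketch for feature_extraction; the column names 'title' and
# 'category' and the rows below are assumptions for illustration only:
if __name__ == '__main__':
    import pandas as pd
    demo = pd.DataFrame({'title': ['cheap flights to paris', None,
                                   'best pizza in town'],
                         'category': ['travel', 'other', 'food']})
    X, y = feature_extraction('title', 'category', demo)
    print(X.shape, list(y))  # (3, 262144) ['travel', 'other', 'food']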
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = HashingVectorizer(analyzer="char", non_negative=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer="char", non_negative=True, binary=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer="char", non_negative=True, binary=True,
                             norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True,
                                n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                              stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()
    return vec, X_noProcess
class svm_multi_label_text(OneVsRestClassifier): # svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2, # tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1) def __init__(self, train_data, C=None, n_features=10000000, loss='l2', penalty='l1', ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), dual=True, tol=1e-4): self.conn = None self.is_tfidf = tfidf if tfidf: self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1], max_features=n_features, strip_accents='unicode', ngram_range=ngram_range, analyzer='word') else: self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True, n_features=n_features, strip_accents='unicode', ngram_range=ngram_range, analyzer='word') self.param_set = {'C': str(), 'kernel': str(), 'gamma': str(), 'degree': str(), 'coef0': str(), 'n_features': str(n_features)} super(svm_multi_label_text, self).__init__(LinearSVC(C=C, loss=loss, penalty=penalty, dual=(False if penalty == 'l1' else dual), tol=tol)) if self.is_tfidf: train_x = self.vectorizer.fit_transform(train_data.data) else: train_x = self.vectorizer.transform(train_data.data) train_y = train_data.target self.fit(train_x, train_y) def test_data(self, test_data): test_x = self.vectorizer.transform(test_data.data) predicted_values = self.predict(test_x) test_y = test_data.target try: self.score = metrics.f1_score(test_y, predicted_values) except ZeroDivisionError: self.score = -0.1 try: self.accuracy = metrics.accuracy_score(test_y, predicted_values) except ZeroDivisionError: self.accuracy = -0.1 def guess_text(self, text_text): text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),]) return self.predict(text_x)
class TwitterSentiment:
    def __init__(self):
        self.vec = HashingVectorizer(stop_words=stopwords.words("english"),
                                     non_negative=True)
        self.pp = PreProcessor(full_pp=True)
        self.cls = None

    def predict(self, text):
        '''predict an emoticon for any string given by text
        by using a trained classifier'''
        return self.predict_all([text])[0]

    def predict_all(self, seq):
        '''predict all emoticons for a list of strings
        by using a trained classifier'''
        return self.cls.predict(
            self.vec.transform(map(self.pp.process_tweet, seq)))
def trainFeatureExtract(self, opts, trainData, trainDataSize):
    print 'Extracting features from the training dataset using a sparse vectorizer'
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        dataTrain = vectorizer.transform(trainData.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        dataTrain = vectorizer.fit_transform(trainData.data)
    duration = time() - t0
    print 'done in %fs at %0.3fMB/s' % (duration, trainDataSize / duration)
    print 'n_samples: %d, n_features: %d' % dataTrain.shape
    print
    return dataTrain, vectorizer
def __wordhash_features(self, data, vect=None, num_features=3000):
    ''' extracts word ngram features from the provided data '''
    if vect is None:
        vect = HashingVectorizer(n_features=num_features, analyzer="word",
                                 stop_words='english',
                                 strip_accents='unicode',
                                 ngram_range=(1, 4))
        vect.fit(data)
    features = vect.transform(data)
    return features, vect
def main():
    with open("trainingdata.txt", "r") as f:
        int(f.readline())
        training_set = [r.split(" ") for r in f]

    y = [doc[0] for doc in training_set]
    corpus = [reduce(lambda x, y: x + " " + y, doc[1::]) for doc in training_set]

    N = len(corpus) / 2
    X_train = corpus[:N]
    data = corpus[N:]
    y_train = y[:N]
    y_test = y[N:]

    vectorizer = HashingVectorizer(non_negative=True, analyzer='word')
    X_train = vectorizer.transform(X_train)
    data_test = vectorizer.transform(data)

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # Run classifier
    # LinearSVC accepts neither `kernel` nor `probability` and has no
    # predict_proba, so a probabilistic SVC with a linear kernel is used here
    classifier = SVC(kernel='linear', probability=True, random_state=0)
    probas_ = classifier.fit(X_train, y_train).predict_proba(data_test)

    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
    roc_auc = auc(fpr, tpr)
    print("Area under the ROC curve : %f" % roc_auc)

    # Plot ROC curve
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right")
    pl.show()
def test_hashing_vectorizer(): v = HashingVectorizer() X = v.transform(ALL_FOOD_DOCS) token_nnz = X.nnz assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features)) assert_equal(X.dtype, v.dtype) # By default the hashed values receive a random sign and l2 normalization # makes the feature values bounded assert_true(np.min(X.data) > -1) assert_true(np.min(X.data) < 0) assert_true(np.max(X.data) > 0) assert_true(np.max(X.data) < 1) # Check that the rows are normalized for i in range(X.shape[0]): assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0) # Check vectorization with some non-default parameters v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1') X = v.transform(ALL_FOOD_DOCS) assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features)) assert_equal(X.dtype, v.dtype) # ngrams generate more non zeros ngrams_nnz = X.nnz assert_true(ngrams_nnz > token_nnz) assert_true(ngrams_nnz < 2 * token_nnz) # makes the feature values bounded assert_true(np.min(X.data) > 0) assert_true(np.max(X.data) < 1) # Check that the rows are normalized for i in range(X.shape[0]): assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
def model_train():
    sys.path.insert(0, '../notebooks/')
    from helper import load_data, token
    from datetime import datetime
    import humanfriendly
    import pandas as pd
    import numpy as np
    from sklearn.externals import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.linear_model import SGDClassifier

    stops = joblib.load('../stops.pkl')
    df = load_data('../sentiment_data', balanced=True)

    hash_para = dict(decode_error='ignore',
                     n_features=2**18,
                     tokenizer=token,
                     stop_words=stops,
                     ngram_range=(1, 3),
                     alternate_sign=False)
    clf_prep = HashingVectorizer(**hash_para)
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

    u = datetime.now()
    clf.partial_fit(clf_prep.transform(df['features']), df['y'],
                    classes=np.unique(df['y']))
    v = datetime.now()
    delta = v - u
    print('Training took: {}'.format(
        humanfriendly.format_timespan(delta.seconds)))

    joblib.dump(clf_prep, '../HashVectorizer.pkl')
    joblib.dump(clf, '../SGDclassifier.pkl')
def main():
    vec = HashingVectorizer(tokenizer=preprocess, ngram_range=(3, 3),
                            analyzer='word')
    clu = Birch(n_clusters=3)
    # clu = MiniBatchKMeans(n_clusters=2)

    config = configparser.ConfigParser()
    config.read('cfg.ini')
    config = config['DEFAULT']

    api = twitter.Api(consumer_key=config['consumer_key'],
                      consumer_secret=config['consumer_secret'],
                      access_token_key=config['access_token_key'],
                      access_token_secret=config['access_token_secret'])

    queue = deque(maxlen=50)
    for n, line in enumerate(
            api.GetStreamFilter(track=['pokemon', 'dark souls', 'darksouls',
                                       'sonic', 'hedgehog'],
                                languages=['en'])):
        if n > 1000000:
            break
        elif len(queue) != 50:
            try:
                queue.append(line['text'])
                logging.warning("%s", line['text'])
            except KeyError:
                pass
        else:
            try:
                v = vec.transform(queue)
                clu = clu.partial_fit(v)
                logging.warning('TESTING\n.\n.\n.\n.')
                logging.warning("%s, %s, %s", n, clu.predict(v[-1]), queue[-1])
            except KeyError:
                pass
            queue.clear()

    # pickle needs a binary file handle
    pickle.dump(clu, open('cluster_model.pkl', 'wb'))
def CreateRpeFeature(self, look, test=False, verbose=False):
    if not test:
        vectorizer = HashingVectorizer(n_features=2**8, ngram_range=(1, 2))
        vectorizer.fit(self.fulldata_words['rpe'].values)
        self.rpe_vectorizer = vectorizer

    def create_rpe_features(g):
        rpe = g[((g["word_num"] - g["target_word_num"]).abs() <= look)
                & ~(g["word_num"] == g["target_word_num"])]['rpe'].values
        return " ".join(rpe)

    rpe_sentences = self.fulldata_words.groupby("sentence_num").apply(
        create_rpe_features)

    if test:
        # reuse the vectorizer fitted at training time; the local `vectorizer`
        # is not defined in this branch
        return rpe_sentences.apply(lambda x: pd.Series(
            data=self.rpe_vectorizer.transform([x]).toarray()[0],
            index=[f"rpe_hash_{k}"
                   for k in range(self.rpe_vectorizer.n_features)]))
    else:
        return rpe_sentences.apply(lambda x: pd.Series(
            data=vectorizer.transform([x]).toarray()[0],
            index=[f"rpe_hash_{k}" for k in range(vectorizer.n_features)]))
def batch_train(clf, fnames, labels, iterations=25, batchsize=1000,
                random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)

    # use a throwaway name for the outer loop so it does not shadow the
    # document index used below
    for _ in range(iterations):
        rnd_idx = rng.choice(idx, size=batchsize)
        documents = []
        for i in rnd_idx:
            with open(fnames[i], 'r', encoding='latin-1') as f:
                documents.append(f.read())
        X_batch = vec.transform(documents)
        batch_labels = labels[rnd_idx]
        c_clf.partial_fit(X=X_batch, y=batch_labels, classes=[0, 1])

    return c_clf
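# A minimal driver for batch_train; it writes two tiny throwaway text files so
# the sketch is self-contained. The file layout and the classifier settings
# are assumptions, not part of the original snippet:
if __name__ == '__main__':
    import os
    import tempfile
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    tmpdir = tempfile.mkdtemp()
    fnames, labels = [], []
    for i, (text, label) in enumerate([("terrible movie", 0),
                                       ("great movie", 1)]):
        path = os.path.join(tmpdir, 'doc_%d.txt' % i)
        with open(path, 'w', encoding='latin-1') as f:
            f.write(text)
        fnames.append(path)
        labels.append(label)

    model = batch_train(SGDClassifier(random_state=1), fnames,
                        np.array(labels), iterations=5, batchsize=2)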
def batch_train(clf, fnames, labels, iterations=1, batchsize=1000,
                random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)
    shuffled_idx = rng.permutation(range(len(fnames)))
    fnames_ary = np.asarray(fnames)

    for _ in range(iterations):
        # split according to the requested batch size rather than a
        # hard-coded 1000
        for batch in np.split(shuffled_idx, len(fnames) // batchsize):
            documents = []
            for fn in fnames_ary[batch]:
                with open(fn, 'r') as f:
                    documents.append(f.read())
            X_batch = vec.transform(documents)
            batch_labels = labels[batch]
            c_clf.partial_fit(X=X_batch, y=batch_labels, classes=[0, 1])

    return c_clf
def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
    """ Extract features on a chunk of files """
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.externals import joblib

    filenames = pars['filenames_abs']
    chunk_size = pars['chunk_size']
    n_samples = pars['n_samples']

    mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))

    hash_opts = {key: vals for key, vals in pars.items()
                 if key in ['stop_words', 'n_features', 'analyser',
                            'ngram_range']}
    hash_opts['alternate_sign'] = False
    fe = HashingVectorizer(input='content', norm=None, **hash_opts)

    if pretend:
        return fe

    fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])
    fset_new.eliminate_zeros()

    joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128, ngram_range=(2, 4),
                          sparse=False, sample_weight=None, random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    return np.sort(X[indexes_prototypes])
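# A small usage sketch for get_kmeans_prototypes on a handful of strings; the
# toy spellings below are made up for illustration:
if __name__ == '__main__':
    import numpy as np
    cities = np.array(['london', 'londun', 'paris', 'parris',
                       'berlin', 'berlim'])
    protos = get_kmeans_prototypes(cities, n_prototypes=3, random_state=0)
    # roughly one representative spelling per cluster
    print(protos)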
def test_explain_hashing_vectorizer(newsgroups_train_binary):
    # test that we can pass InvertableHashingVectorizer explicitly
    vec = HashingVectorizer(n_features=1000)
    ivec = InvertableHashingVectorizer(vec)
    clf = LogisticRegression(random_state=42)
    docs, y, target_names = newsgroups_train_binary
    ivec.fit([docs[0]])
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    get_res = lambda **kwargs: explain_prediction(
        clf, docs[0], vec=ivec, target_names=target_names, top=20, **kwargs)
    res = get_res()
    check_explain_linear_binary(res, clf)
    assert res == get_res()
    res_vectorized = explain_prediction(
        clf, vec.transform([docs[0]])[0], vec=ivec, target_names=target_names,
        top=20, vectorized=True)
    pprint(res_vectorized)
    assert res_vectorized == _without_weighted_spans(res)
    assert res == get_res(
        feature_names=ivec.get_feature_names(always_signed=False))
def feature_engineering(raw_data, output_file="features.csv", feature_num=250):
    print("Input song data from the lyrics_datafile...")
    with open(raw_data, 'rb') as f:
        reader = csv.reader(f)
        data_list = list(reader)
    data_list = np.array(data_list)
    lyrics = data_list[1:, 7]
    tag = data_list[1:, 5]

    print("Processing the input lyrics...")
    hv = HashingVectorizer(n_features=feature_num)
    trans = hv.transform(lyrics)

    # convert to dense matrix
    dense = trans.todense()
    dense = dense.tolist()
    for i in range(len(dense)):
        dense[i].append(tag[i])

    print("Saving feature results...")
    with open(output_file, "wb") as f:
        writer = csv.writer(f)
        writer.writerows(dense)
    print("-----Feature engineering DONE-----")
def prepareTrainData(): # preparing the data data_examples = filterDataWithNoEngDesc( getTokenizeCleanData(mypath, filename, trainpagename)) y_examples = data_examples['segment'] data_examples.data = data_examples['desc_tokens'] data_examples_size_kb = size_kb(data_examples) print("%d documents - %0.3fKB (examples set)" % (len(data_examples.data), data_examples_size_kb)) argv = ["--report"] op = getOptionParser() opts = getOpts(op, argv) print( "Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, n_features=opts.n_features) X_examples = vectorizer.transform(data_examples.data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') X_examples = vectorizer.fit_transform(data_examples.data) duration = time() - t0 print("done in %fs at %0.3fkB/s" % (duration, data_examples_size_kb / duration)) print("n_samples: %d, n_features: %d" % X_examples.shape) print() # print(type(X_examples)) # print(type(X_examples.todense())) return [X_examples, y_examples, vectorizer]
def _hashing_trick(x_train, x_test, n_features, binary=True,
                   ngram_range=(1, 1)):
    df_train = pd.DataFrame(x_train.astype('str'))
    df_test = pd.DataFrame(x_test.astype('str'))

    # prefix each value with its column index so identical values in
    # different columns hash to different tokens
    for col_i in range(df_train.shape[1]):
        df_train.iloc[:, col_i] = '{}='.format(col_i) + df_train.iloc[:, col_i]
        df_test.iloc[:, col_i] = '{}='.format(col_i) + df_test.iloc[:, col_i]

    texts_train = df_train.apply(lambda row: ' '.join(row), axis=1).values
    texts_test = df_test.apply(lambda row: ' '.join(row), axis=1).values

    hv = HashingVectorizer(n_features=n_features, binary=binary,
                           ngram_range=ngram_range)
    hashed_train = hv.fit_transform(texts_train)
    hashed_test = hv.transform(texts_test)
    hashed_train, hashed_test = (np.array(hashed_train.todense()),
                                 np.array(hashed_test.todense()))
    return hashed_train, hashed_test
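# A short sketch of how _hashing_trick could be called on small categorical
# arrays; the toy data below is made up for illustration:
if __name__ == '__main__':
    import numpy as np
    x_tr = np.array([['red', 'small'], ['blue', 'large'], ['red', 'large']])
    x_te = np.array([['blue', 'small']])
    tr, te = _hashing_trick(x_tr, x_te, n_features=16)
    print(tr.shape, te.shape)  # (3, 16) (1, 16)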
def model_fit(input_song, model, feature_num=150):
    print("Process the input song...")
    with open(input_song, 'rb') as f:
        fa = LyricsToSentences(f.read())

    delete_table = "\xc3\xa2\xc2\x80\xc2\x99"
    lyrics = "".join(c for c in fa if c not in delete_table)
    lyrics = [lyrics]

    hv = HashingVectorizer(n_features=feature_num)
    trans = hv.transform(lyrics)

    # convert to dense matrix
    dense = trans.todense()
    dense = dense.tolist()

    res = []
    print("Running Model...")
    rank = zip(model.classes_, model.predict_proba(dense)[0])
    print("-----Prediction Done-----")
    print("")
    print("The prediction results are:")
    for i in range(3):
        tag, prob = sorted(rank, key=lambda x: -x[1])[i]
        print(tag)
        res.append(tag)
    return res
def model_education():
    data_train = pandas.read_csv('storage_1/data_base_semantica.csv',
                                 header=None)
    gen_text = read_row(data_train)
    data_to_learn = clean_text(get_minibatch_1(gen_text, size))
    k = 0
    cls_list = list()
    while list(data_to_learn.index):
        vectorize = HashingVectorizer(decode_error='ignore', n_features=2**21)
        classifier = SGDClassifier(loss='log', warm_start=True, n_jobs=-1,
                                   max_iter=5)
        # cls_list.append(classifier.fit(vectorize.transform(data_to_learn[1]),
        #                                data_to_learn[0]))
        classifier.fit(vectorize.transform(data_to_learn[1]), data_to_learn[0])
        _ = joblib.dump(classifier, str(k), compress=9)
        cls_list.append(str(k))
        k += size
        print('Rows trained:', k)
        try:
            data_to_learn = clean_text(get_minibatch_1(gen_text, size))
        except TypeError:
            break
    return cls_list, _
    except StopIteration:
        break
    return text, label


from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# converting texts to an occurrence matrix
vect = HashingVectorizer(n_features=2**21,
                         decode_error='ignore',
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs('movie-data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    x_train, y_train = get_minibatch(doc_stream, 1000)
    if not x_train:
        break
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()

x_test, y_test = get_minibatch(doc_stream, 5000)
x_test = vect.transform(x_test)
print('accuracy: ', clf.score(x_test, y_test))

y_pred = clf.predict_proba(vect.transform(np.array(['it was too long'])))
X_text, y = get_minibatch(doc_iter, minibatch_size) while len(X_text): yield X_text, y X_text, y = get_minibatch(doc_iter, minibatch_size) # test data statistics test_stats = {"n_test": 0, "n_test_pos": 0} # First we hold out a number of examples to estimate accuracy n_test_documents = 1000 tick = time.time() X_test_text, y_test = get_minibatch(data_stream, 1000) parsing_time = time.time() - tick tick = time.time() X_test = vectorizer.transform(X_test_text) vectorizing_time = time.time() - tick test_stats["n_test"] += len(y_test) test_stats["n_test_pos"] += sum(y_test) print("Test set is %d documents (%d positive)" % (len(y_test), sum(y_test))) def progress(cls_name, stats): """Report progress information, return a string.""" duration = time.time() - stats["t0"] s = "%20s classifier : \t" % cls_name s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats s += "accuracy: %(accuracy).3f " % stats s += "in %.2fs (%5d docs/s)" % (duration, stats["n_train"] / duration) return s
print("Training :: %d documents - " % (len(data_train.data))) print("Testing :: %d documents - " % (len(data_test.data))) print("%d categories" % len(categories)) # ## split a training set and a test set y_train, y_test = data_train.target, data_test.target print( "For training : Extracting features from the training data using a sparse vectorizer" ) t0 = time() if False: # or opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features) X_train = vectorizer.transform(data_train.data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, ngram_range=tuple([1, 3]), stop_words='english') X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs" % (duration)) #, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print( "For testing : Extracting features from the test data using the same vectorizer" ) t0 = time()
df_validation.USER_IP = df_validation.USER_IP.apply(create_ip_string) validation_data += df_validation.USER_IP #processing left features validation_data += 'revision_id=' + df_validation.REVISION_SESSION_ID + ' '\ + 'country_code=' + df_validation.USER_COUNTRY_CODE + ' '\ + 'continent_code=' + df_validation.USER_CONTINENT_CODE + ' '\ + 'region_code=' + df_validation.USER_REGION_CODE + ' '\ + 'city_name=' + df_validation.USER_CITY_NAME + ' '\ + 'county_name=' + df_validation.USER_COUNTY_NAME #feature data vec = HashingVectorizer(token_pattern="\\S+",n_features=10000000, norm=None,\ binary=True, dtype=np.uint16, lowercase=False) X_train = vec.transform(train_data) X_validation = vec.transform(validation_data) #label data df_train.ROLLBACK_REVERTED = df_train.ROLLBACK_REVERTED.replace(['F'], 0) df_train.ROLLBACK_REVERTED = df_train.ROLLBACK_REVERTED.replace(['T'], 1) y_train = df_train.ROLLBACK_REVERTED.values df_validation.ROLLBACK_REVERTED = df_validation.ROLLBACK_REVERTED.replace( ['F'], 0) df_validation.ROLLBACK_REVERTED = df_validation.ROLLBACK_REVERTED.replace( ['T'], 1) y_validation = df_validation.ROLLBACK_REVERTED.values t = time() #train
ngram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(1,4), stop_words=None, lowercase=True, tokenizer=tokenizer.tokenize, n_features=10000) #N-gram feature vectorizer character_gram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(4,5), stop_words=None, lowercase=True, analyzer='char', tokenizer=tokenizer.tokenize, n_features=22000) #Char-gram feature vectorizer n_power = float(sys.argv[1]) #parameter of the n_power transformation, I used 0.9 for submission #Linguistic, POS, sentiment disctionaries etc. pos1, pos_features1, different_pos_tags1, pos_text1 = get_pos_tags_and_hashtags(tweetText+tweetTest) #Get POS of everything pos, pos_features, different_pos_tags, pos_text = pos1[:len(categories)], pos_features1[:len(categories)], different_pos_tags1, pos_text1[:len(categories)] #Split train-test again pos_test, pos_features_test, different_pos_tags_test, pos_text_test = pos1[len(categories):], pos_features1[len(categories):], different_pos_tags1, pos_text1[len(categories):] #Split train-test again ngram_features = ngram.fit_transform(tweetText) #Get n-gram features character_gram_features = character_gram.fit_transform(tweetText) #Get char-gram features ngram_features.data **= n_power #a-power transformation character_gram_features.data **= n_power #a-power transformation ngram_features_test = ngram.transform(tweetTest) character_gram_features_test = character_gram.transform(tweetTest) ngram_features_test.data **= n_power character_gram_features_test.data **= n_power x_train, y_train = createDataMatrix(ngram_features, character_gram_features, tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories) #Combine all features (train) x_test, y_test = createDataMatrix(ngram_features_test, character_gram_features_test, tweetTest, pos_test, pos_features_test, different_pos_tags_test, pos_text_test, voca_clusters, categories_test)# Combine feat test print "SVMs crammer singer" for c in np.logspace(-3,4,8): #used 100 for submission clf = svm.LinearSVC(C=c, loss='squared_hinge', penalty='l2', class_weight='balanced', multi_class='crammer_singer', max_iter=4000, dual=True, tol=1e-6) clf.fit(x_train, y_train) print "Hold-out", showMyKLD(y_test, clf.predict(x_test), yo), c
print(vectorizer.vocabulary_)

# Transform text to vector
vector = vectorizer.transform(text)
print(vector.shape)
print(type(vector))
print(vector.toarray())
# As we can see here, only the word 'can' occurs twice
print(vector)

"""TF-IDF"""

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tdidf = TfidfVectorizer()
vectorizer_tdidf.fit(text)
print(vectorizer_tdidf.vocabulary_)
print(vectorizer_tdidf.idf_)

vector_tdidf = vectorizer_tdidf.transform([text[0]])
print(vector_tdidf.shape)
print(vector_tdidf.toarray())

"""Hashing Vectorizer"""

from sklearn.feature_extraction.text import HashingVectorizer

vectorizer_hash = HashingVectorizer(n_features=20)
vector_hash = vectorizer_hash.transform(text)
print(vector_hash.shape)
print(vector_hash.toarray())
__author__ = 'pratapdangeti'

# from sklearn.feature_extraction.text import CountVectorizer
# corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
# vectorizer = CountVectorizer(stop_words='english')
# print(vectorizer.fit_transform(corpus).todense())
# print(vectorizer.vocabulary_)

# TF-IDF
# from sklearn.feature_extraction.text import TfidfVectorizer
# corpus = [
#     'The dog ate a sandwich and I ate a sandwich',
#     'The wizard transfigured a sandwich'
# ]
#
# vectorizer = TfidfVectorizer(stop_words='english')
# print(vectorizer.fit_transform(corpus).todense())

# Using the hashing trick
from sklearn.feature_extraction.text import HashingVectorizer

corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print(vectorizer.transform(corpus).todense())
import json, sys
from sklearn.svm import LinearSVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer

if sys.version_info[0] >= 3:
    raw_input = input

transformer = HashingVectorizer(stop_words='english')

_train = []
train_label = []
f = open('training.json')
for i in range(int(f.readline())):
    h = json.loads(f.readline())
    _train.append(h['question'] + "\r\n" + h['excerpt'])
    train_label.append(h['topic'])
f.close()

train = transformer.fit_transform(_train)

svm = LinearSVC()
svm.fit(train, train_label)

_test = []
for i in range(int(raw_input())):
    h = json.loads(raw_input())
    _test.append(h['question'] + "\r\n" + h['excerpt'])

test = transformer.transform(_test)
test_label = svm.predict(test)
for e in test_label:
    print(e)
def ordering(mdb, mycursor, myresult, custid, from_time, skip, limit): print(skip, limit) client_103 = MongoClient( 'mongodb://*****:*****@49.156.128.103:27017/') mdb_103 = client_103["way2"] #user_rating_collection="user_data_new" #user_keyword_collection="newsreg-user-keywords-data" posts = [] breaking = [] push_posts = [] predict_posts = [] seen_posts = [] # t1=time.time()-t0 # print(t1) # t2=time.time() x = datetime.datetime.now() today_date = str(x).split(" ")[0] print('track_posts_cat_' + str(today_date.split("-")[2]).replace("0", "") + "_" + str(today_date.split("-")[1]).replace("0", "") + "_" + today_date.split("-")[0]) seen_docs = mdb_103['track_posts_cat_' + str(today_date.split("-")[2]).replace("0", "") + "_" + str(today_date.split("-")[1]).replace("0", "") + "_" + today_date.split("-")[0]].find({"custid": int(custid)}) #seen_docs=mdb[user_rating_collection].find({"custid" : int(custid),"date" :{'$in':[today_date]}},{"_id":0,"postid":1}) sp = [] for pid in seen_docs: #print(pid) sp.append(pid["postid"]) client_103.close() current_loop_date = str( datetime.datetime.fromtimestamp(from_time)).split(" ")[0] push_query = "SELECT post_id FROM push_notifications_queue WHERE lang_id=1 AND push_date='" + current_loop_date + "'" mycursor.execute(push_query) push_result = mycursor.fetchall() push_ids = [] for doc in push_result: #print("pushhhh",doc) push_ids.append(doc["post_id"]) for row in myresult: #if(row['postid'])==1958817 and row["is_breaking"]==1 and str(row["publishdate"])==today_date: #print(row) if "is_breaking" in row and row["is_breaking"] == 1 and str( row["publishdate"]) == current_loop_date: if row['postid'] in sp: seen_posts.append(row) else: breaking.append(row) elif "news_type" in row and row["news_type"] == "breaking" and str( row["publishdate"]) == current_loop_date: if row['postid'] in sp: seen_posts.append(row) else: breaking.append(row) elif row['postid'] in push_ids: if row['postid'] in sp: seen_posts.append(row) else: push_posts.append(row) else: post_doc = row key_query = "SELECT post_id,lower(tag_name) as tag_name FROM way2app.mag_post_mechine_tags WHERE post_id={0}".format( row['postid']) mycursor.execute(key_query) key_result = mycursor.fetchall() post_doc['keywords'] = [x['tag_name'] for x in key_result] s = '' if "category_name" in post_doc and post_doc["category_name"]: s = s + cat[post_doc["category_name"]] if post_doc[ "category_name"] in cat else post_doc["category_name"] if "keywords" in post_doc and post_doc["keywords"]: s = s + " " + " ".join(post_doc["keywords"]) posts.append(s) predict_posts.append(row) b_ids = [d['postid'] for d in breaking] p_ids = [d['postid'] for d in push_posts] print(b_ids, p_ids) break_plus_push = [] break_plus_push.extend(breaking) break_plus_push.extend(push_posts) break_plus_push = sorted(break_plus_push, key=itemgetter('post_gmt'), reverse=True) #print(posts) # t3=time.time()-t2 # print(t3) # t4=time.time() vectorizer = HashingVectorizer() #vectorizer=joblib.load("../models/vect_"+str(custid)) vect_posts = vectorizer.transform(posts) try: f = open("../models/" + str(custid), 'wb+') model = joblib.load(f) f.close() print("old user ", custid) pred = model.predict(vect_posts) # t5=time.time()-t4 # print(t5) #test_pred=model.predict(vectorizer.transform(["News chandrababu"])) #print("testtt ",test_pred) ins_array = [] for i in range(len(pred)): post_doc = predict_posts[i] #del post_doc['post_date'] #postid=post_doc['postid'] #post_doc['postid']=int(postid) post_doc['custid'] = custid post_doc['prediction'] = int(pred[i]) 
#print(post_doc) ins_array.append(post_doc) #print(ins_array) #ins_array = sorted(ins_array, key=itemgetter('prediction','post_gmt'), reverse=True) ins_array = sorted(ins_array, key=itemgetter('prediction'), reverse=True) #print(ins_array) #print(type(custid),type(today_date)) # ============================================================================= # cat_list=list(mdb[user_keyword_collection].find({"custid" : int(custid)},{"_id":0,"category":1})) # #print(cat_list) # cat_list=cat_list[0]["category"] # cat_list=dict(sorted(cat_list.items(),key=itemgetter(1),reverse=True)) # #print(cat_list) # cat_list=list(cat_list.keys()) # try: # cat_list.remove("News") # cat_list.remove("undefined") # cat_list.append("News") # cat_list.apped("undefined") # except: # pass # #print(cat_list) # ============================================================================= unseen_rated_posts = [] unseen_unrated_posts = [] c = [] for p in ins_array: if "category_name" in p and p["category_name"]: if p["category_name"] in cat: if cat[p["category_name"]] not in c: c.append(cat[p["category_name"]]) p["category_name"] = cat[p["category_name"]] else: if p["category_name"] not in c: c.append(p["category_name"]) if p["postid"] in sp: seen_posts.append(p) else: if p["prediction"] > 0: unseen_rated_posts.append(p) else: unseen_unrated_posts.append(p) # ============================================================================= # for e in cat_list: # try: # c.remove(e) # except: # pass # cat_list=list(cat_list) # cat_list.extend(c) # cat_list.append(None) # print(cat_list) # #pprint(unseen_unrated_posts) # srt = {b: i for i, b in enumerate(cat_list)} # unseen_unrated_posts=sorted(unseen_unrated_posts, key=lambda x: srt[x["category_name"]]) # # ============================================================================= final_array = [] #final_array.extend(breaking) #final_array.extend(push_posts) final_array.extend(break_plus_push) final_array.extend(unseen_rated_posts) final_array.extend(unseen_unrated_posts) final_array.extend(seen_posts) s_ids = [d['postid'] for d in seen_posts] print(s_ids) # t6=time.time()-t0 # print(t6) zeros = [ "daysdiff", "categoryid", "show_button", "postid", "btn_text_lang", "writer_custid", "is_ad", "whatsapp_share_count", "fb_share_count", "imgs_count", "sourceid", "lang", "post_parent" ] for post_doc in final_array: for key in post_doc: if key in post_doc and post_doc[key] is not None: post_doc[key] = str(post_doc[key]) if key in zeros and not post_doc[key]: post_doc[key] = str(0) f_ids = [d['postid'] for d in final_array] print(f_ids) unseen_length = len(break_plus_push) + len(unseen_rated_posts) + len( unseen_unrated_posts) print("old user", "unseen::", unseen_length, "seen", len(seen_posts), "skip::", skip, "limit::", limit) # ============================================================================= # try: # if skip and limit: # if unseen_length>0 and len(seen_posts)>0: # r_ids= [d['postid'] for d in final_array[:limit]] # print("c1",r_ids) # return final_array[:limit] # else: # r_ids= [d['postid'] for d in final_array[skip:skip+limit]] # print("c2",r_ids) # return final_array[skip:skip+limit] # elif skip and not limit: # if unseen_length>0 and len(seen_posts)>0: # r_ids= [d['postid'] for d in final_array[:len(final_array)-len(seen_posts)]] # print("c3",r_ids) # return final_array[:len(final_array)-len(seen_posts)] # else: # r_ids= [d['postid'] for d in final_array[skip:]] # print("c4",r_ids) # return final_array[skip:] # elif not skip and limit: # r_ids= [d['postid'] for 
d in final_array[:limit]] # print("c5",r_ids) # return final_array[:limit] # except Exception as e: # print("2c8",e) # return [] # ============================================================================= try: if skip and limit: if unseen_length > 0 and len(seen_posts) > 0: res = [] new = len(final_array) - len(seen_posts) res.extend(final_array[:new]) if len(res) >= limit: r_ids = [d['postid'] for d in res] print("1c1", r_ids) return res else: res.extend(final_array[new + skip:new + skip + limit]) r_ids = [d['postid'] for d in res] print("1c2", r_ids) return res else: r_ids = [ d['postid'] for d in final_array[skip:skip + limit] ] print("1c3", r_ids) return final_array[skip:skip + limit] elif skip and not limit: if unseen_length > 0 and len(seen_posts) > 0: res = [] new = len(final_array) - len(seen_posts) res.extend(final_array[:new]) if len(res) >= limit: r_ids = [d['postid'] for d in res] print("1c4", r_ids) return res else: res.extend(final_array[new + skip:]) r_ids = [d['postid'] for d in res] print("1c5", r_ids) return res else: r_ids = [d['postid'] for d in final_array[skip:]] print("1c6", r_ids) return final_array[skip:] elif not skip and limit: r_ids = [d['postid'] for d in final_array[:limit]] print("1c7", r_ids) return final_array[:limit] except Exception as e: print("1c8", e) return [] except: print("new user ", custid) unseen_posts = [] for p in predict_posts: if p["postid"] in sp: seen_posts.append(p) else: unseen_posts.append(p) final_array = [] #final_array.extend(breaking) #final_array.extend(push_posts) final_array.extend(break_plus_push) final_array.extend(unseen_posts) final_array.extend(seen_posts) zeros = [ "daysdiff", "categoryid", "show_button", "postid", "btn_text_lang", "writer_custid", "is_ad", "whatsapp_share_count", "fb_share_count", "imgs_count", "sourceid", "lang", "post_parent" ] for post_doc in final_array: for key in post_doc: if key in post_doc and post_doc[key] is not None: post_doc[key] = str(post_doc[key]) if key in zeros and not post_doc[key]: post_doc[key] = str(0) f_ids = [d['postid'] for d in final_array] print(f_ids) unseen_length = len(break_plus_push) + len(unseen_posts) # ============================================================================= # try: # if len(unseen_length)>0 and len(seen_posts)>0: # r_ids= [d['postid'] for d in final_array[len(final_array)-len(seen_posts)]] # print(r_ids) # return final_array[len(final_array)-len(seen_posts)] # else: # r_ids= [d['postid'] for d in final_array[skip:]] # print(r_ids) # return final_array[skip:] # except Exception as e: # print("exception occured",e) # return [] # ============================================================================= print("new user", "unseen::", unseen_length, "seen", len(seen_posts), "skip::", skip, "limit::", limit) try: if skip and limit: if unseen_length > 0 and len(seen_posts) > 0: res = [] new = len(final_array) - len(seen_posts) res.extend(final_array[:new]) if len(res) >= limit: r_ids = [d['postid'] for d in res] print("2c1", r_ids) return res else: res.extend(final_array[new + skip:new + skip + limit]) r_ids = [d['postid'] for d in res] print("2c2", r_ids) return res else: r_ids = [ d['postid'] for d in final_array[skip:skip + limit] ] print("2c3", r_ids) return final_array[skip:skip + limit] elif skip and not limit: if unseen_length > 0 and len(seen_posts) > 0: res = [] new = len(final_array) - len(seen_posts) res.extend(final_array[:new]) if len(res) >= limit: r_ids = [d['postid'] for d in res] print("2c4", r_ids) return res else: 
res.extend(final_array[new + skip:]) r_ids = [d['postid'] for d in res] print("2c5", r_ids) return res else: r_ids = [d['postid'] for d in final_array[skip:]] print("2c6", r_ids) return final_array[skip:] elif not skip and limit: r_ids = [d['postid'] for d in final_array[:limit]] print("2c7", r_ids) return final_array[:limit] except Exception as e: print("2c8", e) return []
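# A minimal, self-contained sketch of the skip/limit paging rule used in the
# ranking function above, shown in isolation so the branching is easier to
# follow. The helper name paginate_feed and its arguments are hypothetical and
# not part of the original code: unseen posts are always served before seen
# posts, and skip only starts consuming seen posts once the unseen ones run out.
def paginate_feed(unseen_posts, seen_posts, skip=0, limit=0):
    feed = unseen_posts + seen_posts
    if skip and limit:
        if unseen_posts and seen_posts:
            # serve every unseen post first, then page into the seen ones
            res = list(unseen_posts)
            if len(res) < limit:
                res.extend(feed[len(unseen_posts) + skip:
                                len(unseen_posts) + skip + limit])
            return res
        return feed[skip:skip + limit]
    if skip:
        return feed[skip:]
    if limit:
        return feed[:limit]
    return feed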
def tokenizer(text):
    # strip HTML tags, keep emoticons, lower-case and drop punctuation/stop words
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1)

df = pd.read_csv('./movie_data_small.csv', encoding='utf-8')
#df.loc[:100, :].to_csv('./movie_data_small.csv', index=None)
X_train = df['review'].values
y_train = df['sentiment'].values
X_train = vect.transform(X_train)
clf.fit(X_train, y_train)

# persist the stop-word list and the fitted classifier for later reuse
pickle.dump(stop, open('stopwords.pkl', 'wb'), protocol=4)
pickle.dump(clf, open('classifier.pkl', 'wb'), protocol=4)
def tokenizer(text):
    # same preprocessing that was used when the classifier was trained
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# load the previously pickled SGD classifier
file_path = parenr_dir_path + '/classifier.pkl'
clf = pickle.load(open(file_path, 'rb'))

label = {0: 'negative', 1: 'positive'}
example = ['this movie is fun.']
X = vect.transform(example)
print(clf.predict(X))
print(clf.predict_proba(X))
print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))
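# A small, hypothetical follow-up sketch: because SGDClassifier supports
# partial_fit, the unpickled model above could also be updated online with user
# feedback and re-saved. The classify/train helper names and the feedback value
# are illustrative assumptions, not part of the original snippet.
def classify(document):
    X = vect.transform([document])
    y = clf.predict(X)[0]
    proba = np.max(clf.predict_proba(X))
    return label[y], proba

def train(document, y):
    X = vect.transform([document])
    clf.partial_fit(X, [y])

pred, proba = classify('this movie is fun.')
train('this movie is fun.', 1)   # assume the user confirmed "positive"
pickle.dump(clf, open(file_path, 'wb'), protocol=4)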
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(sdf['name_full'],
                                                    y,
                                                    test_size=0.2,
                                                    random_state=21,
                                                    stratify=y)

# Learning the parameters using HashingVectorizer
# (character 2-4 grams are giving the best accuracy right now)
vect = HashingVectorizer(analyzer='char',
                         n_features=325000,
                         ngram_range=(2, 4),
                         lowercase=False).fit(X_train)

# Learning the parameters using TfidfVectorizer
#vect = TfidfVectorizer(analyzer='char', min_df=30, max_df=0.3, norm='l2',
#                       ngram_range=(2, 4), lowercase=False).fit(X_train)

# Transforming the training data
X_train_transform = vect.transform(X_train)
# Transforming the test data
X_test_transform = vect.transform(X_test)

# Model fitting
model = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train_transform, y_train)

# Model predictions on the test data
svcPredictions = model.predict(X_test_transform)
svcAccuracy = accuracy_score(y_test, svcPredictions)
print("SVM Accuracy using HashingVectorizer:", svcAccuracy)

# Plotting the confusion matrix
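# A minimal sketch of the confusion-matrix plot announced above, assuming
# matplotlib is available; the figure styling is illustrative only.
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, svcPredictions)
fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
fig.colorbar(im, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('LinearSVC confusion matrix (HashingVectorizer features)')
plt.show()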
class RCTRobot: def __init__(self): self.svm_clf = MiniClassifier( os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz')) cnn_weight_files = glob.glob( os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5')) json_filename = os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_cnn_structure.json') self.cnn_clfs = [ get_model(json_filename, cnn_weight_file) for cnn_weight_file in cnn_weight_files ] self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english') self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join( robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck')) self.scale_constants = { 'cnn': { 'mean': 0.15592811611054261, 'std': 0.22405916984696986, 'weight': 1.6666666666666667 }, 'ptyp': { 'mean': 0.055155532891381948, 'std': 0.22828359573751594 }, 'svm': { 'mean': -0.75481403525485891, 'std': 0.7812955939364481, 'weight': 10.0 } } # weighted in mean since we use only 1 SVM model (since produces near identical results to binning 10) and 6 CNN models (since runs faster, and no further reduction in variance for further models) self.thresholds = { 'cnn': { 'precise': 2.1340457758193034, 'sensitive': -0.076709540491855063 }, 'cnn_ptyp': { 'precise': 3.529609848417909, 'sensitive': 0.083502632442633312 }, 'svm': { 'precise': 1.9185522606237164, 'sensitive': 0.093273630980694439 }, 'svm_cnn': { 'precise': 1.8749128673557529, 'sensitive': 0.064481902000491614 }, 'svm_cnn_ptyp': { 'precise': 3.7674045603568755, 'sensitive': 0.1952449060483534 }, 'svm_ptyp': { 'precise': 3.7358855328111837, 'sensitive': 0.42992224964656178 } } # All precise models have been calibrated to 97.6% sensitivity # All sensitive models have been calibrated to 99.1% sensitivity def annotate(self, data): # use the best performing models from the validation paper (in draft...) 
filter_class = "svm_cnn_ptyp" threshold_class = "precise" if data.get("abstract") is not None and data.get("title") is not None: ti = data["title"] ab = data["abstract"] elif data.get("parsed_text") is not None: # then just use the start of the document TI_LEN = 30 AB_LEN = 500 # best guesses based on sample of RCT abstracts + aiming for 95% centile ti = data['parsed_text'][:TI_LEN].text ab = data['parsed_text'][:AB_LEN].text else: # else can't proceed return data if "pubmed" in data.data: ptyp = 1.0 else: ptyp = 0.0 X_ti_str = [ti] X_ab_str = ['{}\n\n{}'.format(ti, ab)] if "svm" in filter_class: X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str)) X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str)) svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab])) svm_scale = (svm_preds - self.scale_constants['svm']['mean'] ) / self.scale_constants['svm']['std'] if "ptyp" in filter_class: ptyp = np.array([ptyp]) ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean'] ) / self.scale_constants['ptyp']['std'] if "cnn" in filter_class: X_cnn = self.cnn_vectorizer.transform(X_ab_str) cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs] cnn_preds = np.vstack(cnn_preds) cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean'] ) / self.scale_constants['cnn']['std'] if filter_class == "svm": y_preds = svm_scale elif filter_class == "svm_ptyp": y_preds = svm_scale + ptyp_scale elif filter_class == "ptyp": y_preds = ptyp_scale elif filter_class == "svm_cnn_ptyp": weights = [self.scale_constants['svm']['weight']] + ( [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs)) y_preds = np.average(np.vstack([cnn_scale, svm_scale]), axis=0, weights=weights) + ptyp_scale structured_data = { "is_rct": bool(y_preds[0] > self.thresholds[filter_class][threshold_class]), "decision_score": y_preds[0], "model_class": filter_class } data.ml["rct"] = structured_data return data @staticmethod def get_marginalia(data): """ Get marginalia formatted for Spa from structured data """ marginalia = [{ "type": "Trial Design", "title": "Is an RCT?", "annotations": [], "description": "{0} (Decision score={1:0.2f} using {} model)".format( data["rct"]["is_rct"], data["rct"]["decision_score"], data["rct"]["model_class"]) }] return marginalia
xml_content = xml_processor(xml_file.read()) assert type(xml_content) == str yield xml_content print "sent file {0}, named \n {1} to processing".format(i, paths[i]) i += 1 # First try producing features with Hashing Vectorizer, # Which returns a scipy_sparse matrix with shape # (n_samples, 2 ** 20 features). Has some downsides and # may not be useable in training if op.vectorizer == "hashing": # first use simple word tokens (whitespace sperated?) word_hasher = HashingVectorizer() hashed_sparse_mat = word_hasher.transform( generate_xml_paths(train_paths, test_paths) ) print hashed_sparse_mat print type(hashed_sparse_mat) # Save the matrix as follows io.mmwrite("../data/features/naive_word_hashed_full_features.mtx", hashed_sparse_mat) elif op.vectorizer == "hash_4gram_tfidf": # pipe vectorizer with ngrams and tfidf pipe = make_pipeline( HashingVectorizer(ngram_range=(1, 4)), TfidfTransformer() ) hashed_sparse_mat = pipe.fit_transform(
tanonymous = np.array(tanonymous) tpromotedto = np.array(tpromotedto) tnumanswers = np.array(tnumanswers) tnotopics = np.array(tnotopics) tcontextfollowers = np.array(tcontextfollowers) ttopicsfollowers = np.array(ttopicsfollowers) print "extracting features" quevectorizerTfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english') topvectorizerTfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english') quevectorizerHash = HashingVectorizer(stop_words='english',non_negative=True, n_features=1000) topvectorizerHash = HashingVectorizer(stop_words='english',non_negative=True, n_features=1000) #quesparseHash = quevectorizerHash.transform(question) topsparseHash = topvectorizerHash.transform(topics) #tquesparseHash = quevectorizerHash.transform(tquestion) ttopsparseHash = topvectorizerHash.transform(ttopics) cfscaler = preprocessing.StandardScaler().fit(contextfollowers) tfscaler = preprocessing.StandardScaler().fit(topicsfollowers) cfscaled = cfscaler.transform(contextfollowers) tfscaled = tfscaler.transform(topicsfollowers) tcfscaled = cfscaler.transform(tcontextfollowers) ttfscaled = tfscaler.transform(ttopicsfollowers) def benchmark(clf, trainx, trainy, test, dataset): print 80 * '_' print "Training..." print clf
train_data.append(list[1]) train_target.append(labeldict.get(list[0])) print(cnt) categories = dict.keys() #print(train_data) # split a training set and a test set y_train = train_target print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features) X_train = vectorizer.transform(train_data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') X_train = vectorizer.fit_transform(train_data) duration = time() - t0 print("Extracting features from the test data using the same vectorizer") t0 = time() duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, duration)) print() if opts.use_hashing:
H_df = pd.read_csv(full_data_set, usecols=[5, 8]) """### Counting the Full Data""" # counting the number of ratings in the full set rating_counts_full = H_df.groupby('Rating')['Rating'].count() rating_counts_full.head() # getting the ratios of the ratings rating_counts_full / len(H_df) H_df.describe() """##Creating the HashingVectorizer""" # creates the HashingVectorizer that will be used with the full data H_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False) H_X = H_vectorizer.transform(H_df['Text']) H_y = H_df['Rating'] # splits the data 80/20 with random state 55 H_X_train, H_X_test, H_y_train, H_y_test = train_test_split(H_X, H_y, test_size=0.2, random_state=55) """##NB Classifier with HashingVectorizer""" H_modelNB = MultinomialNB() H_modelNB.fit(H_X_train, H_y_train) H_y_predNB = H_modelNB.predict(H_X_test) results_function(H_modelNB, H_X_test, H_y_test, H_y_predNB) """##kNN Classifier with HashingVectorizer and K = 3"""
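# A minimal sketch of the kNN model announced by the heading above, assuming the
# same 80/20 split and the results_function helper from earlier still apply; the
# H_modelKNN and H_y_predKNN names are illustrative.
from sklearn.neighbors import KNeighborsClassifier

H_modelKNN = KNeighborsClassifier(n_neighbors=3)
H_modelKNN.fit(H_X_train, H_y_train)

H_y_predKNN = H_modelKNN.predict(H_X_test)
results_function(H_modelKNN, H_X_test, H_y_test, H_y_predKNN)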