def learning_curve(): n = 50000 nsteps = 10 full = cu.get_sample_data_frame(n) data = full.ix[0 : int(n * 0.6) - 1].reset_index() cval = full.ix[int(n * 0.6) : int(n * 0.8) - 1].reset_index() test = full.ix[int(n * 0.8) : n - 1].reset_index() step = len(data) / nsteps ndata = len(data) mvec = range(step, ndata + step, step) test_features = features.extract_features(test) data_error = [] cval_error = [] for i in range(len(mvec)): m = mvec[i] print "running for size", m train = data.ix[0 : m - 1].reset_index() fea = features.extract_features(train) rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=False, n_jobs=5) rf.fit(fea, train["OpenStatus"]) new_priors = cu.load_priors("train.csv") old_priors = cu.compute_priors(train.OpenStatus) # predict train probs = rf.predict_proba(fea) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) y_true = compute_y_true(train) score = multiclass_log_loss(y_true, probs) data_error.append(score) # predict cval probs = rf.predict_proba(test_features) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) y_true = compute_y_true(test) score = multiclass_log_loss(y_true, probs) cval_error.append(score) return mvec, data_error, cval_error
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1) rf.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs) score()
def main(): print("Reading the data") data = cu.get_dataframe(train_little) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") #classifier = MultinomialNB() #classifier = KNeighborsClassifier(n_neighbors=3, weights='distance') classifier = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1) #classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1) classifier.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_litte) test_features = features.extract_features(feature_names, data) probs = classifier.predict_proba(test_features) #print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) #old_priors = cu.get_priors(train_file) #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_litte) cu.write_submission(submission_litte, probs)
def test_make_dataset_from_sgf(self):
    with tempfile.NamedTemporaryFile() as sgf_file, \
            tempfile.NamedTemporaryFile() as record_file:
        sgf_file.write(TEST_SGF.encode('utf8'))
        sgf_file.seek(0)
        preprocessing.make_dataset_from_sgf(
            utils_test.BOARD_SIZE, sgf_file.name, record_file.name)
        recovered_data = self.extract_data(record_file.name)
    start_pos = go.Position(utils_test.BOARD_SIZE)
    first_move = coords.from_sgf('fd')
    next_pos = start_pos.play_move(first_move)
    second_move = coords.from_sgf('cf')
    expected_data = [
        (
            features.extract_features(utils_test.BOARD_SIZE, start_pos),
            preprocessing._one_hot(utils_test.BOARD_SIZE, coords.to_flat(
                utils_test.BOARD_SIZE, first_move)),
            -1
        ),
        (
            features.extract_features(utils_test.BOARD_SIZE, next_pos),
            preprocessing._one_hot(utils_test.BOARD_SIZE, coords.to_flat(
                utils_test.BOARD_SIZE, second_move)),
            -1
        )
    ]
    self.assertEqualData(expected_data, recovered_data)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) data['OpenStatusMod'] = data['OpenStatus'].map(convert_status) #print(data['OpenStatusMod']) print("Extracting features") fea = features.extract_features(feature_names, data) #print(fea.columns) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1, random_state = 0) print("Training the model, created RFC") #rf.fit(fea, data["OpenStatus"]) rf.fit(fea, data["OpenStatusMod"]) print("Reading test file and making predictions") #data = cu.get_dataframe(test_file) data = cu.get_dataframe(full_train_file) print("Reading data frame") data['OpenStatusMod'] = data['OpenStatus'].map(convert_status) print("adding column") test_features = features.extract_features(feature_names, data) print("extract features") probs = rf.predict_proba(test_features) # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # print "new priors %s" %(new_priors) # print "old priors %s" %(old_priors) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() print("Reading the data from " + train_file) data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2) clf.fit(fea, data["OpenStatus"]) print "Listing feature importances:" cu.list_feature_importance(clf,feature_names) print("Reading test file and making predictions: " + test_file) data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = clf.predict_proba(test_features) if (update_posteriors): print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names,data) percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" percep.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs #probs = percep.predict_proba(test_fea) # only available for binary classification probs = percep.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names,data) mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" mten.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs probs = mten.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) if is_full_train_set == 0: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): # The number of documents to analyze each iteration batchsize = 100 # The total number of questions on Stack Overflow D = 3.3e6 # The number of topics K = 20 # Make sure the topics are included as features for analysis feature_names.extend('Topic%d' % k for k in range(K)) print("Reading the vocabulary") vocab = [w.strip() for w in file('./vocab4.txt')] # How many words are in the vocabulary W = len(vocab) print("Reading the data") data = cu.get_dataframe(train_file) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) print("Allocating the topics") allocate_topics(lda, data, K, batchsize, D) print("Extracting features") fea = extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=4) rf.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) allocate_topics(lda, data, K, batchsize, D) test_features = extract_features(feature_names, data) probs = rf.predict_proba(test_features) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def make_submission(): data = None if os.path.exists('data.pik'): print("Unpickeling the data") data = pickle.load(open('data.pik')) else: print("Reading the data") data = cu.get_dataframe(full_train_file) pickle.dump(data,open('data.pik','w')) fea = None if os.path.exists('fea.pik'): print("Unpickeling the fea") fea = pickle.load(open('fea.pik')) else: print("Extracting features") fea = features.extract_features(feature_names, data) pickle.dump(fea,open('fea.pik','w')) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, oob_score=True, #criterion='entropy', n_jobs=2) rf.fit(fea, data["OpenStatus"]) print "Features Importance:" imps = zip(rf.feature_importances_, feature_names,) imps.sort(reverse=True) print '\n'.join([ str(_) for _ in imps ]) print "Generalization Error:", rf.oob_score_ print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) if True: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def gen_sentiment_vectors(reviews, word_lists, popular_words):
    sentiment_vectors = dict()
    for i in reviews:
        doc_tag = reviews[i]['title']
        prev_sentiment = None
        for line in reviews[i]['reviews']:
            if line == ("<p>", ) or line == ("</p>", ):
                continue
            sentiment = line[0]
            sentence = line[1]
            features = f.extract_features(sentence, word_lists, popular_words, doc_tag, prev_sentiment)
            if sentiment in sentiment_vectors:
                sentiment_vectors[sentiment] = f.merge_features(sentiment_vectors[sentiment], features)
            else:
                sentiment_vectors[sentiment] = features
            prev_sentiment = sentiment
        # if i % 20 == 0:
        #     print "Done with " + str(i)
    for sentiment in sentiment_vectors:
        sentiment_vectors[sentiment] = f.smooth_features(sentiment_vectors[sentiment])
    return sentiment_vectors
def cross_validate(): print("Reading the data") data = cu.get_dataframe(train_file) print("Cross-Validating") rf = RandomForestClassifier(n_estimators=10, verbose=1, compute_importances=True, n_jobs=2) cv = cross_validation.KFold(len(data), k=10, indices=False) results = [] for traincv, testcv in cv: print "\t-- cv [%d]"%len(results) print "\t","extracting features" #... feacv = features.extract_features(feature_names, traincv) print "\t","learning" rf.fit(feacv, data["OpenStatus"]) print "\t","predicting" probs = rf.predict_proba(testcv) print "\t","evaluating" results.append( llfun(target[testcv], [x["OpenStatus"] for x in probas]) ) print "LogLoss: " + str( np.array(results).mean() )
def create_dataset(split=0.5, size=None):
    """ Reads in a set of texts and split into train and test sets, tagged by author """
    train_data, test_data = [], []
    max_feats = defaultdict(float)
    for file_name in os.listdir(path)[:size]:
        base_name = os.path.basename(file_name)
        author = base_name.split('_', 1)[0]
        print "Reading in from %s" % base_name
        with codecs.open(os.path.join(path, file_name), 'r', 'utf8') as doc:
            content = doc.read()
            feat_vec = features.extract_features(content)
            for feat, value in feat_vec.iteritems():
                if value > max_feats[feat]:
                    max_feats[feat] = value
            length = len(feat_vec)
            feat_vec['author'] = author
            if random.random() < split:
                train_data.append(feat_vec)
            else:
                test_data.append(feat_vec)
    print "Normalizing..."
    for feat_vec in train_data:
        for feat in feat_vec:
            feat_vec[feat] /= max_feats[feat]
    for feat_vec in test_data:
        for feat in feat_vec:
            feat_vec[feat] /= max_feats[feat]
    return train_data, test_data
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Writing short sample features file") ''' preview in console ''' print(fea.values[:4]) print fea.describe().to_string() ''' save the X features data (matrix)''' # cu.write_submission(train_features_short_file, fea.values) np.savetxt(train_features_short_file, fea.values, fmt='%d', delimiter=',', newline='\n') '''train_features_short = [fea, data["OpenStatus"]]''' closed_reasons = data["OpenStatus"] closed_reasons_count = Counter(closed_reasons) print(closed_reasons_count.keys()[0:5]) closed_reasons_enum = map(closed_reasons_count.keys().index, closed_reasons) print(closed_reasons_enum[:9]) print("Saving submission to %s" % submission_file) ''' save the y supervised classification data (vector) ''' np.savetxt(train_y_short_file, closed_reasons_enum, fmt='%d', delimiter=',', newline='\n') '''
def train(samples, vocabulary):
    logger.debug("Extracting features")
    X = []
    for s in samples:
        X.append(extract_features(s[0], vocabulary))
    X = sp.vstack(X, format="csr")
    y = np.array([s[1] for s in samples])
    clf = RandomForestClassifier(n_estimators=30)
    if args["-c"]:
        logger.debug("Performing N-fold cross-validation (N=%s)" % args["-f"])
        scores = cross_validation.cross_val_score(
            clf, X.toarray(), y,
            n_jobs=int(args["-j"]), cv=int(args["-f"]), scoring=args["-s"]
        )
        print("F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
    logger.debug("Training model on all data")
    clf.fit(X.toarray(), y)
    logger.debug("Done, returning model and vocabulary")
    return (clf, vocabulary)
def predict(clf, ua, vocabulary):
    """Predict if a patient is diagnosed with a disease."""
    X = extract_features(ua, vocabulary)
    pred = clf.predict(X.toarray())
    return X, pred
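# A minimal usage sketch for the predict() above. The user-agent string and the
# fitted clf/vocabulary are hypothetical; the function returns both the feature
# matrix and the predicted label array.
X, pred = predict(clf, "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", vocabulary)
print(pred[0])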
def preprocess(file):
    print(' Tokenizing...')
    tokens = [line.strip() for line in file]
    # Generate all word POS tags
    print(' Tagging parts of speech...')
    pos_tagged = nltk.pos_tag(tokens)
    # Generate all word lemma forms
    print(' Finding lemma forms...')
    l = lemmatizer()
    lemma_tagged = [{'word': token, 'lemma': l(token, pos), 'pos': pos} for (token, pos) in pos_tagged]
    # Make a list of lists
    # Go through all words, appending to a list within the list
    # If the word POS is '.', then flush the current list to the sentence list with that word
    # Then start working with a new list
    sentence_chunked = []
    current_sentence = []
    for word_form in lemma_tagged:
        current_sentence.append(word_form)
        if word_form['pos'] == '.':
            sentence_chunked.append(current_sentence)
            current_sentence = []
    feature_tagged = []
    for sentence in sentence_chunked:
        for i in range(0, len(sentence)):
            word_form = sentence[i]
            feature_tagged.append(extract_features(word_form, i, sentence))
    return feature_tagged
def train(sentences):
    """Train NER tagger.

    Parameters
    ----------
    sentences : iterable over list
        A sequence of lists of tokens.
    """
    if not isinstance(sentences, list):
        sentences = list(sentences)
    logger.debug("Extracting features")
    vocabulary = dict((t[0], i) for s in sentences for i, t in enumerate(s))
    X = []
    for i, s in enumerate(sentences):
        X.append(extract_features(s, vocabulary))
    X = sp.vstack(X, format='csr')
    # FIXME Only BIO tags for now
    y = np.array([bio_int[tok[2][0]] for s in sentences for tok in s])
    params = {
        "loss": ["l1", "l2"],
        "multi_class": [True, False],
        "C": [1., 10., 100.],
    }
    logger.debug("Training linear SVMs")
    clf = GridSearchCV(LinearSVC(), params, n_jobs=-1).fit(X, y)
    logger.debug("Done, returning the best one")
    return (clf.best_estimator, vocabulary)
def classify(text):
    s = score(extract_features(text))
    print s
    if s <= MAX_HAM_SCORE:
        return "ham"
    elif s >= MIN_SPAM_SCORE:
        return "spam"
    else:
        return "unsure"
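# Hypothetical threshold configuration for the score() gate above; the real
# MAX_HAM_SCORE and MIN_SPAM_SCORE values are defined elsewhere in this project.
MAX_HAM_SCORE = 0.4   # at or below this, treat the message as ham
MIN_SPAM_SCORE = 0.6  # at or above this, treat the message as spam
print(classify("limited time offer, click the link now"))  # "spam", "ham" or "unsure"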
def compile_data(input_file, label_file):
    cf = ff.extract_features(input_file)
    truth = la.do_label(cf, label_file)
    data = []
    # we can throw away the time stamps now
    for x in cf:
        data.append(x[3])
    return data, truth
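# A minimal usage sketch (both file names are hypothetical; compile_data()
# pairs each de-timestamped feature row with its label from the label file):
data, truth = compile_data('accel_log.csv', 'labels.csv')
print(len(data), len(truth))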
def operate(word_form, i, sentence):
    feature_set = extract_features(word_form, i, sentence)
    key = generate_key(word_form)
    # Increment total count for sense
    output[key][0] += 1
    # For each found feature, increment
    # count for that feature-value pair
    for feature, value in feature_set.items():
        output[key][1][feature][value] += 1
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # data = full.ix[len(full)/4:].reset_index()      # last n/4 * 3 records
    # test = full.ix[:(len(full)/4)-1].reset_index()  # first n/4 records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
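# A minimal usage sketch (sample sizes are arbitrary; measure_model() returns
# the hold-out multiclass log loss together with the fitted forest and the
# training feature frame):
score, rf_model, train_fea = measure_model(datasize=1000, testsize=500)
print("hold-out multiclass log loss: %f" % score)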
def make_dataset_from_selfplay(data_extracts):
    '''
    Returns an iterable of tf.Examples.
    Args:
        data_extracts: An iterable of (position, pi, result) tuples
    '''
    tf_examples = (make_tf_example(features_lib.extract_features(pos), pi, result)
                   for pos, pi, result in data_extracts)
    return tf_examples
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") X = features.extract_features(feature_names, data) y = [ class_labels[i] for i in data["OpenStatus"]] skf = StratifiedKFold(y, 10) result_f1 = 0 result_logloss = 0 fold = 1 for train, test in skf: print "Fold %d" % fold fold+=1 X_train = [X.ix[i] for i in train] y_train = [y[i] for i in train] X_test = [X.ix[i] for i in test] y_test = [y[i] for i in test] if (options.__dict__['classifier'] == 'erf'): classifier = ExtraTreesClassifier(n_estimators=100, verbose=0, compute_importances=True, n_jobs=-1) elif(options.__dict__['classifier'] == 'mnb'): classifier = MultinomialNB() elif (options.__dict__['classifier'] == 'knn'): classifier = KNeighborsClassifier(n_neighbors=11) elif (options.__dict__['classifier'] == 'gbc'): classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1) classifier.fit(X_train, y_train) probs = classifier.predict_proba(X_test) if (options.__dict__['priors'] != 0): print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) y_pred = probs y_test = np.array(y_test) logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15) y_pred = classifier.predict(X_test) f1 = f1_score(y_test, y_pred, pos_label=None) print "Log Loss: %f f1: %f" % (logloss, f1) result_f1 += f1 result_logloss += logloss print '\navg LogLoss: %f avg f1: %f' % (result_logloss/10.0, result_f1/10.0)
def predict(self, ua):
    """Predict if a patient is diagnosed with a disease."""
    if re.search(r"urllib|nagios|spider|bot|google|http_request|jeeves|yahoo|http", ua, re.IGNORECASE) is not None:
        return True
    X = extract_features(ua, self.vocabulary)
    pred = self.clf.predict(X.toarray())
    if not pred[0]:
        return True
    return False
def predict(clf, sentence, vocabulary):
    """Predict BIO labels for a single sentence."""
    X = extract_features(sentence, vocabulary)
    pred = [int_bio[y] for y in clf.predict(X)]
    # Heuristic repair: make output consistent,
    # but never worse than the raw prediction.
    for i in xrange(len(pred)):
        if pred[i] == "I" and (i == 0 or pred[i - 1] == "O"):
            pred[i] = "B"
    return pred
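# A standalone sketch of the same BIO repair heuristic on a toy label sequence
# (no classifier involved; the labels below are made up for illustration):
def repair_bio(pred):
    pred = list(pred)
    for i in range(len(pred)):
        # an "I" that starts the sentence or follows an "O" cannot be a
        # continuation of an entity, so promote it to "B"
        if pred[i] == "I" and (i == 0 or pred[i - 1] == "O"):
            pred[i] = "B"
    return pred

print(repair_bio(["I", "I", "O", "I", "B", "I"]))  # ['B', 'I', 'O', 'B', 'B', 'I']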
def profile_one_dim(im):
    im = gray_level(im)
    print(np.shape(im))
    vertical_sum = np.sum(im, axis=0) / np.shape(im)[1]
    fig = plt.figure(0)
    fig.canvas.set_window_title('Projection Profile - ' + filename)
    plt.plot(vertical_sum)
    # plt.show()
    P, X, Y = zone_division(im, vertical_sum)
    density_symmetry, roughness_max, roughness_symmetry = extract_features(im, P, X, Y)
    fv = feature_vector(density_symmetry, roughness_max, roughness_symmetry, filename)
    all_vector.append(fv)
def main(): print("Reading the data") train_data = cu.get_dataframe(train_file) print("Extracting features") train_features = features.extract_features(feature_names, train_data) print("Reading test file and making predictions") test_data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, test_data) # print("Training random forest") # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1) # rf.fit(train_features, train_data["OpenStatus"]) # probs = rf.predict_proba(test_features) # print("Training decision tree") # dt = DecisionTreeClassifier() # dt.fit(train_features, train_data["OpenStatus"]) # probs = dt.predict_proba(test_features) # print("Training adaboost") # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"]) # probs = ada.predict_proba(test_features) print("Training nearest neighbors") scaler = preprocessing.StandardScaler().fit(train_features) train_features_scaled = scaler.transform(train_features) test_features_scaled = scaler.transform(test_features) nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled, train_data["OpenStatus"]) probs = nbrs.predict_proba(test_features_scaled) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) actual = cu.get_actual(test_data["OpenStatus"]) print(cu.get_log_loss(actual, probs, 10**(-15)))
def calc_sentiment_prob(vectors, sent_counts, sentence, sentiment, word_lists, popular_words, doc_tag=None, prev_sentiment=None):
    features = f.extract_features(sentence, word_lists, popular_words, doc_tag, prev_sentiment)
    prob = math.log(float(sent_counts[sentiment]) / float(sent_counts['total']))
    for feature in features:
        if feature not in vectors[sentiment]:
            feature = "<UNK>"
        feature_count = vectors[sentiment][feature]
        prob += math.log(float(feature_count + 1) / float(sent_counts[sentiment] + len(vectors[sentiment])))
    return prob
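# The loop above is add-one (Laplace) smoothed Naive Bayes in log space.
# A tiny standalone illustration with made-up counts (not this project's data):
import math

sent_count = 10      # sentences observed with this sentiment
vocab_size = 4       # distinct features seen for this sentiment (incl. "<UNK>")
feature_count = 3    # times this feature co-occurred with the sentiment

log_term = math.log(float(feature_count + 1) / float(sent_count + vocab_size))
print(log_term)  # log(4/14) ~= -1.2528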
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1) rf.fit(fea, data["OpenStatus"]) gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0) gb.fit(fea, data["OpenStatus"]) dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0) dt.fit(fea, data["OpenStatus"]) et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) et.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) probs2 = gb.predict_proba(test_features) probs3 = dt.predict_proba(test_features) probs4 = et.predict_proba(test_features) for i in range(0, len(probs)): for j in range(0,5): probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4 print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() if (use_low_mem == 1): data_iter = cu.iter_data_frames(train_file, _chunksize) i = _chunksize fea = None y = [] for train_data in data_iter: print "About to have processed: " + str(i) print("Extracting features") if fea is None: fea = features.extract_features(feature_names, train_data) else: fea = fea.append(features.extract_features(feature_names, train_data)) for element in train_data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) i = i + _chunksize else: print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) fea = features.extract_features(feature_names,data) print "Collecting statuses" y = [] for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation kf = KFold(len(y),k = 10) fold = 0 result_sum = 0 for train_index,test_index in kf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) print "fitting this fold's data" rf.fit(X_train, y_train) y_test = vectorize_actual(y_test) #_pred_probs = denormalize(rf.predict_proba(X_test)) _pred_probs = rf.predict_proba(X_test) print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001) # evaluating the performance result = eval.mcllfun(y_test,_pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold,result) print "Average MCLL score for this classifier = %0.11f" % (result_sum/10) else: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True print "Fitting" logit.fit(fea, y) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs probs = logit.predict_proba(test_fea) if is_full_train_set == 0: print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): start = time.time() result_sum = 0 data = cu.get_dataframe("data/train-sample.csv") #test_data = cu.get_dataframe("data/public_leaderboard.csv") #use this for evaluating public_leaderboard print 'data loaded' fea = features.extract_features(feature_names, data) #test_fea = features.extract_features(feature_names,test_data) #use this for evaluating public_leaderboard print 'features extracted' knn = KNeighborsClassifier(n_neighbors=10, weights='distance') # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError y = [] ques_status = [ 'open', 'too localized', 'not constructive', 'off topic', 'not a real question' ] for element in data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation skf = StratifiedKFold(y, k=10) fold = 0 for train_index, test_index in skf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] temp.append(fea['ReputationAtPostCreation'][i]) temp.append(fea['UserAge'][i]) temp.append(fea['Title'][i]) temp.append(fea['BodyMarkdown'][i]) temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] temp.append(fea['ReputationAtPostCreation'][i]) temp.append(fea['UserAge'][i]) temp.append(fea['Title'][i]) temp.append(fea['BodyMarkdown'][i]) temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i]) X_test.append(temp) y_test.append(y[i]) y_test = vectorize_actual(y_test) # vectorize y_test knn.fit(X_train, y_train) # train the classifier predictions = knn.predict_proba(X_test) # predict the test fold # evaluating the performance result = eval_tool.mcllfun(y_test, predictions) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold, result) print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10) finish = time.time() print "completed in %0.4f seconds" % (finish - start) ### Use this code for evaluting public_leaderboard '''knn.fit(fea,y)
def find_candidates(image, scale, overlap, xband, yband, svc, X_scaler, _count): image = color_scale(image) xstart, xstop = xband ystart, ystop = yband # generate HOG features over entire search region if config.HOG_FEAT: hog_conv = convert_color(image, config.HOG_SPACE) hog_region = hog_conv[ystart:ystop, xstart:xstop, :] if scale != 1.0: hog_region = cv2.resize(hog_region, (0, 0), fx=1.0 / scale, fy=1.0 / scale) hog_channel = config.HOG_CHANNEL if hog_channel == 'ALL': hog_features = [] for channel in range(hog_region.shape[2]): hog_features.append( get_hog_features(hog_region[:, :, channel], feature_vec=False)) hog_features = np.array(hog_features) else: hog_features = get_hog_features(hog_region[:, :, hog_channel], feature_vec=False)[np.newaxis, ...] # overlap = config.OVERLAP window = config.WINDOW_SIZE img_region = image[ystart:ystop, xstart:xstop, :] if scale != 1.0: img_region = cv2.resize(img_region, (0, 0), fx=1.0 / scale, fy=1.0 / scale) # start window sliding rewrite xspan = img_region.shape[1] yspan = img_region.shape[0] pix_per_step = np.int(window * (1 - overlap)) buff = np.int(window * overlap) if (xspan - buff) % pix_per_step == 0: xwindows = np.int((xspan - buff) / pix_per_step) - 1 else: xwindows = np.int((xspan - buff) / pix_per_step) if (yspan - buff) % pix_per_step == 0: ywindows = np.int((yspan - buff) / pix_per_step) - 1 else: ywindows = np.int((yspan - buff) / pix_per_step) if xwindows <= 0 or ywindows <= 0: raise Exception("Invalid config - area too small") print('scale,x,y', scale, xwindows, ywindows) boxes = [] cars = 0 for iy in range(ywindows): for ix in range(xwindows): leftx = ix * pix_per_step topy = iy * pix_per_step endx = leftx + window endy = topy + window subimg = img_region[topy:endy, leftx:endx] features = extract_features(subimg) features_sc = X_scaler.transform(features.reshape(1, -1)) prediction = svc.predict(features_sc) if prediction == 1: cv2.imwrite( 'output_images/cars/' + str(_count) + '_' + str(scale) + '.png', cv2.cvtColor(subimg * 255, cv2.COLOR_RGB2BGR)) cars += 1 xbox_left = np.int(leftx * scale) ytop_draw = np.int(topy * scale) win_draw = np.int(window * scale) box = ((xbox_left + xstart, ytop_draw + ystart), (xbox_left + win_draw + xstart, ytop_draw + win_draw + ystart)) confidence = svc.decision_function(features_sc) pred_box = PredictionBox(box, confidence) boxes.append(pred_box) if cars != 0: print('!!!', scale, ':', cars) return boxes
def run(self, position):
    'Return a sorted list of (probability, move) tuples'
    processed_position = features.extract_features(position)
    probabilities = self.session.run(
        self.output,
        feed_dict={self.x: processed_position[None, :]})[0]
    return probabilities.reshape([go.N, go.N])
import numpy as np
import pickle
import cv2
import sys

from color import convert_color
from features import extract_features
from detect_with_labels import cars_from_bboxes, draw_boxes

dist_pickle = pickle.load(open("svc_classifier.p", "rb"))
svc = dist_pickle["svc"]
X_scaler = dist_pickle["X_scaler"]
p = dist_pickle["parameters"]

features = extract_features(sys.argv[1:],
                            color_space=p['color_space'],
                            spatial_size=p['spatial_size'],
                            hist_bins=p['hist_bins'],
                            orient=p['orient'],
                            pix_per_cell=p['pix_per_cell'],
                            cell_per_block=p['cell_per_block'],
                            hog_channel=p['hog_channel'],
                            spatial_feat=p['spatial_feat'],
                            hist_feat=p['hist_feat'],
                            hog_feat=p['hog_feat'])
scaled_features = X_scaler.transform(features)

for prediction in svc.predict(scaled_features):
    print('car' if prediction else 'not car')
"entropy"] class_names = ["Attention", "HornsUp", "TrailArms"] print("Extracting features and labels for window size {} and step size {}...".format(window_size, step_size)) sys.stdout.flush() n_features = len(feature_names) X = np.zeros((0,n_features)) y = np.zeros(0,) for i,window_with_timestamp_and_label in slidingWindow(data, window_size, step_size): # omit timestamp and label from accelerometer window for feature extraction: window = window_with_timestamp_and_label[:,1:-1] # extract features over window: x = extract_features(window) # append features: X = np.append(X, np.reshape(x, (1,-1)), axis=0) # append label: y = np.append(y, window_with_timestamp_and_label[5, -1]) print("Finished feature extraction over {} windows".format(len(X))) print("Unique labels found: {}".format(set(y))) sys.stdout.flush() # %%--------------------------------------------------------------------------- # # Plot data points # # -----------------------------------------------------------------------------
            while hdata[count][0] < window_with_timestamp_and_label[row][4] and count > 0:
                count = count - 1
                print("changed count ", count)
            # remove timestamps from accel data
            temp = np.vstack((temp, window_with_timestamp_and_label[row][:-2]))
            # add hr data to accel
            hr_label = np.append(hdata[count][1], hdata[count][2])
            # add in label (hr_data is on form hr, t, label)
            window_with_timestamp_and_label[row] = np.append(temp[row + 1], hr_label)
    # remove time and label for feature extraction
    window = window_with_timestamp_and_label[:, :-1]
    # extract features over window:
    # x, y, z, t (not reoriented) -> x, y, z, heart rate, label/class -> x, y, z, hr
    x = extract_features(window)
    # append features:
    # shapes into 1 row with unspecified number of columns (so just 1 row of n_features)
    X = np.append(X, np.reshape(x, (1, -1)), axis=0)
    # append label:
    y = np.append(y, window_with_timestamp_and_label[10, -1])  # we don't know why this is 10?

print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(y)))
sys.stdout.flush()

# %%---------------------------------------------------------------------------
#
def main():
    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)
            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=100,
                                        subsample=1.0, min_samples_split=1, min_samples_leaf=1,
                                        max_depth=depth, init=None, random_state=None)
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            print "Fitting for fold " + str(fold)
            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=100,
                                        subsample=1.0, min_samples_split=1, min_samples_leaf=1,
                                        max_depth=len(feature_names), init=None, random_state=None)
        rf.fit(fea, y)
        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)
        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
def main(argv): parser = argparse.ArgumentParser() parser.add_argument('mode', help='extract | tsne | umap') parser.add_argument( 'data', help= '[features]: Filepath to an image or folder containing images to extract features from. [tsne/umap]: Filepath to a .csv file to read into a DataFrame. ' ) parser.add_argument('out', help='Output filepath of operation') parser.add_argument( '--feature-cols', '-f', help= '[tsne/umap]: Numerical data column indices to treat as features. Ex: "B,C,F", use "all" to consider all columns (excluding optional unique-col).' ) parser.add_argument( '--unique-col', '-u', help= '[tsne/umap]: The column index containing unique IDs for each row (typically "ID" or "Name" column). Not required. Omitted from "all" feature-cols' ) parser.add_argument( '--reduce', '-r', help= '[tsne/umap]: How many dimensions to reduce features to. Default is 2.', default='2') parser.add_argument( '--model', '-m', help= '[features]: Which model to use. ResNet50 | Xception | VGG16 | VGG19 | InceptionV3 | MobileNet. Default: ResNet50', default='ResNet50') args = parser.parse_args(argv[1:]) # === FEATURE EXTRACTION === # We expect an image filepath or folder of images if args.mode == 'features': assert os.path.exists(args.data),\ 'Features mode (data arg): File or directory not found: "{}"'\ .format(args.data) # Calculate and write to args.out features = extract_features(args.data, model=args.model, write_to=args.out) # === DIMENSION REDUCTION === # We expect a .csv file of features elif args.mode in ['tsne', 'umap']: # Make sure we know what columns are intended to be used numerically as a list of strings, or 'all' feature_cols = args.feature_cols if feature_cols is None: raise Exception( 'Feature reduction mode: No data column indices provided. Example usage: "--feature-cols B,C,F", "--feature-cols all"' ) elif feature_cols != 'all': feature_cols = [ s.strip() for s in feature_cols.split(',') if s.strip() != '' ] # Parse the data into a squashed pd.DataFrame with first column being unique keys df = parse_data(args.data, feature_cols, args.unique_col) if args.mode == 'tsne': tsne(df, dims=int(args.reduce), write_to=args.out) elif args.mode == 'umap': umap(df, write_to=args.out)
def yield_data(filename):
    with open(filename) as handle:
        for line in handle:
            s = line.strip().split(',')
            h1, h2 = s[-2], s[-1]
            yield from extract_features(h1, h2, include_target=True)
def predict(): """ Given a window of accelerometer data, predict the activity label. Then use the onActivityDetected(activity) function to notify the Android must use the same feature extraction that you used to train the model. """ # have to fix the window size prediction_array=np.array([]) p_time=np.array([]) clf = load("classifier.pickle") # maybe we are not even filling buffer but just running a for loop data_file_ss_11 = os.path.join('data', 'accel_data-12-08-BP-ss.csv') data_ss_11 = np.loadtxt(data_file_ss_11, delimiter=',', dtype = object, converters = {0: np.float, 1: np.float, 2: np.float, 3: lambda t: datetime.strptime(t.decode("utf-8"), "%d/%m/%Y %H:%M")}) data_ss_11 = np.insert(data_ss_11, 3, 0, axis = 1) hdata_file_ss_11 = os.path.join('data', 'BPM_2017-12-08-BP-ss.csv') hdata_ss_11 = np.loadtxt(hdata_file_ss_11, delimiter=',', dtype = object, converters = {0: lambda t: datetime.strptime(t.decode("utf-8"), "%d/%m/%Y %H:%M"), 1: np.float}) data = data_ss_11 hdata = hdata_ss_11 window_size=20 step_size=20 #because hr data in backwards count = len(hdata)-1 for i,window_with_timestamp_and_label in slidingWindow(data, window_size, step_size): temp = np.zeros((1,3)) #while time at row count is under time at accel, increase count (move to next row) #only have one window. Each row in window has own observation that needs hr for row in range(len(window_with_timestamp_and_label)): # print (hdata[count]) # print(" ") # print (window_with_timestamp_and_label[row]) while hdata[count][0] < window_with_timestamp_and_label[row][4] and count > 0: count=count-1 print("changed count ", count) if row==0: p_time=np.append(p_time,window_with_timestamp_and_label[row][4]) #remove timestamps from accel data temp = np.vstack((temp,window_with_timestamp_and_label[row][:-2])) #add hr data to accel hr_label = np.append(hdata[count][1],9) window_with_timestamp_and_label[row] = np.append(temp[row+1], hr_label) #add in label (hr_data is on form hr, t, label) #remove time and label for feature extraction window = window_with_timestamp_and_label[:,:-1] # extract features over window: # print("Buffer filled. Run your classifier.") prediction=clf.predict(np.reshape(extract_features(window),(1,-1)))[0] prediction_array= np.append(prediction_array,prediction) # print prediction # for i in range(0,len(prediction_array)): # p_time=np.append(p_time,i) plt.plot(p_time,prediction_array) plt.xlabel('Time') plt.ylabel('Predicted Label') plt.show() return
column_names_2006 = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
    'phead', 'pdeprel'
]
train_file = '../../corpus/conllx/sv/swedish_talbanken05_train.conll'
test_file = '../../corpus/conllx/sv/swedish_talbanken05_test.conll'

train_sentences = conll.read_sentences(train_file)
test_sentences = conll.read_sentences(test_file)
formatted_train_corpus = conll.split_rows(train_sentences, column_names_2006)
formatted_test_corpus = conll.split_rows(test_sentences, column_names_2006)

for mode in [1, 3]:
    print("Extracting the features...")
    X_dict, y = features.extract_features(formatted_train_corpus, mode)

    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(X_dict)

    print("Training the model...")
    classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear', multi_class='ovr')
    model = classifier.fit(X, y)
    dump(model, './clfs/logres_mode=' + str(mode) + '_feats.joblib')
    dump(vec, './vectorizers/mode=' + str(mode) + '_feats_vectorizer.joblib')

    print("Predicting the chunks in the test set...")
def train(self): # Load data cars = self.fill_data(self.root_car) notcars = self.fill_data(self.root_notcars) # Show an example of each kind and print number of examples if self.debug: test_car = visualizer.read_and_draw_image(cars[0], 'Car') test_not_car = visualizer.read_and_draw_image(notcars[0], 'No_Car') print("Number of Car examples: ", len(cars)) print("Number of Non-Car examples: ", len(notcars)) # TODO see if image ranges from 0 to 1 car_features = features.extract_features( cars[0:self.num_train_examples], color_space=self.color_space, spatial_size=self.spatial_size, hist_bins=self.hist_bins, orient=self.orient, pix_per_cell=self.pix_per_cell, cell_per_block=self.cell_per_block, hog_channel=self.hog_channel, spatial_feat=self.spatial_feat, hist_feat=self.hist_feat, hog_feat=self.hog_feat) notcar_features = features.extract_features( notcars[0:self.num_train_examples], color_space=self.color_space, spatial_size=self.spatial_size, hist_bins=self.hist_bins, orient=self.orient, pix_per_cell=self.pix_per_cell, cell_per_block=self.cell_per_block, hog_channel=self.hog_channel, spatial_feat=self.spatial_feat, hist_feat=self.hist_feat, hog_feat=self.hog_feat) if self.debug: from skimage.feature import hog car_feat_image = cv2.cvtColor(test_car, cv2.COLOR_RGB2YCrCb) for channel in range(car_feat_image.shape[2]): channel_image = car_feat_image[:, :, channel] fd, hog_image = hog(channel_image, orientations=self.orient, pixels_per_cell=(self.pix_per_cell, self.pix_per_cell), cells_per_block=(self.cell_per_block, self.cell_per_block), visualise=True) visualizer.draw_two_images(channel_image, hog_image, title='Car_Channel_' + str(channel), save=True) notcar_feat_image = cv2.cvtColor(test_not_car, cv2.COLOR_RGB2YCrCb) for channel in range(notcar_feat_image.shape[2]): channel_image = notcar_feat_image[:, :, channel] fd, hog_image = hog(channel_image, orientations=self.orient, pixels_per_cell=(self.pix_per_cell, self.pix_per_cell), cells_per_block=(self.cell_per_block, self.cell_per_block), visualise=True) visualizer.draw_two_images(channel_image, hog_image, title='No_Car_Channel_' + str(channel), save=True) # TODO normalize data # TODO try different colorspaces # TODO try color HOG X = np.vstack((car_features, notcar_features)).astype(np.float64) print(X.shape) X_scaler = StandardScaler().fit(X) self.X_scaler = X_scaler scaled_X = X_scaler.transform(X) y = np.hstack( (np.ones(len(car_features)), np.zeros(len(notcar_features)))) rand_state = np.random.randint(0, 100) X_train, X_test, y_train, y_test = train_test_split( scaled_X, y, test_size=0.2, random_state=rand_state) print('Using:', self.orient, 'orientations', self.pix_per_cell, 'pixels per cell and', self.cell_per_block, 'cells per block') print('Feature vector length:', len(X_train[0])) # Use a linear SVC svc = LinearSVC(C=1000) self.svc = svc # Check the training time for the SVC t = time.time() svc.fit(X_train, y_train) t2 = time.time() print(round(t2 - t, 2), 'Seconds to train SVC...') # Check the score of the SVC print('Test Accuracy of SVC = ', round(svc.score(X_test, y_test), 4)) # Check the prediction time for a single sample t = time.time() n_predict = 10 print('My SVC predicts: ', svc.predict(X_test[0:n_predict])) print('For these', n_predict, 'labels: ', y_test[0:n_predict]) t2 = time.time() print(round(t2 - t, 5), 'Seconds to predict', n_predict, 'labels with SVC')
male_count = 0
female_count = 0

try:
    index = -1
    while True:
        index += 1
        if (male_count + female_count) % 100 == 0:
            conn.commit()
            print(f"a: {male_count + female_count}, m: {male_count}, f: {female_count}")
        c.execute("SELECT id, male, body FROM posts WHERE length(body) > 150 ORDER BY ROWID DESC LIMIT ? OFFSET ?;", (500, index * 500))
        posts = c.fetchall()
        if len(posts) == 0:
            print(f"No more posts.")
            conn.commit()
            exit()
        for post in posts:
            post_id = post[0]
            is_male = post[1]
            body = post[2]
            x = features.extract_features(body)
            c.execute("INSERT INTO examples VALUES (?, ?, ?, ?);", (post_id, len(body), is_male, json.dumps(x)))
            if is_male:
                male_count += 1
            else:
                female_count += 1
except KeyboardInterrupt:
    conn.commit()
def load_and_train_all_features(): # Read in car and non-car images non_vehicle_images = glob.glob('training_data/non-vehicles/*/*.png') vehicle_images = glob.glob('training_data/vehicles/*/*.png') train_jpeg = False #non_vehicle_images = glob.glob('training_data/non-vehicles_smallset/*/*.jpeg') #vehicle_images = glob.glob('training_data/vehicles_smallset/*/*.jpeg') #train_jpeg=True #print('USING SMALLSET JPEG IMAGES!!!! us mping to load in features') print('Trainig Data:') print('Car images:', len(vehicle_images)) print('Non-Car images:', len(non_vehicle_images)) cars = [] notcars = [] for image in vehicle_images: cars.append(image) for image in non_vehicle_images: notcars.append(image) # experemted with play with these values to see how your classifier # performs under different binning scenarios colorspace = 'YCrCb' # Can be RGB, HSV, LUV, HLS, YUV, YCrCb orient = 9 # HOG orientations pix_per_cell = 8 # HOG pixels per cell cell_per_block = 2 # HOG cells per block hog_channel = 'ALL' # Can be 0, 1, 2, or "ALL" spatial = 32 spatial_size = (spatial, spatial) # Spatial binning dimensions hist_bins = 32 # Number of histogram bins spatial_feat = True # Spatial features on or off hist_feat = True # Histogram features on or off hog_feat = True # HOG features on or off y_start_stop = [400, 656] # Min and max in y to search in slide_window() train_jpeg = False print('Generating Features for Cars') car_features = extract_features(cars, color_space=colorspace, spatial_size=(spatial, spatial), hist_bins=hist_bins, hist_range=(0, 256), orient=orient, pix_per_cell=pix_per_cell, cell_per_block=cell_per_block, hog_channel=hog_channel, spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat, train_jpeg=train_jpeg) print('Generating Features for non car images') notcar_features = extract_features(notcars, color_space=colorspace, spatial_size=(spatial, spatial), hist_bins=hist_bins, hist_range=(0, 256), orient=orient, pix_per_cell=pix_per_cell, cell_per_block=cell_per_block, hog_channel=hog_channel, spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat, train_jpeg=train_jpeg) # Create an array stack of feature vectors X = np.vstack((car_features, notcar_features)).astype(np.float64) # Fit a per-column scaler X_scaler = StandardScaler().fit(X) # Apply the scaler to X scaled_X = X_scaler.transform(X) # Define the labels vector y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features)))) # Split up data into randomized training and test sets rand_state = np.random.randint(0, 100) X_train, X_test, y_train, y_test = train_test_split( scaled_X, y, test_size=0.2, random_state=rand_state) print('Using spatial binning of:', spatial, 'and', hist_bins, 'histogram bins') print('Feature vector length:', len(X_train[0])) # Use a linear SVC svc = LinearSVC(C=1.0) #clf = CalibratedClassifierCV(svc) # performance was same on small sample set. 
clf = svc # Check the training time for the SVC t = time.time() clf.fit(X_train, y_train) t2 = time.time() print(round(t2 - t, 2), 'Seconds to train SVC...') # Check the score of the SVC print('Test Accuracy of SVC = ', round(clf.score(X_test, y_test), 4)) # Check the prediction time for a single sample t = time.time() n_predict = 10 print('My SVC predicts: ', clf.predict(X_test[0:n_predict])) print('For these', n_predict, 'labels: ', y_test[0:n_predict]) t2 = time.time() print(round(t2 - t, 5), 'Seconds to predict', n_predict, 'labels with SVC') print('saving model to classifier-svm.pkl') joblib.dump(clf, 'models/classifer-svm.pkl') joblib.dump(X_scaler, 'models/xscaler.pkl') return clf, X_scaler
def predict(window):
    x = extract_features(window)
    label = int(classifier.predict(x)[0])
    onActivityDetected(class_names[label])
    return
def play(network, args, device=None): ''' Plays out a self-play match, returning a MCTSPlayer object containing: - the final position - the n x 362 tensor of floats representing the mcts search probabilities - the n-ary tensor of floats representing the original value-net estimate where n is the number of moves in the game''' readouts = strat_args.num_readouts # defined in strategies.py # Disable resign in 5% of games if random.random() < args.resign_disable_pct: resign_threshold = -1.0 else: resign_threshold = None player = MCTSPlayer(network, device=device, resign_threshold=resign_threshold) player.initialize_game() # Must run this once at the start to expand the root node. first_node = player.root.select_leaf() features = extract_features(first_node.position, NEW_FEATURES) prob, val = network.policy_value_fn(features, device=device) first_node.incorporate_results(prob.flatten(), val.flatten(), first_node) while True: start = time.time() player.root.inject_noise() current_readouts = player.root.N # we want to do "X additional readouts", rather than "up to X readouts". while player.root.N < current_readouts + readouts: player.tree_search() if args.verbose >= 3: print(player.root.position) print(player.root.describe()) if player.should_resign(): player.set_result(-1 * player.root.position.to_play, was_resign=True) break move = player.pick_move() player.play_move(move) if player.root.is_done(): player.set_result(player.root.position.result(), was_resign=False) break if (args.verbose >= 2) or (args.verbose >= 1 and player.root.position.n % 10 == 9): print("Q: {:.5f}".format(player.root.Q)) dur = time.time() - start print("%d: %d readouts, %.3f s/100. (%.2f sec)" % (player.root.position.n, readouts, dur / readouts * 100.0, dur), flush=True) if args.verbose >= 3: print("Played >>", coords.to_gtp(coords.from_flat(player.root.fmove))) if args.verbose >= 2: utils.dbg("%s: %.3f" % (player.result_string, player.root.Q)) utils.dbg(player.root.position, player.root.position.score()) return player
@classmethod
def setUpClass(cls):
    # Set up data for the whole TestCase (setUpClass must be a classmethod,
    # otherwise unittest raises a TypeError when invoking it on the class)
    cls.train_data, cls.train_images = read_data_from_file(
        'synimg/train/data.csv', max_per_class=MAX_PER_CLASS)
    # one-hot encode, returns in column 'style_id'
    cls.label_encoder, cls.train_data = get_labels(cls.train_data, print_classes=False)
    cls.X_train = extract_features(cls.train_images)
    cls.y_train = list(cls.train_data['style_id'])
def extract(self, image_paths, layer_names, flipped=False, batch_size=64,
            should_reshape_vectors=True, verbose=2, spatial_pool=None):
    """ Extract features from the images """
    try:
        image_paths.__getattribute__('__len__')
    except AttributeError:
        raise TypeError('image_paths must be a container of paths')
    if len(self.feature_norm_method) > 1:
        raise NotImplementedError()
    if spatial_pool not in [None, 'max', 'sum']:
        raise ValueError('Unknown spatial pool: {}'.format(spatial_pool))
    if spatial_pool is not None:
        should_reshape_vectors = False
    if not isinstance(layer_names, list):
        layer_names = [layer_names]
    if len(layer_names) > 1 and not should_reshape_vectors:
        raise ValueError(
            'Cannot stack features from several layers without reshaping')

    getter = image_getter.ImageGetterFromPaths(
        image_paths, im_shape=self.img_resize_shape, rgb_batch=True)
    # feed the augmented images to the net stream and pool the features afterwards
    feature_dict = extract_features(
        flipped, self, layer_names=layer_names, image_getter=getter,
        im_shape=self.img_resize_shape, mean=None, batch_size=batch_size,
        verbose=verbose, should_reshape_vectors=should_reshape_vectors)
    features = np.hstack(feature_dict.values())
    if spatial_pool is not None and len(features.shape) != 4:
        raise ValueError(
            'Cannot do a spatial pool on features with shape: {}'.format(
                features.shape))
    if spatial_pool == 'max':
        features = np.max(features, axis=(1, 2))
    elif spatial_pool == 'sum':
        features = np.sum(features, axis=(1, 2))
    # print 'features.shape={}'.format(features.shape)
    if 'unit_norm' in self.feature_norm_method:
        if not should_reshape_vectors:
            raise ValueError(
                'Cannot do unit_norm without reshaping the vectors')
        sklearn.preprocessing.normalize(features, norm='l2', axis=1, copy=False)
    assert len(features) == len(image_paths)
    return features
def run(self, position):
    processed_position = features.extract_features(position, features=self.features)
    probabilities = self.session.run(
        self.output, feed_dict={self.x: processed_position[None, :]})[0]
    return probabilities.reshape(go.N, go.N)
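# Hypothetical follow-up (not from the source): turning the go.N x go.N grid of
# move probabilities returned by run() into a board coordinate. Only numpy is
# assumed; legality checking is left to the caller.
import numpy as np

def best_move(probabilities):
    # index of the highest-probability intersection, as (row, col)
    row, col = np.unravel_index(np.argmax(probabilities), probabilities.shape)
    return row, col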
def _make_tf_example_from_pwc(position_w_context):
    f = dual_net.get_features()
    features = features_lib.extract_features(position_w_context.position, f)
    pi = _one_hot(coords.to_flat(position_w_context.next_move))
    value = position_w_context.result
    return make_tf_example(features, pi, value)
def evaluate_model(this, samples, targets):
    predictions = this.model.predict(samples)
    for t, p in zip(targets, predictions):
        print "%s, %s" % (t[0], this.num_class_map[p])

def predict(this, sample):
    prediction = this.model.predict(sample)
    return this.num_class_map[prediction[0]]


if __name__ == '__main__':
    ## Parse command line arguments
    parser = argparse.ArgumentParser(
        description=random_forest_meta['program_description'])
    parser.add_argument('command', **arg_command)
    parser.add_argument('dataset', **arg_dataset)
    parser.add_argument('classes', **arg_classes)
    args = parser.parse_args()

    cmd = args.command[0]
    dataset = args.dataset[0]

    if cmd == "train":
        gt, features = extract_features(args.dataset[0])
        c = classifier(features, gt, random_forest_model)
    if cmd == "test":
        c = classifier(None, None, None, "models/random_forest.model")
        gt, features = extract_features(args.dataset[0])
        c.evaluate_model(features, gt)
def _make_tf_example_from_pwc(position_w_context):
    features = features_lib.extract_features(position_w_context.position)
    pi = _one_hot(coords.flatten_coords(position_w_context.next_move))
    value = position_w_context.result
    return make_tf_example(features, pi, value)
n_features = len(feature_names)
X = np.zeros((0, n_features))
aX = np.zeros((0, n_features))
y = np.zeros(0, )

for i, window_with_timestamp_and_label in slidingWindow(data, window_size, step_size):
    # omit timestamp and label from accelerometer window for feature extraction:
    window1 = window_with_timestamp_and_label[:, 1:4]
    window2 = window_with_timestamp_and_label[:, 4:7]
    # extract features over window:
    # accelerometer
    x = extract_features(window1)
    # gyroscope
    gx = extract_features(window2)
    # heart rate is a feature of its own, therefore don't extract
    # append features to array:
    aX = np.reshape(np.append(np.reshape(x, (1, -1)), np.reshape(gx, (1, -1))), (1, -1))
    bX = np.reshape(
        np.append(aX, window_with_timestamp_and_label[(i - 1) % 20, -2]), (1, -1))
    X = np.append(X, bX, axis=0)
    y = np.append(y, window_with_timestamp_and_label[10, -1])

print("Finished feature extraction over {} windows".format(len(X)))
def pipeline_v1(vehicles, non_vehicles, params=None, save=False):
    """
    This function performs feature engineering, trains a Linear SVC,
    and optionally saves the fitted model.
    """
    params = params or {
        # color space
        'cspace': 'YCrCb',  # Can be RGB, HSV, LUV, HLS, YUV, or YCrCb
        # spatial binning params
        'spatial_size': (24, 24),
        # color histogram params
        'hist_bins': 32,
        'hist_range': (0, 256),
        # HOG params
        'orient': 9,
        'pix_per_cell': 8,
        'cell_per_block': 2,
        'hog_channel': 'ALL'  # Can be 0, 1, 2, or "ALL"
    }

    t = time.time()
    vehicle_features = extract_features(vehicles, **params)
    non_vehicle_features = extract_features(non_vehicles, **params)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to extract features...')

    # Create an array stack of feature vectors
    X = np.vstack((vehicle_features, non_vehicle_features)).astype(np.float64)
    # Fit a per-column scaler
    X_scaler = StandardScaler().fit(X)
    # Apply the scaler to X
    scaled_X = X_scaler.transform(X)

    # Define the labels vector
    y = np.hstack(
        (np.ones(len(vehicle_features)), np.zeros(len(non_vehicle_features))))

    # Split up data into randomized training and test sets
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2)

    print('Using:', params['orient'], 'orientations, ', params['pix_per_cell'],
          'pixels per cell, and', params['cell_per_block'], 'cells per block')
    print('Feature vector length:', len(X_train[0]))

    # Use a linear SVC
    svc = LinearSVC()
    # Check the training time for the SVC
    t = time.time()
    svc.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')

    # Check the score of the SVC
    acc = round(svc.score(X_test, y_test), 4)
    print('Test Accuracy of SVC = ', acc)

    # Check the prediction time for a single sample
    t = time.time()
    n_predict = 10
    print('My SVC predicts: ', svc.predict(X_test[0:n_predict]))
    print('For these', n_predict, 'labels: ', y_test[0:n_predict])
    t2 = time.time()
    print(round(t2 - t, 5), 'Seconds to predict', n_predict, 'labels with SVC')

    if save:
        now = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
        pickle_fname = (
            './saved_models/{}|test_acc={}|train_samples={}|test_samples={}.p'.
            format(now, acc, len(y_train), len(y_test)))
        with open(pickle_fname, 'wb') as f:
            pickle.dump(
                {
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_test': X_test,
                    'y_test': y_test,
                    'X_scaler': X_scaler,
                    'svc': svc,
                    'params': params
                }, f, pickle.HIGHEST_PROTOCOL)
        print('Saved model and params to {}'.format(pickle_fname))
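# Hypothetical usage sketch (not from the source): reloading a model pickled by
# pipeline_v1(save=True). The dictionary keys ('svc', 'X_scaler', 'params')
# mirror what the function writes; the pickle file name is whatever pipeline_v1
# printed when it saved.
import pickle

def load_pipeline_v1_model(pickle_fname):
    with open(pickle_fname, 'rb') as f:
        saved = pickle.load(f)
    return saved['svc'], saved['X_scaler'], saved['params']

def score_windows(window_images, pickle_fname):
    # re-extract features with the exact params used for training,
    # scale them with the fitted scaler, then predict car / non-car
    svc, X_scaler, params = load_pipeline_v1_model(pickle_fname)
    feats = extract_features(window_images, **params)
    return svc.predict(X_scaler.transform(feats))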
def _extract_features(positions):
    return features.extract_features(self.hparams.board_size, positions)
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have an up-to-date version of sklearn; v0.12 has the
       predict_proba method; http://scikit-learn.org/0.11/install.html'''

    # Must be an array-type object. Strings must be converted to
    # integer values, otherwise the fit method raises ValueError.
    y = []
    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    if do_cross_validation == 1:
        print 'starting 10-fold cross validation'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y, k=10)
        skf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            mnbayes.fit(X_train, y_train)
            #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])

            y_test = vectorize_actual(y_test)  # vectorize y_test
            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Fitting"
    mnbayes.fit(fea, y)
    #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])

    print "Making predictions"
    global probs
    probs = mnbayes.predict_proba(test_fea)

    #if is_full_train_set == 0:
    #    print("Calculating priors and updating posteriors")
    #    new_priors = cu.get_priors(full_train_file)
    #    old_priors = cu.get_priors(train_file)
    #    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
n_samples = 1000
time_elapsed_seconds = (data[n_samples, 0] - data[0, 0]) / 1000
sampling_rate = n_samples / time_elapsed_seconds

# TODO: list the class labels that you collected data for in the order of
# label_index (defined in collect-labelled-data.py)
class_names = ["sitting", "walking"]  # ...

print("Extracting features and labels for window size {} and step size {}...".format(window_size, step_size))
sys.stdout.flush()

X = []
Y = []
for i, window_with_timestamp_and_label in slidingWindow(data, window_size, step_size):
    window = window_with_timestamp_and_label[:, 1:-1]
    feature_names, x = extract_features(window)
    X.append(x)
    Y.append(window_with_timestamp_and_label[10, -1])

X = np.asarray(X)
Y = np.asarray(Y)
n_features = len(feature_names)

print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(Y)))
print("\n")
sys.stdout.flush()

# %%---------------------------------------------------------------------------
#
#                       Train & Evaluate Classifier
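# Hypothetical sketch (not from the source) of the "Train & Evaluate Classifier"
# step announced by the section header above: a decision tree evaluated with
# k-fold cross-validation on the X / Y arrays built earlier. The fold count and
# tree depth are illustrative choices, not values from the original project.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def train_and_evaluate(X, Y, n_folds=10):
    clf = DecisionTreeClassifier(max_depth=5)
    scores = cross_val_score(clf, X, Y, cv=n_folds)
    print("Average accuracy over {} folds: {:.3f}".format(n_folds, scores.mean()))
    clf.fit(X, Y)  # refit on all data before use
    return clf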
def train(text, category):
    for f in extract_features(text):
        increase_count(f, category)
    increase_total_count(category)
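# Hypothetical counterpart (not from the source): a naive-Bayes style scorer for
# the counts accumulated by train(). get_count(), get_total_count(), and
# all_categories() are assumed accessors mirroring the increment helpers above;
# add-one smoothing and log-space scoring are illustrative choices.
import math

def classify(text, vocabulary_size=1000):
    feats = list(extract_features(text))
    categories = list(all_categories())
    grand_total = sum(get_total_count(c) for c in categories) or 1
    best_category, best_score = None, float('-inf')
    for category in categories:
        total = get_total_count(category)
        # log prior + smoothed log likelihood of each feature
        score = math.log((total + 1.0) / (grand_total + len(categories)))
        for f in feats:
            score += math.log((get_count(f, category) + 1.0) /
                              (total + vocabulary_size))
        if score > best_score:
            best_category, best_score = category, score
    return best_category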
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False,
                        n_iter=5, shuffle=False, verbose=1, eta0=1.0,
                        n_jobs=-1, seed=0, class_weight="auto",
                        warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])

    # Must be an array-type object. Strings must be converted to
    # integer values, otherwise the fit method raises ValueError.
    global y
    y = []
    print "Collecting statuses"
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    percep.fit(fea, y)
    '''Make sure you have an up-to-date version of sklearn; v0.12 has the
       predict_proba method; http://scikit-learn.org/0.11/install.html'''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    #probs = percep.predict_proba(test_fea)  # only available for binary classification
    probs = percep.predict(test_fea)
    # shape of probs is [n_samples]; convert probs to shape [n_samples, n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    #if is_full_train_set == 0:
    #    print("Calculating priors and updating posteriors")
    #    new_priors = cu.get_priors(full_train_file)
    #    old_priors = cu.get_priors(train_file)
    #    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)