def train(modelnames=[], features=[], limit=0, stemmer_type=None, predict=False, standardized=False, plot=False): """ ---------------------------------------- train - cross validate - predict - plot ---------------------------------------- """ X, y, z = get_features(limit=limit, features=features, stemmer_type=stemmer_type, db_name="yelp_train", standardized=False) del z #! not used when training for name in modelnames: model = filter(lambda x: x['name'] == name, model_config)[0] #! --------------------- module_ = __import__(model['module'], fromlist=model['from']) class_ = getattr(module_, model['name']) clf = class_(**model['kwargs']) model_name = str(clf.__class__).split(".")[-1].split("'")[0] print clf cross_validate(X,y,clf,folds=5,model_name=model_name,plot=plot) if model['feature_imp']: print 'Feature Importances =======', list(clf.feature_importances_) gc.collect() if predict: print '====== predicting ......' #! grab the complete test set for prediction Xtest, ytest, ztest = get_features(limit=0, features=features, stemmer_type=stemmer_type, db_name="yelp_test", standardized=False) predict_and_save(X, y, ztest, Xtest, clf, features) print '====== predicting done ......'
def main():
    """Build the training feature matrix, fit a random forest and save it.

    Papers listed under "DeletedPaperIds" are labelled 1 and papers under
    "ConfirmedPaperIds" are labelled 0. Pairs for which ``f.get_features``
    returns None are reported and skipped.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    # (column, label) pairs, in the order they are processed for each author.
    labelled_columns = (("DeletedPaperIds", 1), ("ConfirmedPaperIds", 0))
    for author_id, row in train.iterrows():
        for column, label in labelled_columns:
            for paper_id in row[column]:
                s = f.get_features(paper_id, author_id, paper_author_indexed,
                                   computed_features)
                if s is None:
                    print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
                else:
                    target.append(label)
                    features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))
    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1,
                                        min_samples_split=10, random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except Exception:
        # Debugging hook: open pdb on a fitting error. Catching Exception
        # (not a bare except) lets Ctrl-C and SystemExit propagate.
        # Execution continues to the save step after the debugger returns.
        import pdb
        pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
def test(dataset_size, model_type): """ opens fit dataset and trains SVM/LogReg/Forest model with it, then tests it""" print "MODEL TEST", dataset_size, model_type dset = dataset.read('contradictions', dataset_size) data, targets = [], [] for case in dset['content']: data.append(case) targets.append(case['contradiction']) fit_data, test_data = [], [] fit_cases, test_cases, fit_target, test_target = train_test_split( data, targets, test_size=0.25, shuffle=True, random_state=0) for fit_case in fit_cases: fit_data.append( get_features(fit_case['sentence'], fit_case['hypothesis'])) for test_case in test_cases: test_data.append( get_features(test_case['sentence'], test_case['hypothesis'])) model = ClassificationModel(model_type) start_time = time.time() model.train(fit_data, fit_target, dataset_size) elapsed_time = time.time() - start_time test_results = model.test(test_data) with open( config.CONTRADICTIONS_RESULTS_PATH.format(dataset_size, model_type), 'wb') as csvfile: csv_writer = csv.writer(csvfile, delimiter=',') csv_writer.writerow([ 'hypothesis', 'sentence', 'type', 'contradiction', 'prediction', 'features' ]) for (test_case, result, features) in zip(test_cases, test_results, test_data): csv_writer.writerow([ test_case['hypothesis'], test_case['sentence'], test_case['type'], test_case['contradiction'], result, features ]) precision = metrics.precision_score(test_target, test_results) recall = metrics.recall_score(test_target, test_results) f1_score = metrics.f1_score(test_target, test_results) print "FIT TIME", elapsed_time print "PRECISION", precision print "RECALL", recall print "F1 SCORE", f1_score model.save(dataset_size)
def main():
    """Run 5-fold validation of a 3-nearest-neighbours classifier.

    Prints the feature columns, each fold's training rows, confusion matrix
    and classification report, and finally the mean accuracy over the folds.
    """
    input_data, labels = input_parser.parse_input()
    X, Y = ft.get_features(input_data, labels)
    print("X.columns:")
    print(X.columns)

    folds = 5
    print("Selecting rows for " + str(folds) + "-fold validation")
    kf = KFold(n_splits=folds)
    kf.get_n_splits(X)

    summed_accuracy = 0
    for fold_cnt, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print('Fold: ' + str(fold_cnt))
        X_train = X.loc[train_index, ]
        X_test = X.loc[test_index, ]
        Y_train = Y[train_index]
        Y_test = Y[test_index]
        print(X_train)

        model = KNeighborsClassifier(n_neighbors=3)
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)

        summed_accuracy += accuracy_score(Y_test, predictions)
        print(confusion_matrix(Y_test, predictions))
        print(classification_report(Y_test, predictions))

    print("Total accuracy: " + str(summed_accuracy / folds))
def _original_o(self):
    """Extract minutiae feature vectors from ``self.image``.

    Pipeline: preprocess, binarise at threshold 50, crop, thin with
    ZhangSuen, extract and filter minutiae, count ridges and build the
    feature vectors. Intermediate images are written under ``dir``.
    """
    working = self.image.copy().copy()
    cv2.imwrite(dir + "input.jpg", working)

    image, m, orientations = preprocess(working)

    # Binarise in place: 1 where the pixel is above 50, otherwise 0.
    for r in range(image.shape[0]):
        for c in range(image.shape[1]):
            image[r][c] = 1 if image[r][c] > 50 else 0

    image, xmax, xmin, ymax, ymin = cropfingerprint(image)
    orientations = orientations[xmin:xmax + 1, ymin:ymax + 1]
    cv2.imwrite(dir + "imagen_mejorada.jpg", image * 255)

    z = ZhangSuen(image)
    skeleton = z.performThinning()
    thinned = skeleton.copy()
    cv2.imwrite(dir + "salida_adelgazado.jpg", (1 - skeleton) * 255)

    coords, mask = z.extractminutiae(thinned)
    cv2.imwrite(dir + "salida_minucias.jpg", mask * 255)

    # Re-read the saved input as grayscale and crop it to the same window.
    cropped_input = cv2.imread(dir + "input.jpg", 0)[xmin:xmax + 1, ymin:ymax + 1]
    fincoords = z.remove_minutiae(coords, cropped_input)

    vector = z.get_ridge_count(fincoords, image)
    return features.get_features(fincoords, vector, orientations)
def extract_by_mask(maskshp, raster, out, nodata=0):
    """Same as the 'extractByMask_ds' function, but takes a raster file path.

    The raster is opened with rasterio and handed, together with the
    features read from ``maskshp``, to ``extract_by_mask_rio_ds``.
    """
    with rasterio.open(raster) as dataset:
        mask_features = get_features(maskshp)
        extract_by_mask_rio_ds(mask_features, dataset, out, nodata=nodata)
def main(corpus_file_name, annotations_file_name):
    """Train a relation model from an annotated corpus and write it to disk.

    For every ordered pair of distinct entities in a sentence that has an
    annotation, the pair's features and relation label are collected. The
    resulting model and feature map are written to fixed file paths.
    """
    vectors_features_list = []
    labels = []
    annotations = parse_annotation(annotations_file_name)

    for sent_id, sent_str in read_lines(corpus_file_name):
        sent = nlp(sent_str)
        print("#id:", sent_id)
        print("#text:", sent.text)
        print()

        entities = sent.ents
        for i, first_ent in enumerate(entities):
            others = entities[:i] + entities[i + 1:]
            for second_ent in others:
                pair_ent = (str(first_ent), str(second_ent))
                if pair_ent not in annotations[sent_id].keys():
                    continue
                rel = annotations[sent_id][pair_ent]
                vectors_features_list.append(get_features(first_ent, second_ent))
                labels.append(rel)

    transform_of_features, features_map, model = create_model(
        vectors_features_list, labels)
    write_feature_map(
        'C:/Users/DELL/PycharmProjects/NLP/Assignment_4/feature_map.txt',
        features_map)
    write_logistic_regression_model(
        'C:/Users/DELL/PycharmProjects/NLP/Assignment_4/model_file', model)
def load_file(self, path, verbs):
    """Open an RNC XML file and collect the verb tokens listed in ``verbs``.

    For each ``<w>`` element only its first ``<ana>`` analysis is used
    (todo: deal with homonymy?). When the lemma is in ``verbs`` and the POS
    tag is 'V', a ``Verb`` is added to ``self.verbs`` and, depending on its
    form, to ``self.partcp`` or ``self.gerund``.

    :param path: path of the XML file to parse
    :param verbs: container of lemmas to keep
    """
    print(path)
    tree = ET.parse(path)
    for elem in tree.iter('w'):
        word = ''.join(elem.itertext()).lower().replace('`', '')  # remove stress

        # Take the first analysis only. A word without any <ana> child is
        # skipped, so the previous word's analysis is never reused.
        info = next(iter(elem.iter('ana')), None)
        if info is None:
            continue
        print(info)

        # Analysis of the preceding word, if there is one. getprevious()
        # returns None for the first sibling (iterating it raises TypeError)
        # and the list is empty when that sibling has no <ana> child.
        try:
            info_prev = [t for t in info.getparent().getprevious() if t.tag == 'ana'][0]
        except (TypeError, IndexError):
            info_prev = None

        lemma = info.get('lex')
        # get POS tag
        tag = info.get("gr").split('=')[0].split(',')[0]
        if lemma in verbs and tag == 'V':
            features = get_features(info, info_prev)
            verb = Verb(lemma, word, *features)
            if verb.form == 'partcp':
                self.partcp.add(verb)
            elif verb.form == 'ger':
                self.gerund.add(verb)
            self.verbs.add(verb)
def log_feature_importances(model, importance_plot_file):
    """Plot the booster's gain importances per feature and log the figure.

    Feature names come from the first pipeline step and are matched to the
    booster's ``f<index>`` codes; features without a score get 0.0. The bar
    chart is saved to ``importance_plot_file`` and logged as an mlflow
    artifact.
    """
    coded_names = {
        f"f{idx}": name
        for idx, name in enumerate(get_features(model.steps[0][1]))
    }
    booster = model.steps[-1][1].get_booster()
    gains = booster.get_score(importance_type="gain")

    records = []
    for code, name in coded_names.items():
        records.append({"feature": name, "importance": get(code, gains, 0.0)})

    table = pd.DataFrame(records)
    table = table.sort_values("importance", ascending=True)
    table = table.reset_index(drop=True)

    axes = table.plot(y="importance", x="feature", kind="barh")
    axes.get_figure().subplots_adjust(left=0.25)
    axes.get_figure().savefig(importance_plot_file)
    mlflow.log_artifact(importance_plot_file)
def classify_base(data, y, tests, debug):
    """Evaluate the classifiers on every feature set and write a CSV summary.

    One row per (feature set, classifier) with accuracy, precision, recall
    and F1 is written to "classifier_base.csv" in ``OUTPUTS_DIR``.
    """
    results = []
    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        X = get_features(data, test)
        test_results = TestResults(test_name, len(X.columns), [])
        records = get_classifiers_results_records(X, y, test_name, debug)
        test_results.results_list.extend(records)
        results.append(test_results)

    headers = ['Features (#features)', 'Acc.', 'Prec.', 'Recall', 'F']
    frames = []
    for tested in results:
        rows = [
            ('{} ({}) {}'.format(tested.tested_features, tested.num_features,
                                 item.classifier_name),
             item.accuracy, item.precision, item.recall, item.f1)
            for item in tested.results_list
        ]
        frames.append(pd.DataFrame(rows, columns=headers))

    combined = pd.concat(frames, axis=0)
    combined.to_csv(os.path.join(OUTPUTS_DIR, "classifier_base.csv"), index=False)
def classify_base_best_classifier(data, y, tests):
    """Run a t-test between the two result groups of the best classifier.

    For each feature set the two records returned by
    ``get_best_classifier_results_records`` are collected; their
    precision/recall/F1 values are compared with ``stats.ttest_ind`` and the
    statistics are written to "t-test_best_classifier.csv".
    """
    cont_results = []
    pat_results = []
    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        X = get_features(data, test)
        cont_test_results = TestResults(test_name, len(X.columns), [])
        pat_test_results = TestResults(test_name, len(X.columns), [])

        cont_record, pat_record = get_best_classifier_results_records(X, y)
        cont_test_results.results_list.append(cont_record)
        pat_test_results.results_list.append(pat_record)

        cont_results.append(cont_test_results)
        pat_results.append(pat_test_results)

    def _scores(group):
        # Each results_list holds exactly one record.
        rows = []
        for entry in group:
            record = entry.results_list[0]
            rows.append([record.precision, record.recall, record.f1])
        return rows

    cont_cls_results = _scores(cont_results)
    pat_cls_results = _scores(pat_results)

    tstatistic, pvalue = stats.ttest_ind(cont_cls_results, pat_cls_results)

    headers = ['t-statistic', 'p-value']
    df = pd.DataFrame(list(zip(tstatistic, pvalue)), columns=headers)
    df.insert(0, 'scorer', ['precision', 'recall', 'f1-score'])
    df.to_csv(os.path.join(OUTPUTS_DIR, "t-test_best_classifier.csv"), index=False)
def rank(self): tokenizer = RegexpTokenizer(r'\w+') ps = nltk.stem.PorterStemmer() weights = get_features(self.verbose) ranks = PriorityQueue() for filename in os.listdir("./dump-texts"): file = os.path.join("dump-texts",filename) url = filename[10:-4].replace('--','/') if self.verbose: print "\nSCORING " + url with open(file, 'r') as f: text = f.read().lower().decode('utf-8') text = re.sub(r'\d+', '', text) filtered_text = [w for w in tokenizer.tokenize(text) if w not in stopwords.words('english')] cur_tokens = [ps.stem(x) for x in filtered_text] score = 0 text_bonus = 0 for t in cur_tokens: if 'buy' in t: text_bonus += 2 elif 'price' in t: text_bonus += 2 if t in weights.keys(): score += weights[t] score += text_bonus score = score/len(cur_tokens) score += self._calculate_url_bonus(url) if self.verbose: print "SCORE = " + str(score) + "\n" ranks.put((-1*score, url)) self._dump_ranks(ranks) print "************* Ranks computed!"
def main():
    """Rank each author's papers with the saved classifier and write them out.

    For every author the papers are sorted by the classifier's probability
    for class index 1, highest first. Papers for which ``f.get_features``
    returns None are reported and left out of that author's ranking.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Pair each score with the paper it was computed for. paper_ids holds
        # only the papers that produced features, so the two stay aligned
        # even when some papers were skipped.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def create_training_dataset(image_list, label_list):
    """Build the stacked feature matrix and label vector for training.

    ``image_list`` and ``label_list`` hold (file, data) pairs. Images whose
    numeric file-name stem is below 22 use region parameter 800, the others
    100. Returns ``(X, y)``.
    """
    print('[INFO] Creating training dataset on %d image(s).' % len(image_list))
    feature_blocks = []
    label_blocks = []

    for image_entry, label_entry in zip(image_list, label_list):
        image_file, image = image_entry
        _label_file, label = label_entry

        image_name = os.path.basename(image_file)
        print(f'Now on {image_name}')

        p = 800 if int(image_name.split('.')[0]) < 22 else 100
        regions = ft.get_regions(image, p)
        features = ft.get_features(image, regions)
        labels = ft.get_labels(label, regions)
        feature_blocks.append(features)
        label_blocks.append(labels)

    X = np.vstack(feature_blocks)
    y = np.concatenate(label_blocks)

    print('[INFO] Feature vector size:', X.shape)
    return X, y
def main():
    """Rank each author's papers with the saved classifier and write them out.

    For every author the papers are sorted by the classifier's probability
    for class index 1, highest first. Papers for which ``f.get_features``
    returns None are reported and left out of that author's ranking.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Zip with paper_ids (the papers that actually produced features),
        # not row["PaperIds"], so each score stays with its own paper when
        # some papers were skipped.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def all_sat(constraint):
    """Return the feature keys whose value satisfies ``constraint``."""
    available = features.get_features()
    return [key for key in available if constraint.constrain(available[key])]
def prob_classify(self, source):
    """Wrapper for `prob_classify` of the nltk classifier.

    The source is stripped with ``dumb_strip`` and converted to a feature
    set before being classified.
    """
    stripped = dumb_strip(source)
    featureset = get_features(stripped)
    return self.classifier.prob_classify(featureset)
def analyse_file(input_file, save_counts=False):
    """
    Calculates readability formulae for a single file

    :param input_file: input file
    :param save_counts: saves lists of what was counted if set to true
    :return: dictionary containing formulae, features and counts for the document
    """
    # get file content; the with-block closes the file, no explicit close()
    with open(input_file, 'r') as content_file:
        text = content_file.read()

    # get counts
    tokenized_sentences = get_tokenized_sentences(text)
    if save_counts:
        rval = cnt.get_and_save_counts(tokenized_sentences, input_file)
    else:
        rval = cnt.get_counts(tokenized_sentences)

    # get features (computed from the counts gathered so far)
    rval.update(feat.get_features(rval))
    # get formulae (computed from counts and features)
    rval.update(rf.get_formulae(rval))
    return rval
def main():
    """Build the training feature matrix, fit a random forest and save it.

    Papers listed under "DeletedPaperIds" are labelled 1 and papers under
    "ConfirmedPaperIds" are labelled 0. Pairs for which ``f.get_features``
    returns None are reported and skipped.
    """
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    # Deleted papers are processed before confirmed ones for each author.
    column_labels = (("DeletedPaperIds", 1), ("ConfirmedPaperIds", 0))
    for author_id, row in train.iterrows():
        for column, label in column_labels:
            for paper_id in row[column]:
                s = f.get_features(paper_id, author_id, paper_author_indexed,
                                   computed_features)
                if s is None:
                    print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
                else:
                    target.append(label)
                    features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))
    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1,
                                        min_samples_split=10, random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except Exception:
        # Debugging hook: drop into pdb on a fitting error. Exception rather
        # than a bare except, so Ctrl-C and SystemExit are not trapped.
        # Execution continues to the save step after the debugger returns.
        import pdb
        pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
def add_user():
    """Capture three hands, plot the first and return their feature vectors."""
    hands = [get_hand() for _ in range(3)]

    feature_vectors = []
    for hand in hands:
        feature_vectors.append(get_features(hand))

    hands[0].plot()
    return feature_vectors
def load(self):
    """Load the features of ``<path>/1.wav`` .. ``<path>/<repeat>.wav``.

    Also stores the mean and standard deviation of the feature lengths.
    """
    self.features = []
    for number in xrange(1, self.repeat + 1):
        wav_path = self.path + "/" + str(number) + ".wav"
        self.features.append(get_features(wav_path))

    lengths = [len(item) for item in self.features]
    self.mean = np.mean(lengths)
    self.std = np.std(lengths)
def information(filepath):
    '''
    For one inkml file: read the information, preprocess the traces and
    compute the features.

    get_information returns either (uid, traces, traces_map) or
    (uid, traces, traces_map, traceGroup).

    Returns the feature rows, or None when the file has no traces, yields no
    rows, or an error occurs (the error is printed).
    '''
    try:
        # read the information from the file
        result = get_information(filepath)
        if not result:
            print(f'No traces found for: {filepath}')
            return

        field_count = len(result)
        if field_count == 3:
            uid, traces, traces_map = result
        if field_count == 4:
            uid, traces, traces_map, traceGroup = result

        # preprocess & get features
        traces = preprocess(traces, filepath)
        if field_count == 3:
            rows = get_features(filepath, uid, traces, traces_map)
        else:
            rows = get_features(filepath, uid, traces, traces_map, traceGroup)

        # some files have only one trace id, hence no rows
        if rows:
            return rows
        return
    except Exception as e:
        print('Error for %s: %s' % (filepath, str(e)))
def classify_question_types(data, y, tests, debug):
    """Evaluate the classifiers per answer range and write one CSV per range.

    Every feature set is evaluated separately for answers 1-14 and 15-18
    (selected by regex). For each range a table with one row per
    (feature set, classifier) is written to
    "classifier_answers_<range>.csv" in ``OUTPUTS_DIR``.
    """
    answer_ranges = [('([1-9]|1[0-4])', '1-14'), ('(1[5-8])', '15-18')]

    results = []
    for test, test_name in tqdm(tests, file=sys.stdout, total=len(tests)):
        results_types = {
            '1-14': TestResults(test_name, len(test), []),
            '15-18': TestResults(test_name, len(test), []),
        }
        for regex, ans_range in answer_ranges:
            X = get_features(data, test, regex)
            results_types[ans_range].num_features = len(X.columns)

            answer_results = AnswersResults(ans_range, [])
            records = get_classifiers_results_records(
                X, y, test_name + ' q{}'.format(ans_range), debug)
            answer_results.results_list.extend(records)
            results_types[ans_range].results_list.append(answer_results)
        results.append(results_types)

    headers = ['Acc. q{}', 'Prec. q{}', 'Recall q{}', 'F q{}']
    for _, ans_range in answer_ranges:
        frames = []
        features_list = []
        for result in results:
            range_results = result[ans_range]

            # Row labels: all answer results use the same classifiers, so the
            # classifier names are taken from the first one.
            for cls in range_results.results_list[0].results_list:
                features_list.append('{} ({}) {}'.format(
                    range_results.tested_features,
                    range_results.num_features,
                    cls.classifier_name))

            per_answer = []
            for item in range_results.results_list:
                ans_headers = [head.format(item.answer_number) for head in headers]
                scores = [(rec.accuracy, rec.precision, rec.recall, rec.f1)
                          for rec in item.results_list]
                per_answer.append(pd.DataFrame(scores, columns=ans_headers))
            frames.append(pd.concat(per_answer, axis=1))

        table = pd.concat(frames, axis=0)
        table.insert(0, 'Features (#features)', features_list)
        table.to_csv(os.path.join(
            OUTPUTS_DIR, "classifier_answers_{}.csv".format(ans_range)),
            index=False)
def test(self, remain_time_budget=None):
    """Produce test-set predictions for the current loop of this stage.

    Prepares the test features when needed, runs warm-up training loops,
    updates the better-score counter, chooses the predictions to return and
    triggers a stage transition when the end conditions are met.

    :param remain_time_budget: forwarded to the ``train`` calls
    :return: the predictions selected for this loop
    """
    # NOTE(review): this calls the parent's train(), not test() -- confirm
    # that this is intentional.
    super(EnhancementStage, self).train(remain_time_budget)
    # (Re)build the test features on the first test loop, or when
    # _spec_len_status is 2.
    if self._stage_test_loop_num == 0 or self._spec_len_status == 2:
        self._spec_len_status = 0
        self._feature_params['mode'] = 'test'
        x = get_features(self.ctx.raw_test_data, self._feature_params)
        x = np.array(x)
        # Append a trailing axis of size 1 to the feature array.
        self._pre_test_x = x[:, :, :, np.newaxis]
        log(f"stage_loop_num={self._stage_loop_num}, preprocess {len(self._pre_test_x)} test data, shape {self._pre_test_x.shape}" )
    # Keep training until the warm-up loop count has been exceeded.
    while self._stage_loop_num <= self._decide_warmup_loops():
        self.train(remain_time_budget=remain_time_budget)
    score = 0
    # A validation score is computed only when not all data is used.
    if self._decide_use_all_data() is False:
        score = balanced_acc_metric(self._pre_val_y, self._model.predict(self._pre_val_x))
    # Count an improvement: below a max score of 0.90 the score must beat it
    # by more than 0.01; at 0.90 or above it only has to match it.
    if (score - 0.01 > self.ctx.max_score < 0.90) or (score >= self.ctx.max_score >= 0.90):
        self._better_score_cnt += 1
    log("resnet score {} max_score {}".format(score, self.ctx.max_score))
    if self._is_predict() and self._decide_use_all_data():
        if self._is_ensenmble():
            if self._is_good_train():
                preds = self._model.predict(self._pre_test_x, batch_size=8)
                # normalize logits
                # preds = (preds - np.min(preds)) / (np.max(preds) - np.min(preds))
                self._all_preds[self._stage_loop_num] = preds
                log("this round is good train")
            else:
                log("this round is bad train")
            # The ensemble result replaces this round's own prediction.
            preds = self._ensemble_preds()
        else:
            preds = self._model.predict(self._pre_test_x, batch_size=8)
        # Remember the latest prediction and the loop that produced it.
        self._last_pred = preds
        self._last_pred_loop = self._stage_loop_num
    elif self._decide_use_all_data() is False:
        # Fall back to predictions held by the shared context.
        if len(self.ctx.lr_last_preds) > 0:
            preds = self.ctx.lr_last_preds
        else:
            preds = self.ctx.ensemble_predicts()
    else:
        preds = self._last_pred
    self._stage_test_loop_num += 1
    # Leave the stage at its last loop, or after 8 test loops when not all
    # data is used and the max score is above 0.4.
    if self._stage_loop_num >= self._stage_end_loop_num \
            or (
            self._stage_test_loop_num >= 8 and self._decide_use_all_data() is False and self.ctx.max_score > 0.4):
        self._need_transition = True
    if self._need_transition:
        self._transition()
    return preds
def load(self):
    """Read the numbered .wav files under ``self.path`` into ``self.features``.

    Files ``1.wav`` to ``<repeat>.wav`` are loaded in order; the mean and
    standard deviation of the feature lengths are stored as well.
    """
    self.features = []
    for index in xrange(self.repeat):
        file_name = str(index + 1) + ".wav"
        self.features.append(get_features(self.path + "/" + file_name))

    feature_lengths = [len(entry) for entry in self.features]
    self.mean = np.mean(feature_lengths)
    self.std = np.std(feature_lengths)
def predict(self, text: str) -> int:
    """
    Predict review score based on review title.

    :param text: Review title
    :return: Predicted score, in [0, 100] range
    """
    parsed = self.nlp(text)
    model_input = get_features([parsed], max_length)
    output = self.model.predict(model_input)
    # The first output value is scaled to a percentage and rounded.
    scaled = output[0][0] * 100
    return int(round(scaled))
def get_trade():
    """
    Get the predicted trade for today and the relative capital to trade.

    Reads "Data/database.csv" indexed by 'Date', computes the labels and
    fits the model on the current features.

    Returns the prediction (what to go long/short in for today) and how much
    capital to spend.
    """
    database = pd.read_csv("Data/database.csv", index_col='Date')
    labelled = train_model.compute_label(df=database)
    todays_features = get_features()
    prediction, weight = train_model.fit_model(df=labelled, data=todays_features)
    return prediction, weight
def process(img):
    """Crop and classify an image, then extract its mask, box and features.

    Returns ``(type, features, image, masked_image, (x, y, w, h))``. For
    ``Type.BAND`` no features or box are computed: the cropped image is
    returned in both image slots with None placeholders.
    """
    img = crop.crop(img)
    kind = classify.classify(img)
    if kind == Type.BAND:
        return kind, None, img, img, (None, None, None, None)

    mask, _, _ = extract.extract(img)
    masked = cv2.bitwise_and(img, img, mask=mask)
    left, top, width, height = watch_features.bounding_box(mask)
    # Features are computed on the whole cropped image, not the bounding box.
    feats = features.get_features(img)
    return kind, feats, img, masked, (left, top, width, height)
def get_features(record):
    """Return the record's identifying fields plus its segment features.

    Segment features are added only when the segment is non-empty and
    contains neither 'X' nor 'U'.
    """
    d = {
        'OGid': record.OGid,
        'start': record.start,
        'stop': record.stop,
        'ppid': record.ppid,
    }
    usable = (len(record.segment) != 0
              and 'X' not in record.segment
              and 'U' not in record.segment)
    if usable:
        d.update(features.get_features(record.segment))
    return d
def predict(self, token, tokens, weights=None):
    """Greedy head prediction used for training.

    Every entry of ``tokens`` is scored as a candidate head of ``token``.
    Returns the index of the best score and the per-candidate features.
    """
    features = []
    scores = []
    for candidate in tokens:
        candidate_feats = get_features(candidate, token, tokens, **self.feature_opts)
        scores.append(self.score(candidate_feats))
        features.append(candidate_feats)
    return np.argmax(scores), features
def original_stuff(self):
    """Extract minutiae feature vectors from ``self.image``.

    Pipeline: preprocess, binarise at threshold 50, crop, thin with
    ZhangSuen, extract and filter minutiae, count ridges and build the
    feature vectors. Intermediate images are written under ``dir``.
    """
    source = self.image.copy()
    imgd = source.copy()
    cv2.imwrite(dir + "input.jpg", imgd)

    image, m, orientations = preprocess(imgd)

    # In-place binarisation: pixels above 50 become 1, the rest 0.
    height, width = image.shape[0], image.shape[1]
    for row in range(height):
        for col in range(width):
            if image[row][col] > 50:
                image[row][col] = 1
            else:
                image[row][col] = 0

    image, xmax, xmin, ymax, ymin = cropfingerprint(image)
    orientations = orientations[xmin:xmax + 1, ymin:ymax + 1]
    cv2.imwrite(dir + "intermediate-input.jpg", image * 255)

    z = ZhangSuen(image)
    img = z.performThinning()
    thinned = img.copy()
    cv2.imwrite(dir + "thinnedimage-input.jpg", (1 - img) * 255)

    coords, mask = z.extractminutiae(thinned)
    cv2.imwrite(dir + "minu-input.jpg", mask * 255)

    # Filter the minutiae against the saved grayscale input, cropped to the
    # same window as the fingerprint.
    reference = cv2.imread(dir + "input.jpg", 0)[xmin:xmax + 1, ymin:ymax + 1]
    fincoords = z.remove_minutiae(coords, reference)

    vector = z.get_ridge_count(fincoords, image)
    feature_vectors = features.get_features(fincoords, vector, orientations)
    return feature_vectors
def _print_forecast_scores(title, cv_score, test_score, fcast_label, fcast_score):
    """Print the cross-validation, test and forecast MAPE of one model."""
    print(f"""
    {title} scores
    --------------
    Cross-validation MAPE: {round(cv_score, 2)}%
    Test MAPE: {round(test_score, 2)}%
    {fcast_label} Forecast MAPE: {round(fcast_score, 2)}%
    """)


def ts_forecasting():
    """Forecast energy consumption with linear regression and XGBoost.

    Reads the load, the number of forecast steps and the forecasting mode
    ("direct" or "recursive") from the command line, trains both models on
    lag features and prints their cross-validation, test and forecast MAPE.

    :raises ValueError: if the forecasting mode is neither "direct" nor
        "recursive"
    """
    args = input_cmd()
    if args.fcast not in ("direct", "recursive"):
        # Fail early with a clear message instead of an unbound local later.
        raise ValueError("Unknown forecast mode: %r" % (args.fcast,))

    # get energy consumption data
    load = args.load
    f_steps = args.steps
    data = get_dataset(load_to_predict=load)
    c_target = data["energy"]
    t_target, f_target, fcast_range = forecast_split(c_target, n_steps=f_steps)

    # ML methods
    features, target = get_features(t_target)
    # Lag sizes are parsed from feature names containing "lag"
    # (the number after the first underscore).
    lags = [int(f.split("_")[1]) for f in features if "lag" in f]
    forecaster = Forecaster(f_steps, lags=lags)
    # The printed label reflects the mode that was actually used.
    fcast_label = args.fcast.capitalize()

    print("Forecast with Linear Regression model")
    model, cv_score, test_score = linear_model(features, target)
    if args.fcast == "direct":
        # The direct strategy receives the model-building function.
        fcast_linear = forecaster.direct(t_target, linear_model)
    else:
        # The recursive strategy receives the fitted model.
        fcast_linear = forecaster.recursive(t_target, model)
    fcast_score = mape(f_target, fcast_linear)
    _print_forecast_scores("Linear Regression", cv_score, test_score,
                           fcast_label, fcast_score)

    print("Forecast with XGBoost model")
    model, cv_score, test_score = xgboost_model(features, target, max_evals=25)
    if args.fcast == "direct":
        fcast_xgb = forecaster.direct(t_target, xgboost_model)
    else:
        fcast_xgb = forecaster.recursive(t_target, model)
    fcast_score = mape(f_target, fcast_xgb)
    _print_forecast_scores("XGBoost", cv_score, test_score,
                           fcast_label, fcast_score)
def predict():
    """Predict sleep stages for the test set with saved model number 5.

    Loads the scaler and model from "models/", stores the scaled test
    features, and writes the predictions to "output5.csv".
    """
    i = 5
    tag = str(i)

    X_test = feat.get_features(f_test)
    scaler = joblib.load("models/scaler" + tag + ".save")
    model = joblib.load("models/model" + tag + ".save")

    x_test_scaled = scaler.transform(X_test)
    joblib.dump(x_test_scaled, "models/x_to_pred_scaled" + tag + ".save")

    y_predicted = model.predict(x_test_scaled)
    columns = {
        "index": f_test["index_absolute"][:],
        "sleep_stage": y_predicted,
    }
    pd.DataFrame(columns).to_csv("output" + tag + ".csv", index=False)
    print("over")
def __gen_features():
    """Load the data, extract train/test features and optionally save them.

    When ``DEBUG_VARS['trim_data']`` is set, every split keeps only its
    first ``len // trim_data`` items. Returns
    ``(x_train, y_train, x_test, y_test)``.
    """
    # Get train and test data
    print("Loading train and test data")
    x_train, y_train, x_test, y_test = get_data(DATA_DIRS, SETTINGS['train_test_split'])

    trim = DEBUG_VARS['trim_data']
    if trim is not None:
        x_train, y_train, x_test, y_test = [
            part[:len(part) // trim]
            for part in (x_train, y_train, x_test, y_test)
        ]

    # Get features
    feature_settings = SETTINGS['feature']
    started = time.time()
    print('Extracting features from the train data')
    x_train = get_features(x_train, feature_settings)
    print('Extracting features from the test data')
    x_test = get_features(x_test, feature_settings)
    finished = time.time()
    print('Feature extraction took', round(finished - started, 4), 'Seconds')

    if feature_settings['save']:
        features_file = path.join(feature_settings['path'], feature_settings['name'])
        with open(features_file, 'wb') as f:
            print("Saving features.")
            features = {
                'x_train': x_train,
                'y_train': y_train,
                'x_test': x_test,
                'y_test': y_test
            }
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)
        # The feature settings are stored next to the features.
        with open(path.join(feature_settings['path'], 'settings.p'), 'wb') as f:
            pickle.dump(feature_settings, f, pickle.HIGHEST_PROTOCOL)

    return x_train, y_train, x_test, y_test
def train():
    """Extract the training features, save them and train the full model."""
    n_pts = None  # change to test on a subset of the data

    X_features = feat.get_features(f_train, n_pts)
    joblib.dump(X_features, "models/X_features.save")

    stages = y_train_raw["sleep_stage"]
    selected = stages[:] if n_pts is None else stages[:n_pts]

    # convert the output labels to strings
    y_train_vals = [str(label) for label in selected.values]

    # train model and save results
    print("training")
    train_model.train_full_model(X_features, y_train_vals)
def train_nn(save_model=False):
    '''
    Train the neural network model, evaluate it on the test set and plot
    the training history.

    :param save_model: True if model should be saved to file
    '''
    # TODO: Here you can modify the architecture of the neural network model
    # and experiment with different parameters
    model = Sequential()
    hidden_layer = Dense(
        1,  # TODO: Number of hidden layer neurons
        input_dim=len(get_features()),
        activation='relu')
    model.add(hidden_layer)
    # TODO: Possible to add additional neural network layers here
    # TODO: Use model.add(Dense(number of hidden layer neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # TODO: Optional; add early stopping as callback
    fit_result = model.fit(
        x=x_train,
        y=y_train,
        validation_data=[x_val, y_val],
        batch_size=50,  # Number of data samples to run through network before parameter update
        epochs=1,  # TODO: Number of times to run entire training set through network
        shuffle=True,
        callbacks=[])
    history = fit_result.history

    # Evaluate model on test set
    score = model.evaluate(x_test, y_test, batch_size=50)
    print('Test loss:%f' % (score[0]))
    print('Test accuracy:%f' % (score[1]))

    if save_model:
        model.save('./nn_model.h5')
        print("Model saved")

    plot_training_history(history)
def arguments(argv=None):
    """Parse the command line into tests, features and flags.

    :param argv: argument list to parse; None (the default) means
        ``sys.argv[1:]`` as it is when the function is called
    :return: ``(all_tests, features, verbose, force)`` where ``all_tests``
        is a tuple of the test objects looked up on the ``tests`` module
    :raises ValueError: if a requested test name does not exist
    """
    # Resolve the default at call time: a `sys.argv[1:]` default would be
    # frozen when the function is defined.
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser()

    names = ', '.join(tests.__all__)
    parser.add_argument(
        'tests', nargs='*',
        help='The list of tests to run. Tests are: ' + names)

    feature_names = ', '.join(FEATURES)
    parser.add_argument(
        '--features', default=[], action='append',
        help='A list of features separated by colons. Features are: ' +
        feature_names)

    parser.add_argument(
        '--force', '-f', action='store_true',
        help='Do not wait for a prompt')

    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='More verbose output')

    args = parser.parse_args(argv)

    # Look up each requested test; unknown names map to None.
    test_list = args.tests or tests.__all__
    all_tests = [(t, getattr(tests, t, None)) for t in test_list]
    bad_tests = [t for (t, a) in all_tests if a is None]
    if bad_tests:
        common.printer(test_list, all_tests, bad_tests)
        raise ValueError('Bad test names: ' + ', '.join(bad_tests))
    all_tests = tuple(a for (t, a) in all_tests)

    if args.features:
        # --features may be repeated and each value may hold several
        # colon-separated names; flatten them into one set.
        features = set(':'.join(args.features).split(':'))
        check_features(features)
    else:
        features = get_features()

    return all_tests, features, args.verbose, args.force
def test_classify(): feats = features.get_features() print feats good = 0 wrong = 0 spam_files = toolkit.get_files('spam/train') for sf in spam_files: if classify_wrap(sf, feats, 0): good += 1 else: wrong += 1 print "After SPAM: good: %d, wrong: %d" % (good, wrong) ham_files = toolkit.get_files('ham/train') for hf in ham_files: if not(classify_wrap(hf, feats, 0)): good += 1 else: wrong += 1 print "good: %d, wrong: %d" % (good, wrong)
def next_track(self, sleep_time=5.0):
    """
    Get the next song features.

    Takes the data from the queue (waits indefinitely if needed).
    ``sleep_time`` is the pause in seconds between attempts while waiting.
    """
    # get data: poll until something other than None is available
    while True:
        data = _get_data()
        # Identity test: `!= None` would go through the object's own
        # comparison operator, which need not return a plain bool.
        if data is not None:
            break
        time.sleep(sleep_time)
    self._nTracksGiven += 1
    # get features
    return features.get_features(data, pSize=self._pSize,
                                 usebars=self._usebars,
                                 keyInv=self._keyInv,
                                 songKeyInv=self._songKeyInv,
                                 positive=self._positive,
                                 do_resample=self._do_resample,
                                 partialbar=self._partialbar,
                                 btchroma_barbts=None)
def load(self):
    """Compute features for ``self.audio_path`` and record how many there are.

    Stores the result of ``get_features`` on ``self.features`` and its
    length on ``self.frame_cnt``.
    """
    extracted = get_features(self.audio_path)
    self.features = extracted
    self.frame_cnt = len(self.features)
import numpy as np, math
import world_cup
import features
import match_stats
import world_cup

history_size = 4
game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

# Everything except competition id 4 (the world cup) is club data.
# ``!=`` replaces the obsolete ``<>`` spelling of the same comparison.
club_data = data[data['competitionid'] != 4]

# Show the first feature row for competition id 4, which is the world cup.
print(data[data['competitionid'] == 4].iloc[0])

import power
# Pick up edits made to these modules since they were first imported.
reload(power)
reload(world_cup)


def points_to_sgn(p):
    """Map a points value to its sign, with a dead zone of +/-0.1.

    Returns 1.0 when ``p`` is above 0.1, -1.0 when it is below -0.1 and
    0.0 otherwise.
    """
    if p > 0.1:
        return 1.0
    elif p < -0.1:
        return -1.0
    else:
        return 0.0


power_cols = [
    ('points', points_to_sgn, 'points'),
]

power_data = power.add_power(club_data, game_summaries, power_cols)
power_data.to_csv('/tmp/out.csv', sep=';')
def consume(self, lang, source):
    """Record one labelled example built from ``source``.

    The source is first passed through ``strip_gubbins`` for the given
    language, then turned into a feature set with ``get_features``; the
    ``(featureset, lang)`` pair is appended to ``self.featuresets``.
    """
    cleaned = strip_gubbins(source, lang)
    labelled = (get_features(cleaned), lang)
    self.featuresets.append(labelled)
import numpy as np
import matplotlib.pyplot as plt
import data_io as dl
import metrics as m
import features as f
from definitions import target_fields
from sklearn import svm, cross_validation
from sklearn.ensemble import GradientBoostingRegressor
import pywt
import time

# Load the training set and derive the feature matrix from it.
data = dl.get_data('train')
spectra, targets = data['spectra'], data['targets']
x_train_all = f.get_features(data)

# One independent regressor per target, all with the same settings.
# Earlier configuration, kept for reference: svm.SVR with C=10000.0 for
# 'Ca', 'pH', 'SOC' and 'Sand', and C=5000.0 for 'P'.
clfs = {
    target: GradientBoostingRegressor(n_estimators=200)
    for target in ('Ca', 'P', 'pH', 'SOC', 'Sand')
}

mode = 'cv'
import numpy as np, math
import world_cup, features, match_stats, power
import world_cup, pandas as pd

history_size = 3
game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

# Everything except competition id 4 is treated as club data.
# ``!=`` replaces the obsolete ``<>`` spelling of the same comparison.
club_data = data[data['competitionid'] != 4]

# Widen pandas' display limits so large frames print in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Don't train on games that ended in a draw, since they have less signal.
train = club_data.loc[club_data['points'] != 1]
# NOTE(review): the next assignment replaces the filtered frame above, so
# draws ARE included in training.  Left as found; confirm which one is
# intended and delete the other.
train = club_data

(model, test) = world_cup.train_model(
    train, match_stats.get_non_feature_columns())
print("\nRsquared: %0.03g" % model.prsquared)


def print_params(model, limit=None):
    """Print the model's parameters, sorted in descending order.

    ``limit`` defaults to the number of parameters once 'intercept' has
    been removed.
    """
    params = model.params.copy()
    params.sort(ascending=False)
    del params['intercept']

    if not limit:
        limit = len(params)

    print("Positive features")
# Train one HMM per spoken word and record, for each word, the mean and
# standard deviation of the feature-sequence lengths of its samples.
models = []
means = []
std_devs = []
n_components = 3

for i in range(len(spoken)):
    model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

    sequences = []
    arr = []
    for j in range(n_samples):
        (rate, sig) = wav.read(fpaths[i][j])
        features = get_features(sig)
        sequences.append(features)
        arr.append(len(features))

    # Fit once on every sample of this word.  Calling fit() inside the loop
    # with a single sequence re-estimated the model from scratch each time,
    # so only the last sample influenced the final model.  This relies on
    # the same list-of-sequences fit() signature the original
    # ``fit([features])`` call already used.
    model.fit(sequences)

    models.append(model)
    means.append(np.mean(arr))
    std_devs.append(np.std(arr))

# Expected transcriptions for the test set, one whitespace-split line each.
with open('Test/'+test_folder+'/answer.txt') as answers:
    correct_answers = [entry.split() for entry in answers]

tot_words = len(correct_answers)
right = 0.0
def kernel_generate_fromcsv(
    input_path,
    input_csv_fname,
    output_fname,
    # --
    featfunc,
    # --
    simfunc = DEFAULT_SIMFUNC,
    kernel_type = DEFAULT_KERNEL_TYPE,
    nosphere = DEFAULT_NOSPHERE,
    # --
    variable_name = DEFAULT_VARIABLE_NAME,
    #input_path = DEFAULT_INPUT_PATH,
    # --
    overwrite = DEFAULT_OVERWRITE,
    noverify = DEFAULT_NOVERIFY,
    ):
    """Compute train/train and train/test kernels for a CSV split and save them.

    The train/test file names and labels come from
    ``csv2tt(input_csv_fname, input_path=input_path)``.  Features are
    obtained with ``get_features`` (using ``featfunc``, ``kernel_type`` and
    ``simfunc``), optionally sphered with vectors derived from the training
    features, and turned into two kernel matrices according to
    ``kernel_type`` ("dot", "ndot", "exp_mu_chi2" or "exp_mu_da").

    The result is written with ``io.savemat(..., format="4")`` as a dict
    with keys ``kernel_traintrain``, ``kernel_traintest``, ``train_labels``,
    ``test_labels``, ``train_fnames`` and ``test_fnames``.

    Parameters:
        input_path: passed to ``csv2tt`` when reading the split.
        input_csv_fname: CSV file describing the train/test split.
        output_fname: output file; ".mat" is appended if missing.
        featfunc, simfunc: forwarded to ``get_features``.
        kernel_type: must be in ``VALID_KERNEL_TYPES``.
        nosphere: when true, skip sphering of the features.
        variable_name: not referenced anywhere in this function body.
        overwrite: when false and the output exists, warn and return.
        noverify: when true, skip ``verify_fnames`` on the file names.

    Returns None.  An IOError raised while saving is printed, not re-raised.
    """

    assert(kernel_type in VALID_KERNEL_TYPES)

    # add matlab's extension to the output filename if needed
    if path.splitext(output_fname)[-1] != ".mat":
        output_fname += ".mat"

    # can we overwrite ?
    if path.exists(output_fname) and not overwrite:
        warnings.warn("not allowed to overwrite %s" % output_fname)
        return

    # --------------------------------------------------------------------------
    # -- get training and testing filenames from csv
    print "Processing %s ..." % input_csv_fname
    (train_fnames, train_labels, test_fnames, test_labels) = csv2tt(input_csv_fname, input_path=input_path)

    ntrain = len(train_fnames)
    ntest = len(test_fnames)

    # both halves of the split must be non-empty
    assert(ntrain>0)
    assert(ntest>0)

    if not noverify:
        all_fnames = sp.array(train_fnames+test_fnames).ravel()
        verify_fnames(all_fnames)

    # --------------------------------------------------------------------------
    # -- train x train
    train_features = get_features(train_fnames, featfunc, kernel_type, simfunc, info_str = 'training')

    if nosphere:
        sphere_vectors = None
    else:
        print "Sphering train features ..."
        # the sphering vectors come from the training features only and are
        # reused unchanged for the test features further down
        sphere_vectors = get_sphere_vectors(train_features)
        train_features = sphere_features(train_features, sphere_vectors)

    # XXX: this should probably be refactored in kernel.py
    print "Computing '%s' kernel_traintrain ..." % (kernel_type)
    if kernel_type == "dot":
        kernel_traintrain = dot_kernel(train_features)
    elif kernel_type == "ndot":
        kernel_traintrain = ndot_kernel(train_features)
    elif kernel_type == "exp_mu_chi2":
        chi2_matrix = chi2_kernel(train_features)
        # the training mean is kept and reused to scale the train/test kernel
        chi2_mu_train = chi2_matrix.mean()
        # NOTE(review): the expression string names the locals chi2_matrix and
        # chi2_mu_train; presumably `ne` resolves them from this scope, so
        # they must not be renamed -- confirm against the `ne` import.
        kernel_traintrain = ne.evaluate("exp(-chi2_matrix/chi2_mu_train)")
    elif kernel_type == "exp_mu_da":
        da_matrix = da_kernel(train_features)
        # same scheme as above: training mean reused for the test kernel
        da_mu_train = da_matrix.mean()
        kernel_traintrain = ne.evaluate("exp(-da_matrix/da_mu_train)")
    # sanity check: an all-zero kernel is rejected
    assert(not (kernel_traintrain==0).all())

    # --------------------------------------------------------------------------
    # -- train x test
    test_features = get_features(test_fnames, featfunc, kernel_type, simfunc, info_str = 'testing')

    if not nosphere:
        print "Sphering test features ..."
        test_features = sphere_features(test_features, sphere_vectors)

    # XXX: this should probably be refactored in kernel.py
    print "Computing '%s' kernel_traintest ..." % (kernel_type)
    if kernel_type == "dot":
        kernel_traintest = dot_kernel(train_features, test_features)
    elif kernel_type == "ndot":
        kernel_traintest = ndot_kernel(train_features, test_features)
    elif kernel_type == "exp_mu_chi2":
        chi2_matrix = chi2_kernel(train_features, test_features)
        # scaled by chi2_mu_train computed in the train x train step
        kernel_traintest = ne.evaluate("exp(-chi2_matrix/chi2_mu_train)")
    elif kernel_type == "exp_mu_da":
        da_matrix = da_kernel(train_features, test_features)
        # scaled by da_mu_train computed in the train x train step
        kernel_traintest = ne.evaluate("exp(-da_matrix/da_mu_train)")

    assert(not (kernel_traintest==0).all())

    # --------------------------------------------------------------------------
    # -- write output file

    # first make sure we don't record the original input_path
    # since this one could change
    train_fnames, _, test_fnames, _ = csv2tt(input_csv_fname)

    print
    print "Writing %s ..." % (output_fname)
    data = {"kernel_traintrain": kernel_traintrain,
            "kernel_traintest": kernel_traintest,
            "train_labels": train_labels,
            "test_labels": test_labels,
            "train_fnames": train_fnames,
            "test_fnames": test_fnames,
            }
    try:
        io.savemat(output_fname, data, format="4")
    except IOError, err:
        # a failed save is reported on stdout and otherwise ignored
        print "ERROR!:", err
def processing(frame, wframe):
    """Match a reference image in ``wframe`` and draw the result onto ``frame``.

    Features and descriptors are computed on ``wframe`` and verified against
    a hard-coded pattern image.  When at least 10 point pairs come back, a
    RANSAC homography is estimated, the matched points are drawn on
    ``frame`` (scaled from ``wframe`` coordinates), the pattern's rectangle
    is projected into the frame, and the region it covers is thresholded,
    contoured and shown in the "thresh" and "ROI" windows.

    ``frame`` is modified in place; nothing is returned.
    NOTE(review): the nesting of this block was reconstructed from collapsed
    text -- confirm it against the original file.
    """
    temp_feature = features.get_features(wframe)
    temp_descriptor = features.get_descriptor(wframe, temp_feature)
    # the reference image is hard-coded; the commented paths are alternatives
    points_list, patternimage_size = features.verify(temp_descriptor, temp_feature,
        #'/home/max/Pictures/logotipos/QR_Maxkalavera.png'
        '/home/max/Pictures/logotipos/fime.jpg'
        #'/home/max/Pictures/logotipos/logo006.jpg'
        )

    wframe_size = wframe.shape
    frame_size = frame.shape
    # per-axis scale factors from wframe to frame, built from shape[0] and shape[1]
    # NOTE(review): ratio[0] comes from shape[0] but is applied to pt[0] below;
    # if points are (x, y) and shape is (rows, cols) the two are swapped -- confirm.
    ratio = [float(frame_size[0])/wframe_size[0], float(frame_size[1])/wframe_size[1]]

    print "Number of points:", len(points_list[0])
    if len(points_list[0]) >= 10:
        # homography from points_list[1] to points_list[0]; m is the RANSAC mask
        (h, m) = cv2.findHomography( numpy.array(points_list[1]), numpy.array(points_list[0]), cv2.RANSAC, ransacReprojThreshold = 3.0)
        matches = m.ravel().tolist()

        # debug toggle: draw each point, coloured by whether its mask entry is set
        if True:
            for i in range(len(points_list[0])):
                pt = points_list[0][i]
                if matches[i] > 0:
                    cv2.circle(frame, (int(pt[0]*ratio[0]), int(pt[1]*ratio[1])), 3, (255,0,255), -1)
                else:
                    cv2.circle(frame, (int(pt[0]*ratio[0]), int(pt[1]*ratio[1])), 3, (0, 255,255), -1)

        # four corners of the pattern image, shaped for perspectiveTransform
        patternimage_rectsize = numpy.float32( [ [0, 0], [0, patternimage_size[0] - 1], [patternimage_size[1] - 1, patternimage_size[0] - 1], [patternimage_size[1] - 1, 0] ] ).reshape(-1,1,2)
        wframe_rounding_box = cv2.perspectiveTransform(patternimage_rectsize, h)

        # scale the projected corners to frame coordinates as integer tuples
        frame_rounding_box = list()
        for cord in wframe_rounding_box:
            cord = cord[0]
            frame_rounding_box.append(( int(cord[0]*ratio[0]), int(cord[1]*ratio[1]) ))

        # disabled debug drawing of the projected quadrilateral
        if False:
            cv2.line(frame, frame_rounding_box[0], frame_rounding_box[1], (255, 0, 0), 1, 8 , 0);
            cv2.line(frame, frame_rounding_box[1], frame_rounding_box[2], (255, 0, 0), 1, 8 , 0);
            cv2.line(frame, frame_rounding_box[2], frame_rounding_box[3], (255, 0, 0), 1, 8 , 0);
            cv2.line(frame, frame_rounding_box[3], frame_rounding_box[0], (255, 0, 0), 1, 8 , 0);

        x_pts = [x for x, y in frame_rounding_box]
        y_pts = [y for x, y in frame_rounding_box]

        try:
            # axis-aligned bounding region of the projected corners
            frame_roi = frame[min(y_pts):max(y_pts), min(x_pts):max(x_pts)]
            gray = cv2.cvtColor(frame_roi, cv2.COLOR_BGR2GRAY)
            # inverted binary threshold with Otsu's automatic level
            ret, thresh = cv2.threshold(gray,0,255,cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
            cv2.imshow("thresh", thresh)
            contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
            #temp, classified_points, means = cv2.kmeans(data=numpy.concatenate(contours), K=2, bestLabels=None,
            #    criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 1, 10), attempts=1,
            #    flags=cv2.KMEANS_RANDOM_CENTERS)
            # debug toggle: draw every contour in green
            if True:
                cv2.drawContours(frame_roi, contours, -1,(0,255,0),3)
            all_contour = numpy.concatenate(contours)
            hull = cv2.convexHull(all_contour)
            # disabled: hull of all contours combined
            if False:
                cv2.drawContours(frame_roi, hull, -1,(0,0,255), 4)
            # disabled: polygon approximation of the combined contour
            if False:
                approx = cv2.approxPolyDP(numpy.concatenate(all_contour), 0.1*cv2.arcLength(all_contour, True),True)
                cv2.drawContours(frame_roi, approx, -1,(255,0,0), 4)
            # draw the convex hull of each individual contour
            for cnt in contours:
                hull = cv2.convexHull(cnt)
                cv2.drawContours(frame_roi, hull, -1,(0,0,255), 2)
            cv2.imshow("ROI", frame_roi)
        except Exception,e:
            # any failure in the ROI stage is printed and the frame is left as is
            print str(e)
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Get the frames of all input sentences (printed for inspection).
fram=frames.get_frames()
print fram

# Get the top 2 frames for each process (printed for inspection).
#import features
feat=features.get_features()
print feat

def features_extract(df,df2):
    """Start a feature frame that copies selected columns from ``df``.

    The visible part creates a DataFrame with columns 'Process', 'Feature'
    and 'Label', then copies 'Process', 'Label' and 'Sentence' over from
    ``df``.  ``df2`` is not used in the visible part.
    """
    # Create a new data frame to hold the features.
    df6=pd.DataFrame(columns=['Process','Feature','Label'])
    df6['Process']=df['Process']
    df6['Label']=df['Label']
    df6['Sentence']=df['Sentence']
    #df6=df6[df6['Process']!='accumulation'].reset_index()
    #print df6
    # Set feature=1 if any of the top 2 frames are present in each sentence.
def _features():
    """Return features for the JSON-encoded query arguments as a JSON response.

    Reads the 'processed' and 'options' query arguments, decodes each from
    JSON, passes both to ``get_features`` and wraps the result under the
    'features' key.
    """
    query = request.args
    processed = json.loads(query.get('processed'))
    options = json.loads(query.get('options'))
    return jsonify(features=get_features(processed, options))
# Walk a directory tree, compute features for every delimited data file
# found, and append one tab-separated summary row per file to the output.
args = parser.parse_args()
dirname = args.dirname
outfilename = args.filename
total_files = 0   # files that matched one of the patterns below
total_parsed = 0  # files whose row was written without an exception
# opened in append mode, so rows from earlier runs are kept
with open(outfilename,"a") as of:
    for root,dirs,filenames in os.walk(dirname):
        for f in filenames:
            full_path = os.path.join(root,f)
            l = len(f)
            # comma-separated files: name longer than 4 chars ending in '.csv'
            if l > 4 and f[l-4:l] == '.csv':
                total_files = total_files + 1
                print "analyzing",os.path.join(root,f)
                try:
                    raw_data = loader.loadFile(full_path,delim=',',skip=1)
                    results = features.get_features(raw_data)
                    # one row: the path followed by the feature values in a fixed order
                    of.write('\t'.join([str(full_path),str(results['density']),str(results['density_minus_one']),str(results['density_all_nums']),str(results['density_strict']),str(results['fnumcols']),str(results['hasdate']),str(results['sum_covariance']),str(results['sum_abs_covariance']),str(results['max_abs_covariance']),str(results['total_unique_labels']),str(results['first_unique_labels'])])+'\n')
                    #print results
                    total_parsed = total_parsed + 1
                except Exception as e:
                    # a failing file is reported with its traceback and skipped
                    print "error occured while trying to parse",f,":"
                    print traceback.format_exc()
            # tab-separated files: '.tsv' names, or any file called 'data.txt'
            # NOTE(review): this branch repeats the one above with only the
            # delimiter changed; a shared helper would remove the duplication.
            elif (l > 4 and f[l-4:l] == '.tsv') or f == 'data.txt':
                total_files = total_files + 1
                print "analyzing",os.path.join(root,f)
                try:
                    raw_data = loader.loadFile(full_path,delim='\t',skip=1)
                    results = features.get_features(raw_data)
                    of.write('\t'.join([str(full_path),str(results['density']),str(results['density_minus_one']),str(results['density_all_nums']),str(results['density_strict']),str(results['fnumcols']),str(results['hasdate']),str(results['sum_covariance']),str(results['sum_abs_covariance']),str(results['max_abs_covariance']),str(results['total_unique_labels']),str(results['first_unique_labels'])])+'\n')
                    #print results
                    total_parsed = total_parsed + 1
print 'EN identifier =',identifier a,b,c,d,e = EXTRAS.get_our_analysis(identifier) segstart, chromas, beatstart, barstart, duration = a,b,c,d,e if segstart == None: print 'EN gave us None, must start again' sys.exit(0) analysis_dict = {'segstart':segstart,'chromas':chromas, 'beatstart':beatstart,'barstart':barstart, 'duration':duration} del a,b,c,d,e,segstart,chromas,beatstart,barstart,duration print 'analysis retrieved from Echo Nest' # features from online (positive=False to compare with old school method) online_feats = features.get_features(analysis_dict,pSize=8,usebars=2, keyInv=True,songKeyInv=False, positive=False,do_resample=True, btchroma_barbts=None) online_feats = online_feats[np.nonzero(np.sum(online_feats,axis=1))] print 'features from online computed, shape =',online_feats.shape # retrieve feature using TZAN and compare to what we got print 'comparing features from upload and online' print 'reuploading songfile =',songfile a,b,c,d,e = TZAN.get_en_feats(songfile) pitches, seg_start, beat_start, bar_start, duration = a,b,c,d,e print'number of segments (upload/online):',np.array(seg_start).shape,',',np.array(analysis_dict['segstart']).shape a = np.array(seg_start) b = np.array(analysis_dict['segstart']) assert a.shape == b.shape a - b assert np.abs(np.array(seg_start).flatten()-np.array(analysis_dict['segstart']).flatten()).max() < .001
if minj == letters.index(c): score += 1 print ''.join(pred) print ''.join(real) return means, stds, score/float(len(test)) if __name__ == '__main__': if len(sys.argv) != 4: print 'Usage: %s training|test soundf textf' % sys.argv[0] soundf = sys.argv[2] textf = sys.argv[3] rate, data, text = load_data(soundf, textf) starts, ends, chunks = get_chunk_starts(data) f = get_features(data, starts, ends, include_fft=True, include_cepstrum=True) if sys.argv[1] == 'training': means, stds, score = naive_bayes(text, f) print 'Naive Bayes', score logreg_score, logreg = logistic_test(text, f) svm_score, svm = svm_test(text, f) joblib.dump(logreg, 'cache/logistic.pkl') print 'Logistic test', logreg_score print 'SVM test', svm_score else: try: logreg = joblib.load('cache/logistic.pkl') except: print 'Run `%s training 7` to train your model first' % sys.argv[0]
def load_features():
    """Compute the features once and publish them in the global ``PC_3s``.

    Calls ``get_features`` with the module-level settings ``S_BEFORE``,
    ``S_AFTER``, ``SAMPLE_RATE`` and ``FPC``.
    """
    global PC_3s
    computed = get_features(S_BEFORE, S_AFTER, SAMPLE_RATE, FPC)
    PC_3s = computed