def main():
    detector = get_detector('yolo')
    while True:
        time.sleep(0.01)
        _queue = REDIS_DB.lrange(REDIS_QUEUE, 0, BATCH_SIZE - 1)
        for _q in _queue:
            all_features = {
                'primary': [],
                'object_bitmap': [0 for _ in range(len(detector.classes))]
            }
            _q = json.loads(_q.decode("utf-8"))
            img = utils.base64_decode(_q['image'], _q['shape'])
            all_features['secondary'] = extract_features(img.copy())
            response = detector.predict(img)
            for obj in response:
                box = obj['box']
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
                if x2 - x1 >= 75 and y2 - y1 >= 75:
                    features = extract_features(img[y1:y2, x1:x2])
                    all_features['primary'].append({
                        'features': features,
                        'label': obj['label'],
                        'name': obj['name'],
                        'box': obj['box']
                    })
                    all_features['object_bitmap'][obj['label']] = 1
            REDIS_DB.set(_q['id'], json.dumps(all_features))
        REDIS_DB.ltrim(REDIS_QUEUE, len(_queue), -1)

def generic_features(infile, outfile):
    '''Extracts features from any lyric dataset in the format of (title, artist, lyrics, label)'''
    lyric_dict = {}
    lyrics = []
    labels = []
    cfig = ['lem', 'plex']
    f = open(infile, 'rt', encoding="utf8")
    reader = csv.reader(f)
    for r in reader:
        lyric_dict[(r[0], r[1], r[3])] = r[2]  # (title, artist, label): lyrics
    for key in lyric_dict:
        lyrics.append(lyric_dict[key])
        labels.append(key[2])
    features, header = fe.extract_features(lyrics, cfig)
    f2 = open(outfile, 'w', encoding="utf8")
    f2.write('title,' + 'artist,' + ','.join(header) + ',' + 'popular' + '\n')
    for song, row in zip(lyric_dict.keys(), features):
        str_row = [str(i) for i in row]
        f2.write(song[0] + ',' + song[1] + ',' + ','.join(str_row) + ',' + song[2] + '\n')
    f.close()
    f2.close()
    return features, labels

def decade_features():
    '''Extracts features from the 2006-2015 lyrics dataset'''
    infile = "datasets/lyrics_10Year.csv"
    lyric_dict = {}
    lyrics = []
    cfig = ['lem', 'plex']
    f = open(infile, 'rt')
    reader = csv.reader(f)
    for r in reader:
        lyric_dict[(r[0], r[1])] = r[3]  # (title, artist): lyrics
    for key in lyric_dict:
        lyrics.append(lyric_dict[key])
    features, header = fe.extract_features(lyrics, cfig)
    f2 = open("datasets/nltk_10Year.csv", 'w')
    f2.write('title,' + 'artist,' + ','.join(header) + '\n')
    for song, row in zip(lyric_dict.keys(), features):
        str_row = [str(i) for i in row]
        f2.write(song[0] + ',' + song[1] + ',' + ','.join(str_row) + '\n')
    f.close()
    f2.close()
    return features

def main(): """ main function, called if module is run as main program""" # load raw csv file and extract relevant lines (marked by 'GEN' for general) with open('data/sarcasm_v2.csv') as datafile: raw_data = list(csv.reader(datafile)) data = [line[-1] for line in raw_data if line[0] == 'GEN'] labels = [line[1] for line in raw_data if line[0] == 'GEN'] # load config file with open('conf.txt') as conffile: conf_all = set(line.strip() for line in conffile) # compute score, for each line in the config individually and all together confs = [line for line in conf_all] confs.append([line for line in conf_all]) for conf in confs: print('computing score for: %s... ' % conf, end='') features = fe.extract_features(data, conf) score = cross_val_score(svm.SVC(), features, labels, scoring='accuracy', cv=10).mean() score = round(score, 3) print(score) with open('experiments2.csv', 'a') as f: f.write(";".join( ('{:%Y-%m-%d;%H:%M:%S}'.format(datetime.datetime.now()), str(conf), str(score), '\n')))
def extract_features_from_rawdata(chunk, header, period, features):
    with open(os.path.join(os.path.dirname(__file__),
                           "resources/channel_info.json")) as channel_info_file:
        channel_info = json.loads(channel_info_file.read())
    data = [convert_to_dict(X, header, channel_info) for X in chunk]
    return extract_features(data, period, features)

def search_windows(img, windows, clf, scaler, color_space='RGB',
                   patch_size=(64, 64), feature_extractor_params=None):
    if color_space != 'RGB':
        img = convert_color(img, color_space)
    # 1) Create an empty list to receive positive detection windows
    on_windows = []
    # 2) Iterate over all windows in the list
    for window in windows:
        # 3) Extract the test window from original image
        test_img = cv2.resize(
            img[window[0][1]:window[1][1], window[0][0]:window[1][0]],
            patch_size)
        # 4) Extract features for that window using single_img_features()
        features = extract_features(test_img, **feature_extractor_params)
        # 5) Scale extracted features to be fed to classifier
        test_features = scaler.transform(np.array(features).reshape(1, -1))
        # 6) Predict using your classifier
        prediction = clf.predict(test_features)
        # 7) If positive (prediction == 1) then save the window
        if prediction == 1:
            on_windows.append(window)
    # 8) Return windows for positive detections
    return on_windows

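# Hypothetical usage sketch for search_windows above: it expects a list of
# ((x1, y1), (x2, y2)) window corners. The helper below slides a fixed-size box over the
# lower part of a frame; clf, scaler, the parameter values and 'hog_channel' are assumptions,
# not part of the original code.
def slide_windows(img_shape, size=96, step=48, y_start=400):
    h, w = img_shape[:2]
    windows = []
    for y in range(y_start, h - size, step):
        for x in range(0, w - size, step):
            windows.append(((x, y), (x + size, y + size)))
    return windows

# windows = slide_windows(img.shape)
# hits = search_windows(img, windows, clf, scaler,
#                       feature_extractor_params={'hog_channel': 'ALL'})
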
def _classify_patch(self, patch):
    features = extract_features(patch, **self.feature_extractor_args)
    X = features.reshape(1, -1).astype(np.float64)
    if self.feature_scaler is not None:
        X = self.feature_scaler.transform(X)
    probabilities = self.classifier.predict_proba(X)[0]
    prediction = probabilities[1] > self.low_threshold
    return prediction, probabilities[1]

def final_train():
    train_kds, train_dss, train_labels = extract_features()
    test_kds, test_dss, test_labels = extract_features('testing')
    accuracies = []
    for _ in range(3):
        model = WindowBasedEnsembleClassifier(False, 40, 'gini', 8, 8, 192, 192,
                                              1.0 / 7, 1.0 / 7, 20)
        model.fit(train_kds, train_dss, train_labels)
        predicted = model.predict(test_kds, test_dss)
        acc = accuracy_score(test_labels, predicted)
        accuracies.append(acc)
        print("total accuracy: {}".format(acc))
        for i in range(10):
            t_ids = [m for m, t in enumerate(test_labels) if t == i]
            t_acc = accuracy_score(test_labels[t_ids], predicted[t_ids])
            print("number {} accuracy: {}".format(i, t_acc))
    print("mean accuracy: {}".format(np.mean(accuracies)))

def extract_features_from_text(text):
    num_hashtag = len(text.split('#')) - 1
    feature_dict = extract_features(("", text, 0, 0, num_hashtag, ""))
    return [
        feature_dict['num_propagation_words'], feature_dict['text_length'],
        feature_dict['num_hashtag'], feature_dict['num_powerful_words'],
        feature_dict['num_personal_pronoun']
    ]

def post(self):
    args = parser.parse_args()
    decodeit = open('./server_files/saveimg.jpeg', 'wb')
    decodeit.write(base64.b64decode(bytes(args["image"], 'utf-8')))
    decodeit.close()
    fe.extract_features()
    model = pickle.load(open("./xgboost.pkl", 'rb'))
    test = pd.read_csv('test.csv', header=None)
    x_test = np.array(test.iloc[1:, 1:10])
    y_test = np.array(test.iloc[1:, 11])
    res = model.predict(x_test)
    print(res)
    # model.predict returns an array; compare its first element (assumes a single test row)
    if res[0] == 0:
        class_ = "benign"
    else:
        class_ = "malicious"
    return {"class": class_}

def model_with_only_features():
    print('## Download and load the data')
    dataset = get_dataset()
    print('## Only separate using spaces')
    cleaned_X = [val.split(' ') for val in dataset['X']]
    print('## Feature Extraction')
    all_features_X = extract_features(cleaned_X, dataset['X'])
    print('## Training and Evaluation')
    train_the_model(all_features_X, dataset['Y'])

def model_with_all_preprocessing_and_features(no_bag_of_words=False):
    print('## Download and load the data')
    dataset = get_dataset()
    print('## Clean the data')
    cleaned_X = clean_data(dataset['X'])
    print('## Feature Extraction')
    all_features_X = extract_features(cleaned_X, dataset['X'],
                                      no_bag_of_words=no_bag_of_words)
    print('## Training and Evaluation')
    train_the_model(all_features_X, dataset['Y'])

def parameter_tuner():
    train_kds, train_dss, train_labels = extract_features()
    # print(np.bincount(train_labels.flatten()))
    # new_x = []
    # new_y = []
    # for i in range(train_kds.shape[0]):
    #     for j in range(len(train_kds[i])):
    #         t = train_kds[i][j]
    #         # new_x.append(train_dss[i][j])
    #         f = [t.pt[0], t.pt[1], t.angle, t.size, t.response]
    #         f.extend(train_dss[i][j])
    #         new_x.append(f)
    #         new_y.append(train_labels[i])
    # k = 50
    # new_x = np.array(new_x)
    # new_y = np.array(new_y)
    # # model = RandomForestClassifier()
    # model = KNeighborsClassifier(k, n_jobs=2)
    # model = KNN()
    # model = svm.LinearSVC(verbose=True, max_iter=10000)
    # model.fit(new_x, new_y)
    kf = KFold(n_splits=3, shuffle=True)
    accuracies = []
    pars = ['entropy', 'gini']
    for par in pars:
        print(par)
        r = []
        for train_index, test_index in kf.split(train_kds):
            print("TRAIN:", train_index, "TEST:", test_index)
            kds_temp_train, dss_temp_train, kds_temp_test, dss_temp_test = \
                train_kds[train_index], train_dss[train_index], \
                train_kds[test_index], train_dss[test_index]
            y_temp_train, y_temp_test = train_labels[train_index], train_labels[test_index]
            model = WindowBasedEnsembleClassifier(False, 40, par, 8, 8, 192, 192,
                                                  1.0 / 7, 1.0 / 7, 20)
            model.fit(kds_temp_train, dss_temp_train, y_temp_train)
            predicted = model.predict(kds_temp_test, dss_temp_test)
            acc = accuracy_score(y_temp_test, predicted)
            r.append(acc)
            print("total accuracy: {}".format(acc))
            for i in range(10):
                t_ids = [m for m, t in enumerate(y_temp_test) if t == i]
                t_acc = accuracy_score(y_temp_test[t_ids], predicted[t_ids])
                print("number {} accuracy: {}".format(i, t_acc))
        r = np.array(r)
        accuracies.append(r.mean())
    print("after")
    acc_max_arg = np.argmax(accuracies)
    print("best {}: {}".format(pars[acc_max_arg], accuracies[acc_max_arg]))
    for a, c in zip(accuracies, pars):
        print("{}: {}".format(c, a))

def sort_samples(samples_array, sorter):
    prot = feature_extractor.protocol_attr()
    for sample in samples_array:
        if sample.sampler_type not in sorted_samples:
            sorted_samples[sample.sampler_type] = {}
        brand_prod = sample.brand + "_" + sample.product
        if brand_prod not in sorted_samples[sample.sampler_type]:
            sorted_samples[sample.sampler_type][brand_prod] = {}
        for channel in sample.values.columns[1:]:
            if (sample.values[channel] == 0).all():
                continue
            card_channel = sample.card + "_" + channel
            if card_channel not in sorted_samples[sample.sampler_type][brand_prod]:
                sorted_samples[sample.sampler_type][brand_prod][card_channel] = []
            ch_data = channel_data()
            ch_data.sample_id = sample.ID
            ch_data.note = sample.note
            ch_data.tags = sample.tags
            ch_data.values = sample.values[["time", channel]]
            ch_data.values[channel] -= ch_data.values[channel][30]
            ch_data.values[channel] = signal_process.smooth(ch_data.values[channel])
            ch_data.derviate_1 = signal_process.get_derivative_1(ch_data.values[channel])
            ch_data.derviate_2 = signal_process.get_derivative_2(ch_data.values[channel])
            ch_data.picks_list = feature_extractor.get_picks_indexes(
                ch_data, 0, ch_data.values.size)
            ch_data.protocol = prot
            feature_extractor.extract_features(ch_data, prot)
            sorted_samples[sample.sampler_type][brand_prod][card_channel].append(ch_data)
    datestr = constants.get_date_str()
    features_results_dir = constants.path_result_dir + datestr + constants.path_features_dir
    features_file_name = features_results_dir + "features_" + "_" + datestr + ".csv"
    if not os.path.exists(features_results_dir):
        os.makedirs(features_results_dir)
    feature_extractor.flush_features_data_frame(features_file_name, sorter)

def main(): """ main function, called if module is run as main program""" if len(sys.argv) == 1: print('usage: bawe_gender_classifier.py <CORPUS_DIR> <CONF_FILE_NAME>') exit(0) data_dir = sys.argv[1] # read file list into dictionary mapping file id to gender with open(data_dir + '/BAWE_balanced_subset.csv', 'r') as gender_file: meta_lines = [line.rstrip().split(',') for line in gender_file] gender_dict = {row[0]: row[1] for row in meta_lines[1:]} # read essay contents and gender labels into lists essays = [] gender_labels = [] for student, gender in gender_dict.items(): with open('%s/%s.txt' % (data_dir, student)) as f: text = f.read() # remove vestigial xml text = re.sub('<[^<]+?>', '', text) essays.append(text) gender_labels.append(gender) # read conf file if len(sys.argv) > 2: with open(sys.argv[2]) as conf_file: conf_all = set(line.strip() for line in conf_file) else: conf_all = [] # compute score, for each line in the config individually and all together # note: preprocessing every time is very wasteful but i did not want to # change the feature_extractor interfaces from what was given # each line individually confs = [line for line in conf_all] if len(conf_all) > 1: # all lines together confs.append([line for line in conf_all]) for conf in confs: print('computing score for: %s... ' % (conf if conf else 'all features'), end='') features = fe.extract_features(essays, conf) score = cross_val_score(GaussianNB(), features, gender_labels, scoring='accuracy', cv=10).mean() score = round(score, 3) print(score) with open('experiments.csv', 'a') as f: f.write(";".join( ('{:%Y-%m-%d;%H:%M:%S}'.format(datetime.datetime.now()), str(conf), str(score), '\n')))
def query(self, model_path, n_samples_query, n_results, custom=False, weights=False):
    vertices, element_dict, info = read_model(model_path)
    shape = Shape(vertices, element_dict, info)
    shape = process(shape, n_vertices_target=self.n_vertices_target)
    feature_dict = extract_features(shape, self.n_bins, n_samples=n_samples_query)
    feature_df = data_dict_parser(feature_dict)
    feature_df, _ = sample_normalizer(
        feature_df, *self.sample_normalization_parameters,
        divide_distributions=self.divide_distributions)
    feature_df_numeric = feature_df.select_dtypes(np.number)
    # Make sure columns are identical and ordered
    assert list(feature_df_numeric.columns) == list(self.df_numeric.columns), "Column mismatch!"
    query_vector = feature_df_numeric.iloc[0, :].values.astype(np.float32)
    if not custom:
        distances, indices = self.faiss_knn.query(query_vector, n_results)
    else:
        distances, indices = self.custom_knn.query(query_vector, n_results, weights=weights)
    distances = distances.flatten().tolist()  # Flatten batch dimension
    indices = indices.flatten().tolist()
    df_slice = self.df[self.df.index.isin(indices)]
    df_slice['distance'] = df_slice.index.map(lambda x: distances[indices.index(x)])
    # Add missing data to query df
    feature_df['file_name'] = str(model_path)
    feature_df['classification'] = 'query_input'
    feature_df['distance'] = 0  # Put it at top of slice
    df_slice = pd.concat([df_slice, feature_df])
    df_slice = df_slice.sort_values('distance')
    return distances, indices, df_slice

def process_subset(self, file_list, apply_processing, n_vertices_target, n_bins, process_index):
    print(f'{process_index} : Starting subset processor!')
    data_subset = {k: [] for k in self.columns + self.col_array}
    for index, file in enumerate(file_list):
        if index % 50 == 0:
            print(f'{process_index} : Is at {index}/{len(file_list)}!')
        vertices, element_dict, info = read_model(Path(file))
        shape = Shape(vertices, element_dict, info)
        if apply_processing:
            shape = process(shape, n_vertices_target=n_vertices_target)
        else:
            shape.make_pyvista_mesh()
        id = os.path.basename(file).split(".")[0].replace("m", "")
        if id in self.classification_dict.keys():
            classification = self.classification_dict[id]
        else:
            classification = None
        data_subset["classification"].append(classification)
        data_subset["file_name"].append(file)
        # Get features
        feature_dict = extract_features(shape, n_bins=n_bins, n_samples=self.n_samples)
        # Add them to total data
        for key, val in feature_dict.items():
            data_subset[key].append(val)
    print(f'{process_index} : Finished!')
    return data_subset

def classify_sentences(list_of_sentences, model_folder):
    # Create feature file
    index_features_filename = os.path.join(model_folder, __name_index)
    model_filename = os.path.join(model_folder, __name_model)

    feature_index = Cfeature_index()
    feature_index.load_from_file(index_features_filename)

    ################################################
    # Create the feature file for classification
    ################################################
    my_feat_file = NamedTemporaryFile(delete=False)
    for list_of_tokens in list_of_sentences:
        these_feats = extract_features(list_of_tokens)
        feature_index.encode_example_for_classification(these_feats, my_feat_file)
    my_feat_file.close()

    ################################################
    # Run the classifier svm_classify
    # SVM values will be a list of floats, one per sentence, with the value assigned by the SVM
    svm_values = run_svm_classify(my_feat_file.name, model_filename)
    os.remove(my_feat_file.name)
    return svm_values

"education": [0], "marital-status": [0], "occupation": [0], "relationship": [0], "race": [0], "sex": [0], "native-country": [0], } def sigmoid(z): sig = 1 / (1.0 + np.exp(-z)) return np.clip(sig, 0.00000000000001, 0.99999999999999) training_x, training_y, testing_x = feature_extractor.extract_features( sys.argv[1], sys.argv[2], feature_config, True) num_training_data = training_x.shape[0] num_features = training_x.shape[1] true_training_x = [] false_training_x = [] for i, row in enumerate(training_x.tolist()): if training_y[i, 0] == 1.0: true_training_x.append(row) else: false_training_x.append(row) true_training_x = np.matrix(true_training_x, dtype=np.float64) false_training_x = np.matrix(false_training_x, dtype=np.float64)
def test_bounding_box_volume(bounding_box_volume):
    # Bounding box volume should be the same as the volume for a cube
    assert np.abs(bounding_box_volume - 8) < 10e-3, "Bounding box volume test failed"


def test_bounding_box_ratio(bounding_box_ratio):
    # Ratio should be 1 since the bounding box is a cube
    assert np.abs(bounding_box_ratio - 1) < 10e-3, "Bounding box ratio test failed"


def test_diameter(diameter):
    # Cube is already convex, so the diameter is just the distance between opposing corners
    dist = np.linalg.norm(np.array([-1, -1, -1]) - np.array([1, 1, 1]))
    assert np.abs(dist - diameter) < 10e-3, "Diameter test failed"


if __name__ == '__main__':
    # cube.off is a 2x2x2 cube
    path = Path(r"data/cube.off")
    reader = FileReader()
    vertices, element_dict, info = reader.read(path)
    shape = Shape(vertices, element_dict, info)
    feature_dict = extract_features(shape, n_bins=10, n_samples=1e+6)
    test_volume(feature_dict['volume'])
    test_surface_area(feature_dict['surface_area'])
    test_bounding_box_volume(feature_dict['bounding_box_volume'])
    test_bounding_box_ratio(feature_dict['bounding_box_ratio'])
    test_diameter(feature_dict['diameter'])

def load_dataset(label, path, shuffle=False):
    messages = load_messages(path)
    if shuffle:
        random.shuffle(messages)
    return [(extract_features(msg), label) for msg in messages]

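# Hypothetical usage sketch for load_dataset above: the (feature_dict, label) tuples it
# returns match the labeled-featureset format that NLTK-style classifiers consume (assuming
# extract_features returns a dict, as in the is_spam snippet later). The corpus paths and
# the 80/20 split are made up for illustration.
import random
import nltk

spam = load_dataset('spam', 'data/spam')                 # hypothetical corpus path
ham = load_dataset('ham', 'data/ham', shuffle=True)      # hypothetical corpus path
labeled = spam + ham
random.shuffle(labeled)
split = int(0.8 * len(labeled))
classifier = nltk.NaiveBayesClassifier.train(labeled[:split])   # train on 80%
print(nltk.classify.accuracy(classifier, labeled[split:]))      # evaluate on the rest
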
for example in examples:
    if not (example == "count.txt"):
        examples_path = classes_path + "/" + example
        images_names = os.listdir(examples_path)
        ratio_step = int((images_names.__len__() - 1) / 35)
        for w in range(0, 35):
            h = w * ratio_step
            if not (images_names[h].__len__() == 14):  # in the case of a non-image pick
                h += 1
            images_path = examples_path + "/" + images_names[h]
            features = extractor.extract_features(images_path)
            for attribute in attributes:
                data_temp.append(features[attribute])
            print(images_path)
            data_temp.append(class_id)
            data.append(list(data_temp))
            del data_temp[:]  # clear
if (class_name != "count.txt"):
    class_id = class_id + 1

# -*- coding: UTF-8 -*-
import numpy
from sklearn import preprocessing


def reform(datasets):
    new_datasets = []
    scaler = None
    for dataset in datasets:
        new_dataset_x = []
        new_dataset_y = []
        for x, y in zip(dataset[0], dataset[1]):
            for i in range(0, len(x) / 10 * 10, 10):
                new_dataset_x.append(x[i:i + 10, :].flatten())
                new_dataset_y.append(y)
        new_dataset_x = numpy.asarray(new_dataset_x)
        new_dataset_y = numpy.asarray(new_dataset_y)
        new_datasets.append((new_dataset_x, new_dataset_y))
    return tuple(new_datasets)


if __name__ == '__main__':
    from loader import load_data
    from feature_extractor import extract_features

    datasets = extract_features(load_data()[0])
    new_datasets = reform(datasets)
    print new_datasets[0][0][0].shape

        clustered_junction_file = sys.argv[sys.argv.index('-exf') + 3]
        intersection_db = sys.argv[sys.argv.index('-exf') + 4]
        out_file = sys.argv[sys.argv.index('-exf') + 5]
        file = open(clustered_junction_file, 'r')
        clustered_junctions = json.loads(file.read())
        file.close()
        from feature_extractor import extract_features
        pickle_in = open(grid_file, "rb")
        l = pickle.load(pickle_in)
        pickle_in.close()
        file = open(trail_file)
        data = json.loads(file.read())
        file.close()
        extract_features(l, data, os.path.join(out_folder, out_file),
                         clustered_junctions, intersection_db, way_folder)
    except Exception as e:
        raise e

# Creating a grid of the clustered junctions. This is used to efficiently check whether, at a
# given point in a trail, we are in the proximity of a junction or not. This section creates a
# model and saves it in a file named after the second command line parameter. The model, when
# loaded, provides a method that takes the current lat and long as input, along with the
# distance in meters that defines the notion of proximity. The method returns False if the
# given point is not near any of the junctions, and a tuple (True, lat, lng) otherwise, where
# lat and lng are the latitude and longitude of the junction point the current point is
# nearest to.
if '-l' in sys.argv:
    try:
        inp_file = open(sys.argv[sys.argv.index('-l') + 1])
        out_file_name = sys.argv[sys.argv.index('-l') + 2]
        junction_info = json.loads(inp_file.read())

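# The grid model described in the comment above is not included in this excerpt. The
# following is a hypothetical sketch of that idea: bucket junction coordinates into fixed
# lat/lng cells so that a proximity query only has to look at the neighbouring cells. All
# names here are illustrative, and it assumes max_dist_m is no larger than one cell width.
import math


def _haversine_m(lat1, lng1, lat2, lng2):
    # great-circle distance in meters
    r = 6371000.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lng2 - lng1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))


class JunctionGrid(object):
    def __init__(self, junctions, cell_deg=0.001):
        # junctions: iterable of (lat, lng); a cell of ~0.001 degrees is roughly 100 m
        self.cell_deg = cell_deg
        self.cells = {}
        for lat, lng in junctions:
            key = (math.floor(lat / cell_deg), math.floor(lng / cell_deg))
            self.cells.setdefault(key, []).append((lat, lng))

    def near_junction(self, lat, lng, max_dist_m):
        # returns False, or (True, junction_lat, junction_lng) for the nearest junction
        ci = math.floor(lat / self.cell_deg)
        cj = math.floor(lng / self.cell_deg)
        best = None
        for di in (-1, 0, 1):
            for dj in (-1, 0, 1):
                for jlat, jlng in self.cells.get((ci + di, cj + dj), []):
                    d = _haversine_m(lat, lng, jlat, jlng)
                    if d <= max_dist_m and (best is None or d < best[0]):
                        best = (d, jlat, jlng)
        return False if best is None else (True, best[1], best[2])
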
def err(loss):
    return np.sum(np.abs(loss))


def cross_entropy(y_hat, y):
    return np.sum(-(y_hat.transpose().dot(np.log(y)) +
                    (1 - y_hat).transpose().dot(np.log(1 - y))))


model_file_name = "./model"
is_model_existed = os.path.isfile(model_file_name)
if not is_model_existed:
    training_x, training_y, testing_x = feature_extractor.extract_features(
        sys.argv[1], sys.argv[2], feature_config, is_normalized)
    num_validating_data = training_x.shape[0] // 10
    validating_x = training_x[:num_validating_data]
    validating_y = training_y[:num_validating_data]
    training_x = training_x[num_validating_data:]
    training_y = training_y[num_validating_data:]
    num_training_data = training_x.shape[0]
    num_features = training_x.shape[1]
    weights = [0.0 for _ in range(num_features)]
    # Training parameters
    num_iterations = 5e5
    learning_rate = 1e1
    is_regularized = True

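# Illustrative only: a plain batch-gradient-descent update for logistic regression, to show
# how weights like those above could be fit. It assumes training_x has shape (N, F) and
# training_y has shape (N, 1) with 0/1 labels; the actual training loop of this script is not
# shown in this excerpt, so the function and its defaults are assumptions.
import numpy as np


def train_logistic(training_x, training_y, num_iterations=1000, learning_rate=0.1, l2=0.0):
    w = np.zeros((training_x.shape[1], 1))
    b = 0.0
    n = training_x.shape[0]
    for _ in range(int(num_iterations)):
        # the clipped sigmoid defined earlier could be used here instead
        y_pred = 1.0 / (1.0 + np.exp(-(training_x.dot(w) + b)))
        grad_w = training_x.T.dot(y_pred - training_y) / n + l2 * w
        grad_b = np.sum(y_pred - training_y) / n
        w -= learning_rate * grad_w
        b -= learning_rate * grad_b
    return w, b
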
def axis_features(idx):
    axisname = data.dtype.names[idx]
    col = data[axisname]
    return extract_features(None, col, self.config, self.section)

        '--load',
        action='store_true',
        help='include to load models from <expdir> instead of training new')
    return vars(parser.parse_args())


if __name__ == "__main__":
    args = handle_args()
    print(args)
    header, data = load_data(args['datafile'])
    labels = numpy.asarray([[line[i] for i in class_idx] for line in data]).T.tolist()
    conf = load_conf_file(args['conffile'])
    # this will only pass the status update text to the feature extractor
    features = fe.extract_features([line[1] for line in data], conf)
    if not args['load']:
        # train new models, evaluate, store
        for i in range(len(class_idx)):
            trait = header[class_idx[i]]
            clf = svm.SVC(class_weight='balanced').fit(features, labels[i])
            predicted = cross_val_predict(clf, features, labels[i], cv=10, n_jobs=-1)
            print("%s: %.2f" % (header[class_idx[i]],
                                metrics.accuracy_score(labels[i], predicted)))
            with open("%s/%s.pkl" % (args['expdir'], trait), 'wb') as f:

def main():
    area = sys.argv[1]  # 'rome' 'tuscany' 'london'
    type_user = sys.argv[2]  # 'crash' 'nocrash'
    overwrite = int(sys.argv[3])

    country = 'uk' if area == 'london' else 'italy'
    min_length = 1.0
    min_duration = 60.0

    print(datetime.datetime.now(), 'Crash Prediction - Train Test Partitioner')
    if not overwrite:
        print(datetime.datetime.now(), '(restart)')

    path = './'
    path_imn = path + 'imn_new/'
    path_dataset = path + 'dataset/'
    path_traintest = path + 'traintest/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    if area == 'london' and type_user == 'nocrash':
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, 'all')
        users_filename_crash = path_dataset + '%s_%s_users_list.csv' % (area, 'crash')
    else:
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, type_user)
        users_filename_crash = None

    users_list = pd.read_csv(users_filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)

    print(datetime.datetime.now(), 'Reading quadtree')
    quadtree_poi_filename = path_quadtree + '%s_personal_osm_poi_lv17.json.gz' % area
    fout = gzip.GzipFile(quadtree_poi_filename, 'r')
    quadtree = json.loads(fout.readline())
    fout.close()

    print(datetime.datetime.now(), 'Reading quadtree features')
    quadtree_features_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    fout = gzip.GzipFile(quadtree_features_filename, 'r')
    quadtrees_features_str = json.loads(fout.readline())
    quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
    fout.close()

    processed_users = set()
    if overwrite:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.exists(output_filename):
                os.remove(output_filename)
    else:
        processed_users = set()
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.isfile(output_filename):
                fout = gzip.GzipFile(output_filename, 'r')
                for row in fout:
                    customer_obj = json.loads(row)
                    processed_users.add(customer_obj['uid'])
                fout.close()

    window = 4
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')

    print(datetime.datetime.now(), 'Generating month boundaries')
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-window], months[window:])]

    training_months = list()
    test_months = list()
    for i in range(len(boundaries) - 1):
        training_months.append(boundaries[i])
        test_months.append(boundaries[i + 1])

    index = 0
    tr_data_map = dict()
    ts_data_map = dict()
    for tr_months, ts_months in zip(training_months, test_months):
        tr_data_map[tuple(tr_months)] = index
        ts_data_map[tuple(ts_months)] = index
        index += 1

    print(datetime.datetime.now(), 'Initializing quadtree features')
    tr_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in tr_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in tr_quadtree_features:
                    tr_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in tr_quadtree_features[index]:
                        tr_quadtree_features[index][path] = {
                            'nbr_traj_start': 0, 'nbr_traj_stop': 0, 'nbr_traj_move': 0,
                            'traj_speed_sum': 0, 'traj_speed_count': 0,
                            'nbr_evnt_A': 0, 'nbr_evnt_B': 0, 'nbr_evnt_C': 0, 'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0, 'nbr_evnt_stop': 0,
                            'speed_A_sum': 0, 'max_acc_A_sum': 0, 'avg_acc_A_sum': 0,
                            'speed_B_sum': 0, 'max_acc_B_sum': 0, 'avg_acc_B_sum': 0,
                            'speed_C_sum': 0, 'max_acc_C_sum': 0, 'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0, 'max_acc_Q_sum': 0, 'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        tr_quadtree_features[index][path][k] += v

    ts_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in tr_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in ts_quadtree_features:
                    ts_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in ts_quadtree_features[index]:
                        ts_quadtree_features[index][path] = {
                            'nbr_traj_start': 0, 'nbr_traj_stop': 0, 'nbr_traj_move': 0,
                            'traj_speed_sum': 0, 'traj_speed_count': 0,
                            'nbr_evnt_A': 0, 'nbr_evnt_B': 0, 'nbr_evnt_C': 0, 'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0, 'nbr_evnt_stop': 0,
                            'speed_A_sum': 0, 'max_acc_A_sum': 0, 'avg_acc_A_sum': 0,
                            'speed_B_sum': 0, 'max_acc_B_sum': 0, 'avg_acc_B_sum': 0,
                            'speed_C_sum': 0, 'max_acc_C_sum': 0, 'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0, 'max_acc_Q_sum': 0, 'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        ts_quadtree_features[index][path][k] += v

    print(datetime.datetime.now(), 'Connecting to database')
    con = database_io.get_connection()
    cur = con.cursor()

    count = 0
    imn_filedata = gzip.GzipFile(path_imn + '%s_imn_%s.json.gz' % (area, type_user), 'r')

    print(datetime.datetime.now(), 'Calculating features and partitioning dataset')
    for row in imn_filedata:
        if len(row) <= 1:
            print('new file started ;-)')
            continue
        user_obj = json.loads(row)
        uid = user_obj['uid']
        count += 1
        if uid in processed_users:
            continue
        if count % 10 == 0:
            print(datetime.datetime.now(),
                  'train test partition %s %s [%s/%s] - %.2f' % (
                      area, type_user, count, nbr_users, 100 * count / nbr_users))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        trajectories = imh['trajectories']

        tr_data = dict()
        ts_data = dict()

        # partitioning imn for train and test
        for imn_months in user_obj:
            if imn_months == 'uid':
                continue
            # print(imn_months)
            m0 = int(imn_months.split('-')[0])
            m1 = int(imn_months.split('-')[1])
            for lu, index in tr_data_map.items():
                if lu[0].month <= m0 < m1 < lu[1].month:
                    if index not in tr_data:
                        tr_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    tr_data[index]['imns'][imn_months] = user_obj[imn_months]
            for lu, index in ts_data_map.items():
                if lu[0].month <= m0 < lu[1].month:
                    if index not in ts_data:
                        ts_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    ts_data[index]['imns'][imn_months] = user_obj[imn_months]

        # partitioning trajectories for train and test
        for tid, traj in trajectories.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in tr_data:
                    tr_data[index]['trajectories'][tid] = traj
            for lu, index in ts_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in ts_data:
                    ts_data[index]['trajectories'][tid] = traj

        # partitioning events for train and test
        for eid, evnt in events.items():
            # print(evnt)
            for lu, index in tr_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in tr_data:
                    tr_data[index]['events'][eid] = evnt[0]
            for lu, index in ts_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in ts_data:
                    ts_data[index]['events'][eid] = evnt[0]

        # get has crash next month
        for lu, index in tr_data_map.items():
            if index not in tr_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s'
                AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')
                AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            tr_data[index]['crash'] = has_crash_next_month

        for lu, index in ts_data_map.items():
            if index not in ts_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s'
                AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')
                AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            ts_data[index]['crash'] = has_crash_next_month

        tr_features, ts_features = feature_extractor.extract_features(
            uid, tr_data, ts_data, quadtree, tr_quadtree_features, ts_quadtree_features)

        for index in tr_features:
            if index in ts_features:
                output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                    area, type_user, index)
                store_obj = {
                    'uid': uid,
                    'train': tr_features[index],
                    'test': ts_features[index]
                }
                feature_extractor.store_features(output_filename, store_obj)

    imn_filedata.close()

[timestamps, action_history, annotations, feedbacks, durations, rows] = \
    load_path(sys.argv[1])
records = []
valid_rows = []
for i in range(len(action_history)):
    print i
    record = {}
    actions = convert_paths(action_history[i], timestamps[i])
    if len(actions) == 0:
        print "Reply " + str(i) + " corrupted."
        continue
    record["actions"] = actions
    record["strokes"] = annotations[i]
    record["feedback"] = feedbacks[i]
    record["duration"] = durations[i]
    records.append(record)
    valid_rows.append(rows[i])
    if len(records) == 60:
        break
print "Successfully converted " + str(len(records)) + " replays."
with open((sys.argv[1].split(".")[0] + "_records.txt").format(i), 'w') as f:
    f.write(json.dumps(records))
feature_extractor.extract_features(valid_rows, sys.argv[2])

def is_spam(tweet, lang):
    if lang != 'en':
        return False
    return 'spam' == CLASSIFIER_EN.classify(extract_features(tweet['text']))

    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mse', optimizer=sgd)
    return model


from loader import load_data
from feature_extractor import extract_features
from utils import reform

if __name__ == '__main__':
    print "... loading data"
    datasets, n_classes = load_data()
    print "... extracting features"
    datasets = extract_features(datasets)
    print "... reforming data"
    train_set, test_set = reform(datasets)

    print "*********************"
    print "   Model 1 - MLP"
    print "*********************"
    print "... building MLP"
    mlp = MLP(n_classes)
    print "... training MLP"
    mlp.fit(train_set[0], np_utils.to_categorical(train_set[1], n_classes),
            batch_size=10, nb_epoch=5, show_accuracy=True)
    print "... evaluating MLP"

def train_classifier_from_folders(list_folders, output_folder):
    # Create the training feature file with format:
    # CLASS label FEAT FEAT FEAT
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.mkdir(output_folder)
    features_folder = os.path.join(output_folder, 'features')
    os.mkdir(features_folder)

    feature_filename = os.path.join(output_folder, __name_feature)
    fd_feats = open(feature_filename, 'w')
    for f in list_folders:
        this_pid = os.fork()
        if this_pid == 0:
            print 'Extracting features from', f, ' '
            while True:
                sys.stdout.write('.')
                sys.stdout.flush()
                time.sleep(1)
        else:
            # For each sent file
            for sent_file in glob.glob(os.path.join(f, '*.sents')):
                fin = open(sent_file, 'r')
                for line in fin:
                    fields = line.decode('utf-8').strip().split(' ')
                    class_label = fields[0]
                    ## Convert + - to +1 -1
                    if class_label[0] == '+':
                        class_label = '+1'
                    elif class_label[0] == '-':
                        class_label = '-1'
                    tokens = fields[1:]
                    these_feats = extract_features(tokens)
                    write_to_output(class_label, these_feats, fd_feats)
                fin.close()
            ## Done
            os.kill(this_pid, signal.SIGTERM)
            print
    fd_feats.close()
    print 'Feature file:', feature_filename

    ## Convert this feature file into the index file
    training_filename = os.path.join(output_folder, __name_training)
    fd_train = open(training_filename, 'w')
    feature_file_obj = Cfeature_file(feature_filename)
    index_features = Cfeature_index()
    index_features.encode_feature_file_to_svm(feature_file_obj, out_fic=fd_train)
    print 'Training instances saved to ', training_filename
    fd_train.close()

    # Save the index of features that will be used for the classification
    index_filename = os.path.join(output_folder, __name_index)
    index_features.save_to_file(index_filename)
    print 'Index of features saved to', index_filename

    # Train the model using the file training_filename
    model_filename = os.path.join(output_folder, __name_model)
    params = '-c 0.5 -x 1'
    run_svmlight_learn(training_filename, model_filename, params)
    print 'Model trained and saved to', model_filename

def main(_):
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
    if FLAGS.cfg_file is None:
        raise ValueError('You must supply the cfg file !')

    cfg = _cfg_from_file(FLAGS.cfg_file)
    train_cfg = cfg['train']

    # print all configs
    print('############################ cfg ############################')
    for k in cfg:
        print('%s: %s' % (k, cfg[k]))

    tf.logging.set_verbosity(tf.logging.INFO)

    #######################################################################
    ##############           single GPU version             ##############
    #######################################################################

    #### get dataset ####
    cls_dataset = dataset.get_dataset(dataset_folder=cfg['dataset_folder'],
                                      split=train_cfg['train_split'],
                                      cfg=train_cfg['dataset_opt'])

    #### build training dataset pipeline ####
    im_batch, label_batch = dataset.build_input_pipline(
        phase='train',
        dataset=cls_dataset,
        min_resize_value=cfg.get('min_resize_value', None),
        max_resize_value=cfg.get('max_resize_value', None),
        # train cfgs:
        batch_size=train_cfg['batch_size'],
        num_epoch=int(
            math.ceil(
                float(train_cfg['iters']) * train_cfg['batch_size'] /
                cls_dataset.num_examples)),
        shuffle=True,
        aug_opt=train_cfg.get('aug_opt', None),
        crop_size=cfg['corp_size'],
    )

    #### get logits ####
    logits, endpoints = feature_extractor.extract_features(
        images=im_batch,
        num_classes=cls_dataset.num_classes,
        output_stride=cfg['output_stride'],
        global_pool=True,
        model_variant=cfg['model_variant'],
        weight_decay=train_cfg.get('weight_decy', 0),
        dropout_keep_prob=train_cfg.get('dropout_keep_prob', 1.0),
        regularize_depthwise=train_cfg.get('regularize_depthwise', False),
        reuse=tf.AUTO_REUSE,
        is_training=True,
        fine_tune_batch_norm=train_cfg.get('fine_turn_batch_norm', False),
        cfg=cfg)

    #### build loss ####
    total_loss = build_loss(logits=logits,
                            labels=label_batch,
                            endpoints=endpoints,
                            loss_opt=train_cfg['loss_opt'])

    #### build optimizer ####
    global_step = slim.create_global_step()
    learning_rate = _configure_learning_rate(
        num_samples_per_epoch=cls_dataset.num_examples,
        global_step=global_step,
        train_cfg=train_cfg)
    optimizer = _configure_optimizer(
        learning_rate=learning_rate,
        train_cfg=train_cfg,
    )

    #### build train tensor ####
    grads_and_vars = optimizer.compute_gradients(
        loss=total_loss,
        var_list=_get_variables_to_train(train_cfg=train_cfg),
    )
    grad_updates = optimizer.apply_gradients(grads_and_vars=grads_and_vars,
                                             global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # batch norm
    update_ops.append(grad_updates)
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
        train_tensor = tf.identity(total_loss, name='train_op')

    #### add summaries ####
    # Add summaries for model variables.
    for model_var in slim.get_model_variables():
        tf.summary.histogram(model_var.op.name, model_var)
    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        tf.summary.scalar('losses/%s' % loss.op.name, loss)
    if train_cfg['loss_opt'].get('use_reg_loss', False):
        tf.summary.scalar(
            'losses/reg_loss',
            tf.get_default_graph().get_tensor_by_name('make_total_loss/reg_loss:0'))
    if train_cfg['loss_opt'].get('use_aux_loss', False):
        tf.summary.scalar(
            'losses/aux_loss',
            tf.get_default_graph().get_tensor_by_name('make_total_loss/aux_loss/value:0'))
    tf.summary.scalar(
        'total_loss',
        tf.get_default_graph().get_tensor_by_name('make_total_loss/total_loss:0'))

    # merge all summaries
    merged_summaries = tf.summary.merge_all()
    summaries_writer = tf.summary.FileWriter(logdir=FLAGS.output_dir,
                                             graph=tf.get_default_graph())

    #### set up session config ####
    # savers:
    model_variables = slim.get_model_variables()
    model_variables.append(tf.train.get_or_create_global_step())
    for mv in model_variables:
        print(mv.op.name)
    ckpt_saver = tf.train.Saver(var_list=model_variables, max_to_keep=10)
    new_ckpt_path = os.path.join(FLAGS.output_dir, cfg['model_variant'] + '.ckpt')
    save_ckpt_every = train_cfg.get('save_ckpt_every', 5000)

    # session config:
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              log_device_placement=False)
    sess_cfg.gpu_options.allow_growth = True

    #### train the model ####
    with tf.Session(config=sess_cfg) as sess:
        # init
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        # restore vars from pretrained ckpt:
        if train_cfg.get('pretrian_ckpt_file', None) is not None:
            pretrain_ckpt = train_cfg['pretrian_ckpt_file']
            tf.logging.info('restore ckpt from: %s', pretrain_ckpt)
            restor_saver = tf.train.Saver(var_list=_var_to_restore(
                train_cfg.get('exclude_scopes', None)))
            restor_saver.restore(sess, pretrain_ckpt)
        # train
        for i in range(train_cfg['iters']):
            if i % save_ckpt_every == 0:
                all_summaries, loss_now = sess.run([merged_summaries, train_tensor])
                # write summaries
                summaries_writer.add_summary(all_summaries, i)
                # save ckpt
                ckpt_saver.save(sess, new_ckpt_path, global_step=i)
            else:
                loss_now = sess.run(train_tensor)
            if i % 20 == 0:
                tf.logging.info('global step: %d, loss= %f', i, loss_now)
        # Final run
        all_summaries, loss_now = sess.run([merged_summaries, train_tensor])
        # write summaries
        summaries_writer.add_summary(all_summaries, train_cfg['iters'])
        # save ckpt
        ckpt_saver.save(sess, new_ckpt_path, global_step=train_cfg['iters'])
        print("End of Train !!!")

    essays = []
    genderlabels = []
    students = []
    for student, gender in gender_dict.items():
        with open('%s/%s.txt' % (datadir, student)) as f:
            text = f.read()
            text = re.sub('<[^<]+?>', '', text)  # remove vestigial xml
            essays.append(text)
            genderlabels.append(gender)
            students.append(student)
    return essays, genderlabels, students


def load_conf_file():
    conf = set(line.strip() for line in open(conffile))
    return conf


def predict_gender(X, Y):
    scores = cross_val_score(GaussianNB(), X, Y, scoring='accuracy', cv=10)
    return scores.mean()


if __name__ == "__main__":
    gender_dict = load_balanced_gender_labels()
    essays, genderlabels, students = load_essays(gender_dict)
    conf = load_conf_file()
    features = fe.extract_features(essays, conf)
    print(predict_gender(features, genderlabels))

def _extract_features(images,
                      model_options,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False):
    """Extracts features by the particular model_variant.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

    Returns:
      concat_logits: A tensor of size [batch, feature_height, feature_width,
        feature_channels], where feature_height/feature_width are determined by
        the images height/width and output_stride.
      end_points: A dictionary from components of the network to the corresponding
        activation.
    """
    features, end_points = feature_extractor.extract_features(
        images,
        output_stride=model_options.output_stride,
        multi_grid=model_options.multi_grid,
        model_variant=model_options.model_variant,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm)

    if not model_options.aspp_with_batch_norm:
        return features, end_points
    else:
        batch_norm_params = {
            'is_training': is_training and fine_tune_batch_norm,
            'decay': 0.9997,
            'epsilon': 1e-5,
            'scale': True,
        }
        with slim.arg_scope(
                [slim.conv2d, slim.separable_conv2d],
                weights_regularizer=slim.l2_regularizer(weight_decay),
                activation_fn=tf.nn.relu,
                normalizer_fn=slim.batch_norm,
                padding='SAME',
                stride=1,
                reuse=reuse):
            with slim.arg_scope([slim.batch_norm], **batch_norm_params):
                depth = 512
                branch_logits = []
                if model_options.add_image_level_feature:
                    pool_height = scale_dimension(model_options.crop_size[0],
                                                  1. / model_options.output_stride)
                    pool_width = scale_dimension(model_options.crop_size[1],
                                                 1. / model_options.output_stride)
                    image_feature = slim.avg_pool2d(features,
                                                    [pool_height, pool_width],
                                                    [pool_height, pool_width],
                                                    padding='VALID')
                    image_feature = slim.conv2d(image_feature, depth, 1,
                                                scope=_IMAGE_POOLING_SCOPE)
                    image_feature = tf.image.resize_bilinear(
                        image_feature, [pool_height, pool_width], align_corners=True)
                    image_feature.set_shape([None, pool_height, pool_width, depth])
                    branch_logits.append(image_feature)

                # Employ a 1x1 convolution.
                branch_logits.append(
                    slim.conv2d(features, depth, 1, scope=_ASPP_SCOPE + str(0)))

                if model_options.atrous_rates:
                    # Employ 3x3 convolutions with different atrous rates.
                    for i, rate in enumerate(model_options.atrous_rates, 1):
                        scope = _ASPP_SCOPE + str(i)
                        if model_options.aspp_with_separable_conv:
                            aspp_features = _split_separable_conv2d(
                                features,
                                filters=depth,
                                rate=rate,
                                weight_decay=weight_decay,
                                scope=scope)
                        else:
                            aspp_features = slim.conv2d(
                                features, depth, [3, 1], rate=rate, scope=scope)
                        branch_logits.append(aspp_features)

                # Merge branch logits.
                concat_logits = tf.concat(branch_logits, 3)
                concat_logits = slim.conv2d(concat_logits, depth, 1,
                                            scope=_CONCAT_PROJECTION_SCOPE)
                concat_logits = slim.dropout(
                    concat_logits,
                    keep_prob=0.5,
                    is_training=is_training,
                    scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
                return concat_logits, end_points

def gen_axis_features(idx):
    axisname = vis.data.dtype.names[idx]
    col = vis.data[axisname]
    return extract_features(None, col, config, section)