def dist_sqrt_area_rand_triangle(data):
    """Histogram the square roots of the areas of randomly sampled triangles.

    Index triples are drawn with ``FeatureExtractor.generate_random_ints``
    over the vertices of ``data["poly_data"]``; their areas come from
    ``PSBDataset._get_cell_areas``.

    Args:
        data: Mapping whose ``"poly_data"`` entry exposes a ``points`` sequence.

    Returns:
        A one-entry dict keyed ``"hist_sqrt_area_rand_three_verts"`` holding
        the result of ``FeatureExtractor.make_bins``.
    """
    poly = data["poly_data"]
    sample_shape = (FeatureExtractor.number_vertices_sampled, 3)
    vertex_triples = FeatureExtractor.generate_random_ints(0, len(poly.points) - 1, sample_shape)
    root_areas = np.sqrt(PSBDataset._get_cell_areas(poly.points, vertex_triples))
    # Release the index table before binning.
    del vertex_triples
    histogram = FeatureExtractor.make_bins(root_areas, FeatureExtractor.number_bins)
    return {"hist_sqrt_area_rand_three_verts": histogram}
def preprocessing(self, pca=False, tsne=False, umap=False):
    """Featurise, standardise and optionally reduce the three data splits.

    ``self.x_all``, ``self.x_train`` and ``self.x_test`` are each passed, in
    that order, through ``FeatureExtractor.fit_transform``, then through
    ``StandardScaler.fit_transform``, and - when any flag is set - through
    ``self.apply_Dim_Reduction``. A copy of the featurised (unscaled)
    ``x_all`` is kept in ``self.x_all_trans_no_pca``.

    Args:
        pca: Forwarded as ``apply_pca``.
        tsne: Forwarded as ``apply_tSNE``.
        umap: Forwarded as ``apply_umap``.
    """
    split_names = ("x_all", "x_train", "x_test")

    # NOTE(review): every split is fitted independently (extractor, scaler
    # and reduction), so train and test are not transformed with the same
    # fitted parameters - confirm this is intended.
    extractor = FeatureExtractor()
    for split in split_names:
        setattr(self, split, extractor.fit_transform(getattr(self, split)))
    self.x_all_trans_no_pca = np.copy(self.x_all)

    standardiser = StandardScaler()
    for split in split_names:
        setattr(self, split, standardiser.fit_transform(getattr(self, split)))

    # Apply dimensionality reduction
    if pca or tsne or umap:
        for split in split_names:
            reduced = self.apply_Dim_Reduction(getattr(self, split), apply_pca=pca,
                                               apply_tSNE=tsne, apply_umap=umap)
            setattr(self, split, reduced)

    # Visualization of the data
    if not self.visualize:
        return
    self.visualize_inputs_(self.x_all_trans_no_pca)
    self.visualize_pca_inputs()
    self.visualize_tsne_inputs(self.x_all.shape[0])
def getClf():
    """Train an SVM on per-frame features read from the two frame files.

    Frames of ``frameOf1.frame`` are labelled 0 and frames of
    ``frameOf2.frame`` are labelled 1. Each record returned by
    ``FeatureExtractor.getFeature`` is treated as feature columns followed by
    the label in the last column. The shapes and the hold-out accuracy are
    printed.

    Returns:
        The fitted ``svm.SVC`` classifier.
    """
    records = []
    for class_label, frame_file in [(0, "frameOf1.frame"), (1, "frameOf2.frame")]:
        de = Deserialization(frame_file)
        # The first and last 100 frames of every recording are discarded.
        frames = de.frames[100:-100]
        featureExtractor = FeatureExtractor()
        for frame in frames:
            records.append(featureExtractor.getFeature(frame, class_label))

    data = np.array(records)
    features = data[:, :-1]
    labels = data[:, -1]
    # Parenthesised single-argument print behaves identically on Python 2 and 3.
    print(features.shape)
    print(labels.shape)

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    predicted_test = clf.predict(X_test)
    print(np.mean(predicted_test == y_test))
    return clf
def __init__(self, config):
    """Store the layer names from ``config`` and build the feature extractor.

    Args:
        config: Mapping providing ``"layer"``, ``"input_layer"``,
            ``"path_to_deploy_file"`` and ``"path_to_model_file"``.
    """
    self.layer = config["layer"]
    self.input_layer = config["input_layer"]
    deploy_file = config["path_to_deploy_file"]
    model_file = config["path_to_model_file"]
    self.feature_extractor = FeatureExtractor(deploy_file, model_file,
                                              input_layer_name=self.input_layer)
def __init__(self, config, image_paths):
    """Set up the feature extractor and an empty Annoy index.

    Args:
        config: Mapping providing ``"model_layer"``, ``"search_index_path"``
            and ``"path_to_model_file"``.
        image_paths: Stored unchanged on ``self.image_paths``.
    """
    self.layer = config["model_layer"]
    self.search_index_path = config["search_index_path"]
    model_file = config["path_to_model_file"]
    self.feature_extractor = FeatureExtractor(model_file, embedding_layer=self.layer)
    self.image_paths = image_paths
    # The index is created with a fixed size of 4096 and Euclidean metric.
    self.search_index = AnnoyIndex(4096, metric="euclidean")
def initialise_everything():
    """Interactively run normalisation and feature extraction when needed.

    Reads ``config.json`` for ``DATA_PATH_PSB``, ``DATA_PATH_NORMED`` and
    ``FEATURE_DATA_FILE``. The normalisation pipeline runs only if the
    normalised path is not an existing file, and feature extraction only if
    the feature file is missing. The function returns early when the user
    declines, when the dataset directory is empty, or when the user asks to
    go back to the menu.
    """
    print('''This procedure can take up to hours to finish.
The program will now run:
 - Normalisation pipeline over the shape database. (~3hrs)
 - Feature extraction over shape database. (~2hrs)\n
Are you sure you want to continue (y/n)?\n
''')
    choice = input(">> ")
    if choice in ("n", "no"):
        return

    with open('config.json') as f:
        data = json.load(f)
    path_psd = data["DATA_PATH_PSB"]
    path_normed = data["DATA_PATH_NORMED"]
    path_feature = data["FEATURE_DATA_FILE"]
    db = PSBDataset()

    if not os.listdir(path_psd):
        print("No valid dataset found.\nPoint to a valid dataset.")
        return

    prompt_for_class_files(path_psd)
    choice = input(
        "Do you wish to go back to the menu to change the current classification settings? (y/n)\n>> "
    )
    # The question asks whether to go BACK, so an affirmative answer must
    # abort here; the previous check returned on "n", contradicting the prompt.
    if choice in ("y", "yes"):
        return

    if not os.path.isfile(path_normed):
        print("No valid normalised dataset found.\nRunning normalisation.")
        norm = Normalizer(db)
        norm.run_full_pipeline()
    if not os.path.isfile(path_feature):
        print("No valid feature file found.\nRun feature extraction.")
        FE = FeatureExtractor(db)
        FE.run_full_pipeline()
def main():
    """Write one CSV row of fc7 features for every image under the dataset.

    The output file ``caltech101_vggnet_fc7_features.csv`` starts with a
    header (``filepath,feat1,...,feat4096``), followed by one row per image:
    the ``category/file_name`` path and the values returned by
    ``convnet.transform``.
    """
    dataset_path = "/path/to/Caltech-101"
    modelzoo_path = "/path/to/VGG16"

    # create an instance
    convnet = FeatureExtractor(
        prototxt_path=os.path.join(modelzoo_path, "vgg16_deploy.prototxt"),
        caffemodel_path=os.path.join(modelzoo_path, "vgg16.caffemodel"),
        target_layer_name="fc7",
        image_size=224,
        mean_values=[103.939, 116.779, 123.68])

    # header (range works on both Python 2 and 3; 4096 items is small)
    header = ["filepath"] + ["feat%d" % (i + 1) for i in range(4096)]

    # The context manager closes the file even if extraction raises midway.
    with open("caltech101_vggnet_fc7_features.csv", "w") as f:
        f.write(",".join(header) + "\n")

        # extract features
        categories = os.listdir(dataset_path)
        for category in pyprind.prog_bar(categories):
            for file_name in os.listdir(os.path.join(dataset_path, category)):
                img = cv2.imread(os.path.join(dataset_path, category, file_name))
                feat = convnet.transform(img)
                fields = [os.path.join(category, file_name)]
                fields.extend(str(value) for value in feat)
                f.write("%s\n" % ",".join(fields))
            # Push completed categories to disk so partial results survive.
            f.flush()
class Indexer(object):
    """Extract feature vectors for images and store them in LMDB.

    NOTE(review): ``index_batch`` reads ``self.image_paths`` and
    ``write_to_lmdb`` reads ``self.connection``; neither attribute is set in
    this class, so both must be provided elsewhere before those methods are
    called - confirm against callers.
    """

    def __init__(self, config):
        """Store the layer names from ``config`` and build the extractor.

        Args:
            config: Mapping providing ``"layer"``, ``"input_layer"``,
                ``"path_to_deploy_file"`` and ``"path_to_model_file"``.
        """
        self.layer = config["layer"]
        self.input_layer = config["input_layer"]
        self.feature_extractor = FeatureExtractor(
            config["path_to_deploy_file"],
            config["path_to_model_file"],
            input_layer_name=self.input_layer)

    def index_batch(self, batch_size, start_index=0, stop_index=None):
        """Extract and store features for ``self.image_paths`` in batches.

        Args:
            batch_size: Number of image paths per batch.
            start_index: Index of the first batch to process.
            stop_index: Index one past the last batch; ``None`` means all.
        """
        batches = [
            self.image_paths[x:x + batch_size]
            for x in range(0, len(self.image_paths), batch_size)
        ]
        # Compare with None so an explicit 0 is honoured instead of being
        # mistaken for "not given".
        if stop_index is None:
            stop_index = len(batches)
        for batch_num, batch in enumerate(batches[start_index:stop_index], 1):
            print("Indexing batch ", batch_num, len(batch))
            fv_dict = self.feature_extractor.extract_batch(batch, layer=self.layer)
            self.write_to_lmdb(fv_dict)

    def index(self, img):
        """Return the feature vector of ``img`` at ``self.layer``."""
        return self.feature_extractor.extract_from_img(img, layer=self.layer)

    def write_to_lmdb(self, fv_dict):
        """Write every ``key -> vector`` pair of ``fv_dict`` in one transaction."""
        env = self.connection
        with env.begin(write=True) as txn:
            for k in fv_dict:
                # tobytes() is the supported spelling of the deprecated
                # ndarray.tostring(); the bytes produced are the same.
                txn.put(k.encode('ascii'), fv_dict[k].tobytes())
def extract_features(json_file):
    """Extract the features of ``json_file`` and read its class id.

    The class id is the first character after the last underscore of
    ``json_file``, converted to ``int``.

    Returns:
        ``(feature_list, class_id)``.
    """
    tail = json_file.rsplit('_', 1)[-1]
    class_id = int(tail[0])
    feature_list = FeatureExtractor(json_file).extract_features()
    return feature_list, class_id
def __init__(self, data_type, mode, debug_limit):
    """Derive the log and feature CSV paths and initialise the base class.

    Both paths are built relative to the module-level ``base_dir`` from
    ``data_type``.
    """
    path_args = (base_dir, data_type)
    log_csv_path = '{0}/../data/{1}/log_{1}.csv'.format(*path_args)
    feature_path = '{0}/../data/feature/user_feature_{1}.csv'.format(*path_args)
    FeatureExtractor.__init__(self, mode, log_csv_path, feature_path, debug_limit)
def __init__(self, vocab, options):
    """Build the DyNet model, trainer, feature extractor and scoring MLPs.

    Args:
        vocab: Passed through to ``FeatureExtractor``.
        options: Object providing ``learning_rate``, ``activation``,
            ``costaugFlag``, ``labelsFlag``, ``lstm_output_size``,
            ``mlp_hidden_dims``, ``mlp_hidden2_dims`` and ``proj``.
    """
    # ``global`` must precede the import that binds the name: declaring it
    # afterwards is a SyntaxError on Python >= 3.6.
    global dy
    import dynet as dy
    from feature_extractor import FeatureExtractor

    self.model = dy.ParameterCollection()
    self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)

    self.activations = {
        'tanh': dy.tanh,
        'sigmoid': dy.logistic,
        'relu': dy.rectify,
        'tanh3': (lambda x: dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))
    }
    self.activation = self.activations[options.activation]

    self.costaugFlag = options.costaugFlag
    self.feature_extractor = FeatureExtractor(self.model, options, vocab)
    self.labelsFlag = options.labelsFlag

    mlp_in_dims = options.lstm_output_size * 2
    self.unlabeled_MLP = biMLP(self.model, mlp_in_dims, options.mlp_hidden_dims,
                               options.mlp_hidden2_dims, 1, self.activation)
    # The labelled scorer is only built when label prediction is enabled.
    if self.labelsFlag:
        self.labeled_MLP = biMLP(self.model, mlp_in_dims, options.mlp_hidden_dims,
                                 options.mlp_hidden2_dims, len(self.feature_extractor.irels),
                                 self.activation)
    self.proj = options.proj
def __init__(self):
    """Create the feature extractor and reset the tracking state."""
    self.extractor = FeatureExtractor()
    # Nothing has been processed yet, so there are no previous points.
    self.inited = False
    self.last_less_sharp_points = None
    self.last_less_flat_points = None
    # Start from the 4x4 identity matrix.
    self.last_position = np.eye(4)
def __init__(self, words, pos, rels, cpos, langs, w2i, ch, options):
    """Build the DyNet model, trainer, feature extractor and both MLPs.

    Args:
        words, rels, langs, w2i, ch: Passed through to ``FeatureExtractor``.
        pos, cpos: Accepted but not used in this constructor.
        options: Object providing ``learning_rate``, ``activation``,
            ``oracle``, ``headFlag``, ``rlMostFlag``, ``rlFlag``, ``k``,
            ``lstm_output_size``, ``mlp_hidden_dims`` and
            ``mlp_hidden2_dims``.
    """
    # ``global`` must precede the import that binds the name: declaring it
    # afterwards is a SyntaxError on Python >= 3.6.
    global dy
    # import here so we don't load Dynet if just running parser.py --help for example
    import dynet as dy

    self.model = dy.ParameterCollection()
    self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)

    self.activations = {
        'tanh': dy.tanh,
        'sigmoid': dy.logistic,
        'relu': dy.rectify,
        'tanh3': (lambda x: dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))
    }
    self.activation = self.activations[options.activation]

    self.oracle = options.oracle
    self.headFlag = options.headFlag
    self.rlMostFlag = options.rlMostFlag
    self.rlFlag = options.rlFlag
    self.k = options.k

    # dimensions depending on extended features
    self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)
    self.feature_extractor = FeatureExtractor(self.model, options, words, rels, langs, w2i, ch,
                                              self.nnvecs)
    self.irels = self.feature_extractor.irels

    mlp_in_dims = options.lstm_output_size * 2 * self.nnvecs * (self.k + 1)
    self.unlabeled_MLP = MLP(self.model, 'unlabeled', mlp_in_dims, options.mlp_hidden_dims,
                             options.mlp_hidden2_dims, 4, self.activation)
    self.labeled_MLP = MLP(self.model, 'labeled', mlp_in_dims, options.mlp_hidden_dims,
                           options.mlp_hidden2_dims, 2 * len(self.irels) + 2, self.activation)
def __init__(self, x, y):
    """Fit the feature extractor, then build, compile and train the model.

    The last-epoch validation precision and recall are stored on
    ``self.precision`` and ``self.recall``.

    Args:
        x: Raw inputs, stored as ``self.x_text`` and passed to
            ``FeatureExtractor.fit``.
        y: Targets, stored as ``self.Y``.
    """
    # Feature extraction.
    self.fe = FeatureExtractor()
    self.x_text = x
    self.X = self.fe.fit(x, y)
    self.Y = y

    # Keras model; its input shape is the feature shape without the
    # leading (sample) dimension.
    feature_shape = self.X.shape[1:]
    self.model = mlp_model(feature_shape)
    self.optimizer = keras.optimizers.Adam(lr=1e-3)

    # Stop early when the validation loss has not improved for two epochs.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    self.callbacks = [early_stopping]

    self.model.compile(optimizer=self.optimizer,
                       loss="binary_crossentropy",
                       metrics=[precision_m, recall_m])
    fit_options = dict(
        epochs=10,
        callbacks=self.callbacks,
        validation_split=0.1,
        verbose=0,  # Silent: no per-epoch console output.
        batch_size=32,
    )
    self.history = self.model.fit(self.X, self.Y, **fit_options)

    recorded = self.history.history
    self.precision = recorded['val_precision_m'][-1]
    self.recall = recorded['val_recall_m'][-1]
def __init__(self, mode, data_type, log_csv_path, feature_path, debug_limit):
    """Build the course database, then initialise the base extractor.

    All arguments are forwarded unchanged to ``FeatureExtractor.__init__``.
    """
    self.db = SimpleCourseDB()
    self.db.build()
    # Parenthesised single-argument print behaves identically on Python 2
    # and 3; the bare statement form is a SyntaxError on Python 3.
    print('finish build course DB!')
    FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
def predict_svm(self, example):
    '''
    :param example: str (example comment)
    :return: str, either 'CONSTRUCTIVE' or 'NON-CONSTRUCTIVE'

    Description: Wraps the comment in a one-row data frame, extracts its
    features and classifies it with self.svm_pipeline. A prediction of 0
    maps to 'NON-CONSTRUCTIVE'; any other value maps to 'CONSTRUCTIVE'.
    '''
    # One-row frame; the label column is a placeholder.
    example_df = pd.DataFrame.from_dict({
        'pp_comment_text': [example],
        'constructive': ['?']
    })
    print(example_df)

    extractor = FeatureExtractor(example_df)
    extractor.extract_features()
    feats_df = extractor.get_features_df()

    # Only the first (and only) row's prediction is of interest.
    prediction = self.svm_pipeline.predict(feats_df)[0]
    if prediction == 0:
        verdict = 'Non-constructive'
    else:
        verdict = 'Constructive'
    return verdict.upper()
def __init__(self, texts=None, n=16, step_size=1, k=100, kmeans_args=None):
    """Store the hyper-parameters and initialise the base extractor.

    Args:
        texts: Accepted but not used in this constructor.
        n: Stored as ``self.n``.
        step_size: Stored as ``self.step_size``.
        k: Stored as ``self.k``.
        kmeans_args: Stored as ``self.kmeans_args``.
    """
    self.n, self.step_size, self.k = n, step_size, k
    self.kmeans_args = kmeans_args
    # No k-means model exists until one is assigned later.
    self.kmeans = None
    FeatureExtractor.__init__(self)
def __init__(self, input_stream, output_stream, w=-1, h=-1, fps=-1, frames=-1,
             force_gray=False, repetitions=1, options=None, resume=False,
             reset_stream_when_resuming=False):
    """Configure the input stream and build the feature extractor.

    Args:
        input_stream: Source stream; receives ``set_options(w, h, fps,
            force_gray, frames)``.
        output_stream: Stored on ``self.output_stream``.
        w, h, fps, frames, force_gray: Forwarded to the input stream;
            ``w`` and ``h`` are also passed to ``FeatureExtractor``.
        repetitions: Stored on ``self.repetitions``.
        options: Mapping read for ``'rho'`` and ``'save_scores_only'``.
            NOTE(review): the default ``None`` fails on the first lookup,
            and the mapping is mutated (``options['stream']`` is set).
        resume: When true, ``self.load`` is called after construction.
        reset_stream_when_resuming: Forwarded to ``self.load``.
    """
    # Streams and repetition bookkeeping.
    self.input_stream = input_stream
    self.output_stream = output_stream
    self.repetitions = repetitions
    self.__completed_repetitions = 0

    # Timing state.
    self.__start_time = None
    self.__elapsed_time = None

    self.__rho = options['rho']
    self.steps = 0.0
    self.measured_fps = 0.0
    self.save_scores_only = options['save_scores_only']

    # The extractor receives the stream through the options mapping.
    options['stream'] = self.input_stream
    self.input_stream.set_options(w, h, fps, force_gray, frames)
    # here is the TensorFlow based feature extractor!
    self.fe = FeatureExtractor(w, h, options, resume)
    self.blink_steps = []

    if not resume:
        return
    out("RESUMING...")
    self.load(reset_stream_when_resuming)
def __init__(self, prototype_dict, output_folder, opt):
    """Store the prototypes and compute their feature vectors.

    Args:
        prototype_dict: Stored on ``self.prototypes``.
        output_folder: Stored on ``self.output_folder``.
        opt: Stored on ``self.opt``.
    """
    self.prototypes = prototype_dict
    self.opt = opt
    self.feature_extractor = FeatureExtractor(None)
    # Computed after the prototypes, options and extractor are in place.
    self.feature_vector_protoypes = self.calc_FV_protoypes()
    self.output_folder = output_folder
    # One list per metric.
    self.metrics = {metric: [] for metric in ("precision", "recall", "f1")}
def __init__(self, training_data_path, colour_space, num_orientations, pixels_per_cell,
             cells_per_block, hog_channel, spatial_size, hist_bins,
             toggle_spatial_features=True, toggle_histogram_features=True,
             toggle_hog_features=True):
    """Build the feature extractor, the classifier and empty tracking state.

    The HOG settings are kept on the instance; every feature-related
    argument is forwarded positionally to ``FeatureExtractor``. The
    classifier is created from ``training_data_path`` and that extractor.
    """
    self.orientations = num_orientations
    self.pixels_per_cell = pixels_per_cell
    self.cells_per_block = cells_per_block

    extractor_args = (
        colour_space, num_orientations, pixels_per_cell, cells_per_block,
        hog_channel, spatial_size, hist_bins,
        toggle_spatial_features, toggle_histogram_features, toggle_hog_features,
    )
    self.feature_extractor = FeatureExtractor(*extractor_args)
    self.classifier = CarClassifier(training_data_path, self.feature_extractor)
    self.fleet = VehicleFleet()

    # No frame has been processed yet.
    self.heatmap = None
    self.frames = 0
    self.labels = None
def dist_two_rand_verts(data):
    """Histogram the distances between randomly sampled vertex pairs.

    Index pairs are drawn with ``FeatureExtractor.generate_random_ints`` over
    the vertices of ``data["poly_data"]``.

    Returns:
        A one-entry dict keyed ``"hist_rand_dist_two_verts"`` holding the
        result of ``FeatureExtractor.make_bins``.
    """
    poly = data["poly_data"]
    sample_shape = (FeatureExtractor.number_vertices_sampled, 2)
    index_pairs = FeatureExtractor.generate_random_ints(0, len(poly.points) - 1, sample_shape)
    point_pairs = np.array([poly.points[pair] for pair in index_pairs])
    # Difference between the two points of every pair, one 3-vector per row.
    deltas = np.abs(np.diff(point_pairs, axis=1)).reshape(-1, 3)
    pair_distances = np.linalg.norm(deltas, axis=1)
    del index_pairs
    histogram = FeatureExtractor.make_bins(pair_distances, FeatureExtractor.number_bins)
    return {"hist_rand_dist_two_verts": histogram}
def test_stem_words(self):
    """Counts of words sharing a stem are merged under that stem."""
    f = FeatureExtractor()
    s = f.stem_words({
        'connect': 1,
        'connected': 1,
        'connecting': 1,
        'connection': 1
    })
    # assertEqual reports both dicts on failure; assertTrue(a == b) only
    # says "False is not true".
    self.assertEqual(s, {'connect': 4})
def dist_bar_vert(data):
    """Histogram the distances from ``mesh.center`` to random vertices.

    Vertex indices are drawn with ``FeatureExtractor.generate_random_ints``
    over the vertices of ``data["poly_data"]``.

    Returns:
        A one-entry dict keyed ``"hist_dist_bar_vert"`` holding the result
        of ``FeatureExtractor.make_bins``.
    """
    poly = data["poly_data"]
    centre = poly.center
    sample_shape = (FeatureExtractor.number_vertices_sampled, 1)
    picked = FeatureExtractor.generate_random_ints(0, len(poly.points) - 1, sample_shape)
    # Flatten the sampled vertices to one 3-vector per row before
    # subtracting the centre.
    sampled_points = poly.points[picked].reshape(-1, 3)
    offsets = np.abs(sampled_points - centre)
    centre_distances = np.linalg.norm(offsets, axis=1)
    del picked
    histogram = FeatureExtractor.make_bins(centre_distances, FeatureExtractor.number_bins)
    return {"hist_dist_bar_vert": histogram}
def __init__(self, mode, data_type, log_csv_path, feature_path, label_path, debug_limit):
    """Initialise the base extractor and load the labels from ``label_path``.

    Each line of the label file is split on ``','`` into two fields. Lines
    whose first field is not all digits are skipped; the remaining lines
    populate ``self.labels`` as ``int(first) -> int(second)``.
    """
    FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
    with open(label_path, 'r') as label_file:
        rows = (raw.strip().split(',') for raw in label_file)
        parsed = {
            int(eid): int(dropout)
            for eid, dropout in rows
            if str.isdigit(eid)
        }
    self.labels = parsed
def __init__(self, data_path, train_length=2500):
    """Load the train/test data and copy the extractor's size attributes.

    Args:
        data_path: Passed to ``FeatureExtractor``.
        train_length: Passed to ``get_train_test_data``.
    """
    extractor = FeatureExtractor(data_path)
    split = extractor.get_train_test_data(train_length)
    (self.trainX, self.trainY, self.testX, self.testY,
     self.eng_tokenizer, self.hindi_tokenizer) = split

    # Mirror the extractor's attributes on this instance.
    self.l = extractor.l
    self.eng_vocab_size = extractor.eng_vocab_size
    self.hindi_vocab_size = extractor.hindi_vocab_size
    self.eng_length = extractor.eng_length
    self.hindi_length = extractor.hindi_length
def predict_from_ngram(self, ngram):
    """Predict the class of an n-gram.

    The feature values of ``ngram`` are collected into a list and passed to
    ``predict_from_feat_val_list``.

    args:
        ngram (str): n-gram
    """
    extractor = FeatureExtractor(self.config_path)
    feature_values = list(extractor.iter_feature_values(ngram))
    return self.predict_from_feat_val_list(feature_values)
def main_training():
    """Train the two-dense-layer model on the training tweets.

    Training and development tweets are both turned into feature vectors
    from their known tokens and into one-hot targets. A checkpoint is saved
    after every epoch, with the development set used for validation.
    """
    scored_lexicon: dict = LexiconLoader().load_all_and_merge()

    tr_loader = LabeledTweetsLoader(TRAINING_INPUT_FILENAME)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())

    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)
    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    # 2D array of feature vectors, one row per training tweet
    tr_network_input = np.array([
        feature_extractor.compute_feature_vector(token_summarizer.get_known_tokens(tokens_and_label[0]))
        for tokens_and_label in tr_labeled_tweets
    ])
    tr_targets = [tokens_and_label[1] for tokens_and_label in tr_labeled_tweets]
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_targets)

    dev_loader = LabeledTweetsLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_loader.parse_tokens_and_labels(dev_loader.load_lines())

    # 2D array of feature vectors, one row per development tweet
    dev_network_input = np.array([
        feature_extractor.compute_feature_vector(token_summarizer.get_known_tokens(tokens_and_label[0]))
        for tokens_and_label in dev_labeled_tweets
    ])
    dev_targets = [tokens_and_label[1] for tokens_and_label in dev_labeled_tweets]
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_targets)

    # Every epoch is cheap (< 1ms), so we don't need the ability to continue
    # training from a previous model.
    print("Commencing new training run")
    model = ModelCreator(vu).create_two_dense_model(hidden_layer_size=HIDDEN_SIZE)

    # The placeholders are left unformatted here; they are passed as-is to
    # ModelCheckpoint.
    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1,
                                 save_best_only=False)
    validation = (dev_network_input, dev_targets_one_hot_encoded)
    model.fit(tr_network_input, tr_targets_one_hot_encoded, batch_size=32,
              epochs=MAX_EPOCHS, validation_data=validation, callbacks=[checkpoint])
def __init__(self, prefix='_p_', min_df=1, max_per=1.0, binarize=False, transform=None,
             replace_num='#', source=None, subdir=None, pseudotype=None, splits_file=None,
             stage='training', suffix='', lower=True, scale_factor=None):
    """Initialise the base extractor under the fixed name ``'pkl'``.

    Every argument is forwarded by keyword to ``FeatureExtractor.__init__``.
    ``transform='tfidf'`` is rejected by an assertion.
    """
    assert transform != 'tfidf'
    base_kwargs = dict(
        name='pkl', prefix=prefix, min_df=min_df, max_per=max_per, binarize=binarize,
        transform=transform, replace_num=replace_num, source=source, subdir=subdir,
        pseudotype=pseudotype, splits_file=splits_file, stage=stage, suffix=suffix,
        lower=lower, scale_factor=scale_factor,
    )
    FeatureExtractor.__init__(self, **base_kwargs)
def __init__(self):
    """Load the pickled classifier and prepare the helper objects and paths.

    All paths are resolved relative to the directory containing this file,
    except ``sal_path_``, which is based on ``SALineupWrapper.get_path()``.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    self.root_path_ = here
    self.model_path_ = os.path.join(here, 'test.pkl')
    self.clf_ = joblib.load(self.model_path_)

    self.fte_ = FeatureExtractor()
    self.sal_ = SALineupWrapper()
    # Kept verbatim, including the leading space.
    self.sal_opt_args_ = ' --productname=sc --script-malware=true --loglevel=all'
    self.sal_path_ = os.path.join(self.sal_.get_path(), 'salineup')
    self.behavior_path_ = os.path.join(here, 'behavior')
def __init__(self, movie_dict=None, act_set=None, slot_set=None, db=None, corpus=None,
             train=True, _reload=False, n_hid=100, batch=128, ment=0., inputtype='full',
             upd=10, sl='e2e', rl='e2e', pol_start=600, lr=0.005, N=1, tr=2.0, ts=0.5,
             max_req=2, frac=0.5, name=None):
    """Build the feature extractor and model, then reset training statistics.

    NOTE(review): although ``db``, ``movie_dict`` and ``name`` default to
    ``None``, this constructor reads ``db.path`` and ``movie_dict.lengths``,
    and concatenates ``name`` to the model path whenever ``train`` or
    ``_reload`` is true, so callers must supply them.
    """
    self.movie_dict = movie_dict
    self.act_set = act_set
    self.slot_set = slot_set
    self.database = db
    self.max_turn = dialog_config.MAX_TURN
    self.training = train

    self.feat_extractor = FeatureExtractor(corpus, self.database.path, N=N)

    # Model dimensions are derived from the informable slots and n-grams.
    inform_slots = dialog_config.inform_slots
    out_size = len(inform_slots) + 1
    in_size = len(self.feat_extractor.grams) + len(inform_slots)
    slot_sizes = [self.movie_dict.lengths[slot] for slot in inform_slots]
    self._init_model(in_size, out_size, slot_sizes, self.database,
                     n_hid=n_hid, learning_rate_sl=lr, batch_size=batch, ment=ment,
                     inputtype=inputtype, sl=sl, rl=rl)

    self._name = name
    if _reload:
        self.load_model(dialog_config.MODEL_PATH + self._name)
    if train:
        self.save_model(dialog_config.MODEL_PATH + self._name)
    self._init_experience_pool(batch)

    # Rolling statistics, each capped at a fixed number of entries.
    self.episode_count = 0
    self.recent_rewards = deque(maxlen=1000)
    self.recent_successes = deque(maxlen=1000)
    self.recent_turns = deque(maxlen=1000)
    self.recent_loss = deque(maxlen=10)

    self.discount = 0.99
    self.num_updates = 0
    self.pol_start = pol_start
    self.tr = tr
    self.ts = ts
    self.max_req = max_req
    self.frac = frac
    self.upd = upd
def __init__(self, vocab, options):
    """Build the DyNet model, trainer, feature extractor and both MLPs.

    Also defines the module-level transition constants ``LEFT_ARC``,
    ``RIGHT_ARC``, ``SHIFT`` and ``SWAP``.

    Args:
        vocab: Passed through to ``FeatureExtractor``.
        options: Object providing ``learning_rate``, ``activation``,
            ``oracle``, ``headFlag``, ``rlMostFlag``, ``rlFlag``, ``k``,
            ``use_recursive_composition``, ``no_bilstms``,
            ``lstm_output_size``, ``lstm_input_size``, ``mlp_hidden_dims``
            and ``mlp_hidden2_dims``.
    """
    # ``global`` must precede the import that binds ``dy``: declaring it
    # afterwards is a SyntaxError on Python >= 3.6.
    global dy
    global LEFT_ARC, RIGHT_ARC, SHIFT, SWAP

    # import here so we don't load Dynet if just running parser.py --help for example
    from multilayer_perceptron import MLP
    from feature_extractor import FeatureExtractor
    import dynet as dy

    LEFT_ARC, RIGHT_ARC, SHIFT, SWAP = 0, 1, 2, 3

    self.model = dy.ParameterCollection()
    self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)

    self.activations = {
        'tanh': dy.tanh,
        'sigmoid': dy.logistic,
        'relu': dy.rectify,
        'tanh3': (lambda x: dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))
    }
    self.activation = self.activations[options.activation]

    self.oracle = options.oracle
    self.headFlag = options.headFlag
    self.rlMostFlag = options.rlMostFlag
    self.rlFlag = options.rlFlag
    self.k = options.k
    self.recursive_composition = options.use_recursive_composition
    # ugly hack

    # dimensions depending on extended features
    self.nnvecs = ((1 if self.headFlag else 0)
                   + (2 if self.rlFlag or self.rlMostFlag else 0)
                   + (1 if self.recursive_composition else 0))
    self.feature_extractor = FeatureExtractor(self.model, options, vocab, self.nnvecs)
    self.irels = self.feature_extractor.irels

    # With BiLSTMs the MLP input uses lstm_output_size * 2 per vector;
    # otherwise it uses lstm_input_size.
    if options.no_bilstms > 0:
        mlp_in_dims = options.lstm_output_size * 2 * self.nnvecs * (self.k + 1)
    else:
        mlp_in_dims = options.lstm_input_size * self.nnvecs * (self.k + 1)

    self.unlabeled_MLP = MLP(self.model, 'unlabeled', mlp_in_dims, options.mlp_hidden_dims,
                             options.mlp_hidden2_dims, 4, self.activation)
    self.labeled_MLP = MLP(self.model, 'labeled', mlp_in_dims, options.mlp_hidden_dims,
                           options.mlp_hidden2_dims, 2 * len(self.irels) + 2, self.activation)
def launch(cfg_path):
    """Load the configuration at ``cfg_path`` and start a feature extractor.

    The base name of ``cfg_path`` is stored in the configuration under
    ``'config_file'`` before the extractor is created.
    """
    info_tag, debug_tag = '[INFO]', '[DEBUG]'

    print(info_tag, 'Starting ...')
    print(info_tag, 'Loading config')
    json_cfg = load_config(cfg_path)
    print(debug_tag, json_cfg)

    json_cfg['config_file'] = os.path.basename(cfg_path)
    FeatureExtractor(json_cfg).start()
def __init__(self, mode, data_type, log_csv_path, enrollment_path, label_path, module_path,
             feature_path, debug_limit):
    """Build the course database, then initialise the base extractor.

    The database receives every argument unchanged. The base extractor is
    always given the training log under ``base_dir`` instead of the
    ``log_csv_path`` argument.
    """
    self.db = SimpleCourseDB(mode, data_type, log_csv_path, enrollment_path, label_path,
                             module_path, feature_path, debug_limit)
    self.db.build()
    # Parenthesised single-argument print behaves identically on Python 2
    # and 3; the bare statement form is a SyntaxError on Python 3.
    print('finish build course DB!')
    # NOTE(review): this deliberately (?) overrides the caller's
    # log_csv_path for the base class - confirm.
    log_csv_path = base_dir + '/../../data/log_train.csv'
    FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
def __init__(self):
    """Create the feature extractor and empty heat-map tracking state."""
    self.feature_extractor = FeatureExtractor()

    # num of frames to aggregate heatmaps over
    self.frames = 10
    # collection of heatmaps over past 10 frames
    self.heatmaps = []
    # cumulative heat map over 10 frames, as a 720x1280 float64 array
    blank = np.zeros((720, 1280))
    self.cummulative_heatmap = blank.astype(np.float64)

    # count of cars detected in this frame
    self.cars_detected = 0
    self.contours_detected = []
def main():
    """Extract features for every listed image and save them as one array.

    The network named by ``modelname`` is loaded, each key from
    ``keys_path`` is read from ``data_path``, and the stacked outputs of
    ``convnet.transform`` are saved to ``dst_path``. The process exits with
    status -1 for an unknown model name.
    """
    caffe_alexnet_path = "/path/to/caffe-modelzoo/AlexNet"
    caffe_vgg16_path = "/path/to/caffe-modelzoo/VGG16"
    caffe_googlenet_path = "/path/to/caffe-modelzoo/GoogleNet"
    keys_path = "/path/to/dataset/keys.txt"
    data_path = "/path/to/dataset/images"
    dst_path = "/path/to/dataset/features.npy"
    modelname = "VGG16"

    # load pre-trained model
    if modelname == "AlexNet":
        mean_path = os.path.join(caffe_alexnet_path, "imagenet_mean.npy")
        if not os.path.exists(mean_path):
            convert_mean_file(caffe_alexnet_path)
        convnet = FeatureExtractor(
            prototxt_path=os.path.join(caffe_alexnet_path, "alexnet_deploy.prototxt"),
            caffemodel_path=os.path.join(caffe_alexnet_path, "alexnet.caffemodel"),
            target_layer_name="fc6",
            image_size=227,
            mean_path=mean_path
        )
    elif modelname == "VGG16":
        convnet = FeatureExtractor(
            prototxt_path=os.path.join(caffe_vgg16_path, "vgg16_deploy.prototxt"),
            caffemodel_path=os.path.join(caffe_vgg16_path, "vgg16.caffemodel"),
            target_layer_name="fc6",
            image_size=224,
            mean_values=[103.939, 116.779, 123.68]
        )
    elif modelname == "GoogleNet":
        # Bound to ``convnet`` like the other branches; the previous name
        # ``googlenet`` left ``convnet`` undefined for the loop below.
        convnet = FeatureExtractor(
            prototxt_path=os.path.join(caffe_googlenet_path, "googlenet_deploy.prototxt"),
            caffemodel_path=os.path.join(caffe_googlenet_path, "googlenet.caffemodel"),
            target_layer_name="pool5/7x7_s1",
            image_size=224,
            mean_values=[104.0, 117.0, 123.0]
        )
    else:
        print("Unknown model name: %s" % modelname)
        sys.exit(-1)

    # data list
    keys = load_keys(keys_path)

    # feature extraction
    feats = []
    for key in keys:
        img = cv2.imread(os.path.join(data_path, key))
        assert img is not None
        feats.append(convnet.transform(img))
    feats = np.asarray(feats)
    np.save(dst_path, feats)
    print("Done.")
def train_model(X_df, y_array, skf_is):
    """Fit the feature extractor on all data and a regressor on the train fold.

    Args:
        X_df: Inputs passed to the feature extractor.
        y_array: Targets, indexable by the fold indices.
        skf_is: Pair whose first element holds the training indices; the
            second element is ignored.

    Returns:
        ``(fe, reg)``: the fitted feature extractor and regressor.
    """
    fe = FeatureExtractor()
    fe.fit(X_df, y_array)
    X_array = fe.transform(X_df)

    # Regression on the training rows only.
    train_indices, _unused = skf_is
    X_train_array = np.array([X_array[idx] for idx in train_indices])
    y_train_array = np.array([y_array[idx] for idx in train_indices])

    reg = Regressor()
    reg.fit(X_train_array, y_train_array)
    return fe, reg
def makefeatures(self, sents_list, ppindexlist):
    """
    ARGS
        sents_list: [[s1word1,s1word2,...], [s2word1,s2word2,...],...]
        ppindexlist: one index per sentence, paired with sents_list by position
    RETURNS
        a list with the result of FeatureExtractor(...).features() for each pair
    """
    return [
        FeatureExtractor(sentence, pp_index, "succ").features()
        for sentence, pp_index in zip(sents_list, ppindexlist)
    ]
def generate_seti(filenames, for_test=False):
    """Convert every valid CSV row of the matched files into a seti.

    Args:
        filenames: Iterable of glob patterns.
        for_test: Forwarded to ``FeatureExtractor``.

    Returns:
        The list of setis, in file and row order.

    Raises:
        Exception: If no seti could be generated at all.
    """
    files = []
    for pattern in filenames:
        files.extend(glob.glob(pattern))
    # Parenthesised single-argument prints behave identically on Python 2
    # and 3; the statement form stops the module from importing on Python 3.
    print('logs_to_seti reading from files: %s' % (str(files)))

    setis = []
    # Read each file where each row represents a training example.
    for fname in files:
        num_lines = 0
        num_invalid_lines = 0
        num_bad_entry_lines = 0
        # NOTE(review): 'rb' is the Python 2 csv convention; on Python 3 the
        # csv module needs text mode with newline='' - adjust when porting.
        with open(fname, 'rb') as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(reader, None)
            for csv_line in reader:
                num_lines += 1
                bad_line, _reason = is_bad_line(csv_line)
                if bad_line:
                    num_invalid_lines += 1
                    continue
                renter_form, err = _to_renter_form(csv_line)
                if renter_form is None:
                    print(err)
                    num_bad_entry_lines += 1
                    continue
                fe = FeatureExtractor(for_test=for_test)
                setis.append(fe.to_seti(renter_form))

        # Finished handling file.
        print('File: %s' % fname)
        valid_lines = num_lines - num_invalid_lines - num_bad_entry_lines
        print('Num lines: %d. Valid: %d. Invalid: %d. Bady entry: %d'
              % (num_lines, valid_lines, num_invalid_lines, num_bad_entry_lines))

    if not setis:
        raise Exception('No setis generated!')
    return setis
def processDir(corpusName, mailCorpus, maildir):
    """Extract and store the features of every mail listed for ``maildir``.

    Args:
        corpusName: Name passed to ``MailStorage``.
        mailCorpus: Provides ``getFilesList`` and is forwarded to
            ``processMail``.
        maildir: Directory whose mails are processed.
    """
    mails = mailCorpus.getFilesList(maildir)
    storage = MailStorage(corpusName)
    extractor = FeatureExtractor()
    progress = ProgressDisplay(len(mails), 'Processing emails')

    # Output files are named 1 to numMails
    for position, mail in enumerate(mails, 1):
        processed = processMail(maildir, mail, mailCorpus)
        storage.store(extractor.process(processed), str(position))
        progress.update()
def extract_data(self, id, extraction_method, label_type):
    """Return the feature vector of ``id``, paired with its label if asked.

    Args:
        id: Identifier passed to the extractor and the label look-ups.
        extraction_method: Forwarded to ``extract_feature_vector``.
        label_type: ``'compiler'``, ``'optimization_level'`` or ``'test'``.

    Returns:
        ``feature_vector`` alone for ``'test'``; otherwise
        ``(label, feature_vector)``.

    Raises:
        SystemExit: With status 1 for any other ``label_type``.
    """
    extractor = FeatureExtractor()
    feature_vector = extractor.extract_feature_vector(id, extraction_method)

    if label_type == 'test':
        # for test data: there is no label to look up
        return feature_vector
    if label_type == 'compiler':
        # for compiler estimation
        label = self.extract_compiler_label(id)
    elif label_type == 'optimization_level':
        # for optimization level estimation
        label = self.extract_optimization_level_label(id)
    else:
        sys.stderr.write('Unknown label type specified')
        # Non-zero status: a bare sys.exit() would report success (0).
        sys.exit(1)
    return label, feature_vector
def update_database_from_file(self, file_name, asm_file_path, gdl_file_path, compiler=None,
                              optimization_level=None):
    """Parse one disassembled file and record its contents in the database.

    Args:
        file_name: Base name; ``_<compiler>`` and ``_<optimization_level>``
            are appended for whichever of the two is given.
        asm_file_path: File handed to the instruction, code-block and
            opcode parsers.
        gdl_file_path: File handed to the API parser.
        compiler: Optional compiler name, also stored in the
            compiler_information table.
        optimization_level: Optional level, also stored in the
            optimization_level_information table.
    """
    # Skip missing tags: concatenating the ``None`` defaults used to raise
    # TypeError, although both arguments are optional further down.
    for tag in (compiler, optimization_level):
        if tag is not None:
            file_name += '_' + tag

    parser = IDAFileParser()
    extractor = FeatureExtractor()
    db_constructor = DatabaseConstructor()

    # Update file_name table
    db_constructor.insert_file_name(file_name)

    # Update instruction_sequence table
    instruction_list = parser.extract_instruction(asm_file_path)
    db_constructor.insert_instruction_sequence(file_name, instruction_list)

    # Update instruction_code_block table
    code_block_list = parser.extract_code_block(asm_file_path)
    db_constructor.insert_code_block(file_name, code_block_list)

    # Update opcode_variety table
    opcode_list = parser.extract_opcode(asm_file_path)
    db_constructor.append_opcode_variety(opcode_list)

    # Update bigram_variety table
    bigram_list = extractor.extract_ngram_list(opcode_list, 2)
    db_constructor.append_bigram_variety(bigram_list)

    # Update trigram_variety table
    trigram_list = extractor.extract_ngram_list(opcode_list, 3)
    db_constructor.append_trigram_variety(trigram_list)

    # Update api table
    api_list = parser.extract_api(gdl_file_path)
    db_constructor.insert_api(file_name, api_list)

    # Update api_variety table
    db_constructor.append_api_variety(api_list)

    if compiler is not None:
        # Update compiler_information table
        db_constructor.insert_compiler_information(file_name, compiler)
    if optimization_level is not None:
        # Update optimization_level_information table
        db_constructor.insert_optimization_level_information(file_name, optimization_level)
def predict_model(self, model_file=None, output_file=None, output_probability_file=None):
    """
    Predict classes on self.data and output to output_file
    :param model_file: Model file to read model in from. Only used when
        self.classifier is not set
    :param output_file: File to save predictions in
    :param output_probability_file: File to save predicted probabilities in
    :return: predicted classes (array)
    :raises Exception: if there is no classifier and no model_file, or if
        self.data is None
    """
    if not self.classifier:
        if not model_file:
            raise Exception("No model to predict with.")
        # Pickle data is binary, so the file must be opened in 'rb' mode;
        # text mode fails on Python 3.
        # NOTE: unpickling runs arbitrary code - only load trusted files.
        with open(model_file, "rb") as f:
            self.classifier = pickle.load(f)
    if self.data is None:
        raise Exception("Trying to predict using model with no data loaded.")

    self.featureExtractor = FeatureExtractor(self.data)
    feature_matrix = self.featureExtractor.extract_full_feature_matrix()
    self.predictions = self.classifier.predict(feature_matrix)

    if output_file is not None:
        np.savetxt(output_file, self.predictions, delimiter=",", fmt="%d")
    if output_probability_file is not None:
        pred_probs = self.classifier.predict_proba(feature_matrix)
        np.savetxt(output_probability_file, pred_probs, delimiter=",", fmt="%.3f")
    return self.predictions
def train_model(self, model_out_file):
    """
    Extract the features from self.data and train the classifier. Output pickled model to model_out_file
    :param model_out_file: path the fitted classifier is pickled to
    :return: None
    :raises Exception: if self.data is None
    """
    if self.data is None:
        raise Exception("Trying to train model without any data.")

    sys.stderr.write("Extracting features from data.\n")
    self.featureExtractor = FeatureExtractor(self.data)
    feature_matrix = self.featureExtractor.extract_full_feature_matrix()
    # "Romantic" is class 0; every other label is class 1.
    labels = np.array([0 if lab == "Romantic" else 1 for lab in self.data["is_romantic"]])

    sys.stderr.write("Training classifier.\n")
    if self.classifier_type == "logit":
        self.classifier = LogisticRegression()
    else:
        self.classifier = DecisionTreeClassifier()
    self.classifier.fit(feature_matrix, labels)

    sys.stderr.write("Saving classifier.\n")
    # Pickle data is binary, so the file must be opened in 'wb' mode;
    # text mode fails on Python 3.
    with open(model_out_file, "wb") as f:
        pickle.dump(self.classifier, f)
class TestFeatureExtractor(unittest.TestCase):
    '''
    Unit tests for the FeatureExtractor class. Checks that the feature
    vector has the expected length and that its frequency data is
    plausible. More tests should be added.
    '''

    def setUp(self):
        '''Builds the feature vector that the tests inspect'''
        protein = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", IUPAC.protein)
        self.record1 = SeqRecord(protein, id="YP_025292.1", name="HokC",
                                 description="toxic membrane protein, small")
        self.seq1 = self.record1.seq
        self.feature_extractor = FeatureExtractor()
        self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)

    def test_feature_vector_length(self):
        '''Tests that the feature vector is 400 elements long'''
        vector_length = len(self.feature_vector1)
        self.assertEqual(vector_length, 400, msg="Feature vector not 400 long")

    def test_dipeptide_frequency_sum(self):
        '''Tests that the dipeptide frequencies sum to 1'''
        total = 0.0
        for position in range(400):
            total += self.feature_vector1[position]
        self.assertAlmostEqual(total, 1.0, places=5, msg="Frequencies don't sum to 1")
def __init__(self, movie_dict=None, act_set=None, slot_set=None, db=None, corpus=None,
             train=True, _reload=False, n_hid=100, batch=128, ment=0., inputtype='full',
             upd=10, sl='e2e', rl='e2e', pol_start=600, lr=0.005, N=1, tr=2.0, ts=0.5,
             max_req=2, frac=0.5, name=None):
    """Build the feature extractor and model, then reset training statistics.

    NOTE(review): although ``db``, ``movie_dict`` and ``name`` default to
    ``None``, this constructor reads ``db.path`` and ``movie_dict.lengths``,
    and concatenates ``name`` to the model path whenever ``train`` or
    ``_reload`` is true, so callers must supply them.
    """
    self.movie_dict = movie_dict
    self.act_set = act_set
    self.slot_set = slot_set
    self.database = db
    self.max_turn = dialog_config.MAX_TURN
    self.training = train

    self.feat_extractor = FeatureExtractor(corpus, self.database.path, N=N)

    # Model dimensions are derived from the informable slots and n-grams.
    inform_slots = dialog_config.inform_slots
    out_size = len(inform_slots) + 1
    in_size = len(self.feat_extractor.grams) + len(inform_slots)
    slot_sizes = [self.movie_dict.lengths[slot] for slot in inform_slots]
    self._init_model(in_size, out_size, slot_sizes, self.database,
                     n_hid=n_hid, learning_rate_sl=lr, batch_size=batch, ment=ment,
                     inputtype=inputtype, sl=sl, rl=rl)

    self._name = name
    if _reload:
        self.load_model(dialog_config.MODEL_PATH + self._name)
    if train:
        self.save_model(dialog_config.MODEL_PATH + self._name)
    self._init_experience_pool(batch)

    # Rolling statistics, each capped at a fixed number of entries.
    self.episode_count = 0
    self.recent_rewards = deque(maxlen=1000)
    self.recent_successes = deque(maxlen=1000)
    self.recent_turns = deque(maxlen=1000)
    self.recent_loss = deque(maxlen=10)

    self.discount = 0.99
    self.num_updates = 0
    self.pol_start = pol_start
    self.tr = tr
    self.ts = ts
    self.max_req = max_req
    self.frac = frac
    self.upd = upd
def setUp(self):
    '''Builds the sequence record and the feature vector used by every test'''
    protein = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", IUPAC.protein)
    self.record1 = SeqRecord(protein,
                             id="YP_025292.1",
                             name="HokC",
                             description="toxic membrane protein, small")
    self.seq1 = self.record1.seq
    # One extractor instance produces the vector that the tests inspect.
    self.feature_extractor = FeatureExtractor()
    self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)
def __init__(self):
    """Create the name tables, data wrappers, zone list and analysis helpers."""
    # Name lookups for actuators and sensors.
    self.actuNames = ActuatorNames()
    self.sensorNames = SensorNames()

    # Data back ends.
    self.bdm = BDWrapper()
    self.expLogColl = CollectionWrapper('experience_log')

    # Zones come from the first column of a CSV file.
    #self.zonelist = self.csv2list('metadata/partialzonelist.csv')
    zone_file = 'metadata/zonelist.csv'
    self.zonelist = self.csv2list(zone_file)

    # Feature extraction and clustering helpers.
    self.feater = FeatureExtractor()
    self.clust = Clusterer()
def __init__(self, numGestures, minDescriptorsPerFrame, numWords, descType, numPredictions, parent):
    """Store the test configuration and allocate the side-panel frame buffer.

    ``parent`` must expose ``imHeight`` and ``imWidth``; they size the
    thumbnails kept in ``prevFrameList`` (``numSideFrames`` frames, each
    1/``numSideFrames`` of the full image in both dimensions).
    """
    self.numGestures = numGestures
    self.numWords = numWords
    self.minDescriptorsPerFrame = minDescriptorsPerFrame
    self.parent = parent
    self.classifier = None
    self.windowName = "Testing preview"
    self.handWindowName = "Cropped hand"
    self.binaryWindowName = "Binary frames"
    self.predictionList = [-1] * numPredictions
    self.handTracker = HandTracker(kernelSize=7, thresholdAngle=0.4, defectDistFromHull=30, parent=self)
    self.featureExtractor = FeatureExtractor(type=descType, parent=self)
    self.numSideFrames = 10
    # Floor division keeps the shape integral under both Python 2 and
    # Python 3; with true division np.zeros rejects the float dimensions.
    thumbHeight = self.parent.imHeight // self.numSideFrames
    thumbWidth = self.parent.imWidth // self.numSideFrames
    self.prevFrameList = np.zeros((self.numSideFrames, thumbHeight, thumbWidth, 3), "uint8")
    self.numPrevFrames = 0
    self.predictionScoreThreshold = 0.2
    self.learningRate = 0.01
    self.numReinforce = 1
def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
    """Record the layer widths, the split percentages and the source file names.

    ``training_frac`` and ``validation_frac`` are stored as given; the
    input width is fixed at 400.
    """
    self.input_width = 400
    self.output_width = output_width
    self.training_frac = training_frac
    self.validation_frac = validation_frac
    self.debug = debug
    # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/" # Where the files live.
    # Names of all of the files.
    self.names = [
        "baseplate_3370",
        "collar_1385",
        "htj_2258_nofg",
        "major_tail_1512",
        "mcp_3589",
        "minor_capsid_1500_nofg",
        "minor_tail_2033",
        "portal_2141",
        "tail_fiber_3007",
        "tail_sheath_2350",
    ]
    self.feature_extractor = FeatureExtractor()
class RelationshipPostClassifier:
    """
    Main class for classification and prediction
    """

    def __init__(self, classifier_type="tree"):
        """
        :param classifier_type: "logit" selects LogisticRegression when
            training; any other value selects DecisionTreeClassifier.
        """
        self.data = None
        self.classifier_type = classifier_type
        self.classifier = None
        self.featureExtractor = None
        self.predictions = None

    def read_csv_data(self, csv_file, maxrows=None):
        """
        Read in data from given csv_file into self.data (pandas dataframe)
        maxrows limits number of read rows.
        :param csv_file: Path of the CSV file to read
        :param maxrows: Maximum number of rows to read; None or 0 reads all
        :return: None
        """
        sys.stderr.write("Reading in data from " + csv_file + "\n")
        # nrows=None means "no limit"; a falsy maxrows (None, 0) keeps that.
        self.data = pan.read_csv(csv_file, nrows=maxrows or None, encoding='utf-8')

    def train_model(self, model_out_file):
        """
        Extract the features from self.data and train the classifier.
        Output pickled model to model_out_file
        :param model_out_file: Path the pickled classifier is written to
        :return: None
        :raises RuntimeError: if no data has been loaded
        """
        if self.data is None:
            raise RuntimeError("Trying to train model without any data.")

        sys.stderr.write("Extracting features from data.\n")
        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()
        # "Romantic" is class 0, every other label is class 1.
        labels = np.array([0 if lab == "Romantic" else 1 for lab in self.data["is_romantic"]])

        sys.stderr.write("Training classifier.\n")
        if self.classifier_type == "logit":
            self.classifier = LogisticRegression()
        else:
            self.classifier = DecisionTreeClassifier()
        self.classifier.fit(feature_matrix, labels)

        sys.stderr.write("Saving classifier.\n")
        # Pickle streams are binary: text mode fails on Python 3 and can
        # corrupt the stream on platforms that translate newlines.
        with open(model_out_file, "wb") as f:
            pickle.dump(self.classifier, f)

    def predict_model(self, model_file=None, output_file=None, output_probability_file=None):
        """
        Predict classes on self.data and output to output_file
        :param model_file: Model file to read model in from.
            Otherwise looks for self.classifier
        :param output_file: File to save predictions in
        :param output_probability_file: File to save predicted probabilities in
        :return: predicted classes (array)
        :raises RuntimeError: if there is no model or no data to predict on
        """
        if self.classifier is None:
            if not model_file:
                raise RuntimeError("No model to predict with.")
            # NOTE(security): unpickling executes arbitrary code; only load
            # model files that come from a trusted source.
            with open(model_file, "rb") as f:
                self.classifier = pickle.load(f)

        if self.data is None:
            raise RuntimeError("Trying to predict using model with no data loaded.")

        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()
        self.predictions = self.classifier.predict(feature_matrix)

        if output_file is not None:
            np.savetxt(output_file, self.predictions, delimiter=",", fmt="%d")
        if output_probability_file is not None:
            pred_probs = self.classifier.predict_proba(feature_matrix)
            np.savetxt(output_probability_file, pred_probs, delimiter=",", fmt="%.3f")

        return self.predictions
def runOnSplit(penalties, constants, split):
    """Train and score one logistic-regression model per (penalty, C) pair.

    Files under '../data' are featurized with a FeatureExtractor built from
    ``split``.  For every combination of ``penalties`` and ``constants`` a
    model is fitted on the training part and evaluated on the dev part; the
    per-class error rates and the overall score are printed and written under
    'results/scores/', and the dev examples, ordered by the predicted
    probability of the first class, are written under
    'results/rankedExamples/'.

    NOTE(review): error rates are looked up by class *index*, so this assumes
    the dev labels are the integer positions of ``classNames`` - confirm
    against FeatureExtractor.featurizeFiles.
    """
    # This banner used to be a bare string expression and was never shown.
    print("Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split')

    fe = FeatureExtractor(split)
    featurized = fe.featurizeFiles('../data')
    classNames = featurized[0]
    trainMatrix, trainLabels = featurized[1:3]
    devMatrix, devLabels = featurized[3:5]
    _trainFiles, devFiles = featurized[5:]

    classCounts = Counter(devLabels)

    for penalty in penalties:
        for C in constants:
            # Same text the comma-separated Python 2 print statement produced.
            print("\nPenalty, regularization:  " + str(penalty) + " " + str(C))

            model = LogisticRegression().scikit(penalty, C)
            model_params = (penalty, C)
            model.fit(trainMatrix, trainLabels)

            score = model.score(devMatrix, devLabels)
            predicted_labels = model.predict(devMatrix)
            probs = model.predict_proba(devMatrix)

            # Misclassifications, counted per true label.
            errors = Counter()
            for j, pred in enumerate(predicted_labels):
                if pred != devLabels[j]:
                    errors[devLabels[j]] += 1

            rankedExamples = [
                (p, devFiles[i], predicted_labels[i] == devLabels[i])
                for i, p in enumerate(probs)
            ]

            report = []
            for i, c in enumerate(classNames):
                if classCounts[i]:
                    missRate = str(float(errors[i]) / classCounts[i])
                else:
                    # A class absent from the dev set has no defined error
                    # rate; report that instead of dividing by zero.
                    missRate = 'n/a'
                report.append('\t' + c + ' error: ' + missRate + '\n')
            results = ''.join(report) + '\tScore: ' + str(score)

            paramSuffix = ''.join('_' + str(param) for param in model_params)
            fileName = 'results/scores/LRsplit' + paramSuffix + '.txt'
            with open(fileName, 'w') as f:
                f.write(results)
            print(results)

            print('..ranking examples')
            if rankedExamples:
                examples = sorted(rankedExamples, key=lambda e: e[0][0])
                fileName = ('results/rankedExamples/LRsplit_' + str(split*100)
                            + paramSuffix + '.txt')
                with open(fileName, 'w') as f:
                    for e in examples:
                        entry = e[1]
                        entry += '\n\t Probability of class '
                        entry += classNames[0] + ': '
                        entry += str(e[0][0])
                        entry += '\n\t Correct: ' + str(e[2])
                        # Terminate each record; without the newline the
                        # next file name was glued onto "Correct: ...".
                        f.write(entry + '\n')
def __init__(self, data_type, mode, debug_limit):
    """Derive the log and feature CSV paths for ``data_type`` and initialise the base extractor.

    Both paths are built relative to the module-level ``base_dir``.
    """
    path_args = (base_dir, data_type)
    log_csv_path = '{0}/../data/{1}/log_{1}.csv'.format(*path_args)
    feature_path = '{0}/../data/feature/enrollment_feature_{1}.csv'.format(*path_args)
    FeatureExtractor.__init__(
        self,
        mode,
        log_csv_path,
        feature_path,
        debug_limit)
def __init__(self, sentance_length_range=None):
    """Remember the optional sentence-length range, then run the base initialiser."""
    # Stored before the base initialiser runs, as in the original ordering.
    self.sentance_length_range = sentance_length_range
    FeatureExtractor.__init__(self)
def __init__(self, mode, data_type, log_csv_path, module_path, feature_path, debug_limit):
    """Load the module database from ``module_path``, then initialise the base extractor."""
    # The module table is loaded first, before the base initialiser runs.
    self.module_db = load_modules(module_path)
    base_args = (mode, data_type, log_csv_path, feature_path, debug_limit)
    FeatureExtractor.__init__(self, *base_args)
def _get_features(self, v="", v_corpus=None, cls2id=None, domain="src"):
    """Extract one feature set per sentence of ``v_corpus`` for the verb ``v``.

    Returns ``(features, string_labels, integer_labels)``.  With an empty or
    missing corpus a single entry holding ``self.nullfeature`` is returned.
    Sentences for which extraction raises ValueError are logged at debug
    level and skipped; any other error prints the verb and is re-raised.
    ``cls2id`` must map ``v`` to its integer class id.
    """
    features = []
    int_labels = []
    str_labels = []
    label_id = cls2id[v]

    if not v_corpus:
        features.append(self.nullfeature)
        int_labels.append(label_id)
        str_labels.append(v)
        return features, str_labels, int_labels

    ngram_sizes = (("3gram", 3), ("5gram", 5), ("7gram", 7))
    for sentence in v_corpus:
        try:
            fe = FeatureExtractor(sentence, verb=v)
            enabled = self.featuretypes
            if "chunk" in enabled:
                fe.chunk()
            for type_name, size in ngram_sizes:
                if type_name in enabled:
                    fe.ngrams(n=size)
            if "dep" in enabled:
                fe.dependency()
            if "srl" in enabled:
                fe.srl()
            if "ne" in enabled:
                fe.ne()
            # "errorprob" and "topic" are recognised feature types with no
            # extraction step, so nothing is done for them.
            augmented = proc_easyadapt(fe.features, domain=domain)
            features.append(augmented)
            int_labels.append(label_id)
            str_labels.append(v)
        except ValueError:
            logging.debug(pformat("CaseMaker feature extraction: couldn't find the verb"))
        except:
            # Show which verb failed, then let the error propagate unchanged.
            print(v)
            raise
    return features, str_labels, int_labels
class MultiReader(DataLoader):
    """Load labelled protein sequences from one FASTA file per class.

    Each record is turned into a feature vector by a FeatureExtractor and
    tagged with a one-hot class marker whose position is the index of the
    source file in ``self.names``.
    """

    def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
        """Record the widths, the split percentages and the source file names.

        :param output_width: Number of cells in the one-hot class marker.
        :param training_frac: Percentage of examples used for training.
        :param validation_frac: Percentage of examples used for validation.
        :param debug: Print progress details and seed the shuffle.
        """
        self.input_width = 400
        self.output_width = output_width
        self.training_frac = training_frac
        self.validation_frac = validation_frac
        self.debug = debug
        # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/" # Where the files live.
        self.names = [  # Names of all of the files.
            "baseplate_3370",
            "collar_1385",
            "htj_2258_nofg",
            "major_tail_1512",
            "mcp_3589",
            "minor_capsid_1500_nofg",
            "minor_tail_2033",
            "portal_2141",
            "tail_fiber_3007",
            "tail_sheath_2350",
        ]
        self.feature_extractor = FeatureExtractor()

    def load_data(self, source):
        """Load the data from a directory with a collection of source files,
        one file for each kind of protein.

        Returns an array of pairs in the form:
        [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]

        :type source: String
        :param source: The directory where the source files are located.
            It is concatenated directly with each file name, so it must end
            with a path separator.
        """
        width = self.input_width + self.output_width
        raw_data = []
        unsupporteds = []
        for class_index, name in enumerate(self.names):
            path = source + name + ".faa"
            if self.debug:
                print(path)
            num_in_file = 0
            # The context manager closes the file even if parsing fails.
            # Mode "r" replaces "rU", which newer Python versions reject.
            with open(path, "r") as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    num_in_file += 1
                    try:
                        feature_vector = self.feature_extractor.extract_features(record)
                        num_features = len(feature_vector)
                        # Feature cells first, then a one-hot class marker:
                        # the class is the position of the file in "names".
                        prepared = numpy.zeros(num_features + self.output_width)
                        prepared[:num_features] = [
                            feature_vector[col] for col in range(num_features)
                        ]
                        prepared[num_features + class_index] = 1
                        raw_data.append(prepared)
                    except KeyError:
                        if self.debug:
                            print(" Unsupported sequence: " + record.id + " "
                                  + str(record.annotations))
                        unsupporteds.append(record)
            if self.debug:
                print("Total in file " + name + " = " + str(num_in_file))

        # All data has been read.  In debug mode, print summary information.
        if self.debug:
            print("Supported Sequences = " + str(len(raw_data)))
            print("Unsupported Sequences = " + str(len(unsupporteds)))
        num_examples = len(raw_data)

        # The labelled data is sorted by class; shuffle it, or training would
        # only ever see the first classes.
        if self.debug:
            print("Shuffling data to randomize for training")
        shuffle = self.rand_perm(num_examples)
        data = numpy.empty((num_examples, width), float)
        for row, raw_index in enumerate(shuffle):
            data[row, :] = raw_data[raw_index][:width]
        if self.debug:
            print("Finished shuffling data")
            print("Processing data to cull outliers")

        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print("Data shape =  %s  num_examples= %s" % (data.shape, num_examples))
        inputs = data[:, 0:self.input_width]
        outputs_full = data[:, self.input_width:width]
        if self.debug:
            print("Finished culling outliers")
            print(inputs.shape)
            print(outputs_full.shape)

        # Collapse the one-hot marker to a class index: the first cell above
        # 0.5 wins.  Rows without a marker get class 0 rather than whatever
        # happened to be in uninitialised memory.
        outputs = numpy.zeros((num_examples,), int)
        if self.output_width > 0 and num_examples > 0:
            outputs[:] = numpy.argmax(outputs_full > 0.5, axis=1)

        num_training_cases = self.num_training(num_examples)
        num_validation_cases = self.num_validation(num_examples)
        num_test_cases = self.num_test(num_examples)
        print("%s   %s   %s" % (num_training_cases, num_validation_cases, num_test_cases))

        train_end = num_training_cases
        valid_end = train_end + num_validation_cases
        training_set = (inputs[0:train_end, :], outputs[0:train_end])
        validation_set = (inputs[train_end:valid_end, :], outputs[train_end:valid_end])
        test_set = (inputs[valid_end:, :], outputs[valid_end:])

        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)

        if self.debug:
            print("TYPE of test_set_x = %s" % (type(test_set_x),))
            print("TYPE of test_set= %s  SIZE of test_set= %s"
                  % (type(test_set), len(test_set)))
            print("TYPE of test_set[0]= %s  SHAPE of test_set[0]= %s"
                  % (type(test_set[0]), test_set[0].shape))
            print("TYPE of test_set[1]= %s  SHAPE of test_set[1]= %s"
                  % (type(test_set[1]), test_set[1].shape))
            print("VALUE of training_set[0,0,0]= %s" % (training_set[0][0, 0],))
            print("VALUE of training_set[1,0]= %s  test_set[1,0]= %s"
                  % (training_set[1][0], test_set[1][0]))

        rval = [(training_set_x, training_set_y),
                (validation_set_x, validation_set_y),
                (test_set_x, test_set_y)]
        return rval

    # Everything from here down should be turned into a base class.

    def num_training(self, num_examples):
        """Return the number of training examples, truncated to an int.

        The count is used as a slice bound, so it must be integral.
        """
        return int(num_examples * self.training_frac / 100.0)

    def num_validation(self, num_examples):
        """Return the number of validation examples, truncated to an int."""
        return int(num_examples * self.validation_frac / 100.0)

    def num_test(self, num_examples):
        """Return the number of examples left over for testing."""
        return num_examples - (self.num_training(num_examples)
                               + self.num_validation(num_examples))

    def rand_perm(self, length):
        """Return a random permutation of ``range(length)`` as an int array.

        In debug mode the generator is seeded so the shuffle is repeatable.
        """
        if self.debug:
            seed(1)
        shuffle = numpy.arange(length)
        for n in range(0, length):
            # Fisher-Yates: draw only from the not-yet-fixed tail.  Drawing
            # from the whole range does not give a uniform permutation.
            swap_cell = randint(n, length - 1)
            shuffle[n], shuffle[swap_cell] = shuffle[swap_cell], shuffle[n]
        return shuffle

    def cull(self, data):
        """Return a float copy of ``data`` without the rows rejected by prune."""
        data = numpy.asarray(data, dtype=float)
        keep = numpy.array([not self.prune(row) for row in data], dtype=bool)
        new_data = numpy.array(data[keep], dtype=float)
        print("Number culled =  %s" % (len(data) - len(new_data),))
        return new_data

    def prune(self, example):
        """Tell whether ``example`` should be culled.

        A row is rejected when any of its first ``input_width`` cells lies
        outside [0, 1] or when those cells do not sum to 1 within 0.01.
        Rows containing NaN fail the sum test and are rejected as well.
        """
        features = numpy.asarray(example[:self.input_width], dtype=float)
        if ((features < 0.0) | (features > 1.0)).any():
            return True
        total = features.sum()
        return not (0.99 <= total <= 1.01)

    def preprocess(self, data):
        """Z-score the first ``input_width`` cells of every row, in place.

        Each row is normalised with its own mean and population standard
        deviation.  A constant row (zero deviation) becomes all zeros
        instead of dividing by zero.  Returns ``data``.
        """
        n = self.input_width
        for r in range(0, len(data)):
            row = data[r, :n]
            mu = row.mean()
            std = row.std()  # Population std
            if std == 0:
                data[r, :n] = 0.0
            else:
                data[r, :n] = (row - mu) / std
            if r % 1000 == 0:
                print("Preprocessed row  %s" % (r,))
        return data
class Tester(object):
    """Interactive gesture tester built on a bag-of-visual-words classifier.

    Frames are read from ``parent.vc``; the hand is located by a
    HandTracker, descriptors are computed by a FeatureExtractor and
    quantised against ``classifier.voc``.  Recently classified frames are
    shown in a side panel; clicking one nudges the classifier weights.
    """

    def __init__(self, numGestures, minDescriptorsPerFrame, numWords, descType, numPredictions, parent):
        """Store the configuration and allocate the side-panel frame buffer.

        ``parent`` must expose ``imHeight`` and ``imWidth`` (and ``vc`` for
        ``test_on_video``).
        """
        self.numGestures = numGestures
        self.numWords = numWords
        self.minDescriptorsPerFrame = minDescriptorsPerFrame
        self.parent = parent
        self.classifier = None
        self.windowName = "Testing preview"
        self.handWindowName = "Cropped hand"
        self.binaryWindowName = "Binary frames"
        self.predictionList = [-1] * numPredictions
        self.handTracker = HandTracker(kernelSize=7, thresholdAngle=0.4, defectDistFromHull=30, parent=self)
        self.featureExtractor = FeatureExtractor(type=descType, parent=self)
        self.numSideFrames = 10
        # Floor division keeps the shape integral on Python 2 and Python 3.
        thumbHeight = self.parent.imHeight // self.numSideFrames
        thumbWidth = self.parent.imWidth // self.numSideFrames
        self.prevFrameList = np.zeros((self.numSideFrames, thumbHeight, thumbWidth, 3), "uint8")
        self.numPrevFrames = 0
        self.predictionScoreThreshold = 0.2
        self.learningRate = 0.01
        self.numReinforce = 1

    def initialize(self, clf):
        """Attach the classifier and reset the per-thumbnail state buffers."""
        self.classifier = clf
        self.numWords = self.classifier.voc.shape[0]
        self.prevStates = np.zeros((self.numSideFrames, self.numWords), "float32")
        self.prevLabels = [0] * self.numSideFrames
        self.prevScores = [0] * self.numSideFrames

    def _encode_descriptors(self, des):
        """Return the L2-normalised visual-word histogram of ``des``."""
        words, _distance = vq(des, self.classifier.voc)
        testData = np.zeros(self.numWords, "float32")
        np.add.at(testData, words, 1)
        normTestData = np.linalg.norm(testData, ord=2) * np.ones(self.numWords)
        return np.divide(testData, normTestData)

    def _has_clear_margin(self, score):
        """Tell whether the best score beats the runner-up by the threshold."""
        sortedScores = np.sort(score)
        return sortedScores[-1] - sortedScores[-2] >= self.predictionScoreThreshold

    def test_on_video(self):
        """Run colour calibration, then classify live frames until Esc.

        Space ends the calibration preview; Esc during calibration exits the
        process.  The method returns when the video source stops delivering
        frames.
        """
        vc = self.parent.vc
        imhsv = None
        # Phase 1: show the colour sampling windows until space is pressed.
        while vc.isOpened():
            ret, im = vc.read()
            if not ret:
                # No frame to calibrate on; cv2.flip would fail on None.
                return
            im = cv2.flip(im, 1)
            imhsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            self.handTracker.colorProfiler.draw_color_windows(im, imhsv)
            cv2.imshow(self.windowName, im)
            k = cv2.waitKey(1)
            if k == 32:  # space
                break
            elif k == 27:  # Esc
                sys.exit(0)
        if imhsv is None:
            return

        self.handTracker.colorProfiler.run()
        binaryIm = self.handTracker.get_binary_image(imhsv)
        cnt, hull, centroid, defects = self.handTracker.initialize_contour(binaryIm)
        cv2.namedWindow(self.binaryWindowName)
        cv2.namedWindow(self.handWindowName)
        cv2.namedWindow(self.windowName)
        cv2.setMouseCallback(self.windowName, self.reinforce)

        # Phase 2: track the hand and classify every frame.
        while vc.isOpened():
            ret, im = vc.read()
            if not ret:
                break
            im = cv2.flip(im, 1)
            imhsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            binaryIm = self.handTracker.get_binary_image(imhsv)
            cnt, hull, centroid, defects = self.handTracker.get_contour(binaryIm)
            imCopy = im.copy()
            testData = None
            prediction = -1
            score = -1
            update = False
            if cnt is not None:
                cropImage, cropPoints = self.handTracker.get_cropped_image_from_cnt(im, cnt, 0.05)
                cropImageGray = self.handTracker.get_cropped_image_from_points(imgray, cropPoints)
                kp = self.featureExtractor.get_keypoints(cropImageGray)
                cropCnt = self.handTracker.get_cropped_contour(cnt, cropPoints)
                kp = self.featureExtractor.get_keypoints_in_contour(kp, cropCnt)
                kp, des = self.featureExtractor.compute_descriptors(cropImageGray, kp)
                if des is not None:
                    self.featureExtractor.draw_keypoints(cropImage, kp)
                if (des is not None
                        and des.shape[0] >= self.minDescriptorsPerFrame
                        and self.is_hand(defects)):
                    testData = self._encode_descriptors(des)
                    prediction, score = self.predict(testData)
                    if self._has_clear_margin(score):
                        self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(0, 255, 0))
                    else:
                        # Ambiguous scores: keep the frame, drop the label.
                        self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(255, 0, 0))
                        prediction = -1
                    update = True
                else:
                    self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(0, 0, 255))
                    prediction = -1
                cv2.imshow(self.handWindowName, cropImage)

            writtenVal = str(prediction) if prediction > 0 else '-'
            self.write_on_image(imCopy, writtenVal)
            cv2.imshow(self.binaryWindowName, binaryIm)
            imCopy = self.add_prev_frames_to_image(imCopy, testData, prediction, score, update)
            cv2.imshow(self.windowName, imCopy)
            k = cv2.waitKey(1)
            if k == 27:  # Esc
                break

    def test_on_descriptors(self, desList):
        """Return one label per descriptor set; -1 marks "no confident prediction"."""
        testLabels = []
        for des in desList:
            prediction = -1
            if des is not None and des.shape[0] >= self.minDescriptorsPerFrame:
                testData = self._encode_descriptors(des)
                prediction, score = self.predict(testData)
                if not self._has_clear_margin(score):
                    prediction = -1
            testLabels.append(prediction)
        return testLabels

    def predict(self, testData):
        """Return ``(label, decision scores)`` for a single histogram."""
        sample = testData.reshape(1, -1)
        prediction = self.classifier.predict(sample)
        score = self.classifier.decision_function(sample)
        return prediction[0], score[0]

    def insert_to_prediction_list(self, prediction):
        """Append ``prediction`` and drop the oldest entry (fixed-length window)."""
        self.predictionList.append(prediction)
        self.predictionList = self.predictionList[1:]

    def most_common(self, lst):
        """Return ``(value, count)`` of the most frequent entry of ``lst``.

        Entries that differ from both neighbours are first overwritten with
        -1; ``lst`` is modified in place.
        """
        for i in range(1, len(lst) - 1):
            if lst[i] != lst[i - 1] and lst[i] != lst[i + 1]:
                lst[i] = -1
        e = max(set(lst), key=lst.count)
        return e, lst.count(e)

    def is_hand(self, defects):
        """Accept contours with at most five convexity defects."""
        return defects.shape[0] <= 5

    def write_on_image(self, image, text):
        """Draw ``text`` in red near the top-left corner of ``image``."""
        # putText needs integer pixel coordinates, hence floor division.
        origin = (self.parent.imWidth // 20, self.parent.imHeight // 4)
        cv2.putText(image, text, origin, cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 0, 255), 5)

    def get_prev_frames_image(self):
        """Stack the stored thumbnails vertically into one image."""
        image = self.prevFrameList[0]
        for i in range(1, len(self.prevFrameList)):
            image = np.append(image, self.prevFrameList[i], axis=0)
        return image

    def apply_binary_mask(self, image, mask, kernelSize):
        """Return ``image`` masked by ``mask`` dilated with a square kernel."""
        kernel = np.ones((kernelSize, kernelSize), np.uint8)
        dilatedMask = cv2.dilate(mask, kernel, iterations=1)
        maskedImage = cv2.bitwise_and(image, image, mask=dilatedMask)
        return maskedImage

    def add_prev_frames_to_image(self, image, testData, testLabel, testScore, update=False):
        """Append the thumbnail column to ``image`` and optionally record this frame.

        The returned image shows the thumbnails as they were before this
        frame was recorded.  With ``update`` the frame, its histogram, label
        and scores fill the next free slot or, once all slots are used,
        replace the oldest one.
        """
        scale = float(1) / self.numSideFrames
        shrinkIm = cv2.resize(image, None, fx=scale, fy=scale)
        prevFramesIm = self.get_prev_frames_image()
        image = np.append(image, prevFramesIm, axis=1)
        if update:
            if self.numPrevFrames < self.numSideFrames:
                slot = self.numPrevFrames
                self.prevFrameList[slot] = shrinkIm
                self.prevStates[slot] = testData
                self.prevLabels[slot] = testLabel
                self.prevScores[slot] = testScore
                self.numPrevFrames += 1
            else:
                self.prevFrameList = np.append(self.prevFrameList, np.array([shrinkIm]), axis=0)[1:]
                self.prevStates = np.append(self.prevStates, np.array([testData]), axis=0)[1:]
                self.prevLabels = self.prevLabels[1:] + [testLabel]
                self.prevScores = self.prevScores[1:] + [testScore]
        return image

    def reinforce(self, event, x, y, flags, param):
        """Mouse callback: left click marks a thumbnail wrong, right click marks it right.

        Only clicks to the right of the live image (on the thumbnail column)
        are handled.  The clicked thumbnail is recoloured as visual feedback
        and, for a LinearSVC classifier, the weights are updated.
        """
        if event == cv2.EVENT_LBUTTONDOWN:
            colorCode, isCorrect = cv2.COLOR_BGR2HSV, False
        elif event == cv2.EVENT_RBUTTONDOWN:
            colorCode, isCorrect = cv2.COLOR_BGR2YCR_CB, True
        else:
            return
        if x <= self.parent.imWidth:
            return
        prevFrameID = int((y * self.numSideFrames) // self.parent.imHeight)
        # Guard against a click on the very last pixel row or below it.
        prevFrameID = min(prevFrameID, self.numSideFrames - 1)
        self.prevFrameList[prevFrameID] = cv2.cvtColor(self.prevFrameList[prevFrameID], colorCode)
        if isinstance(self.classifier, svm.LinearSVC):
            self.perceptron_update(prevFrameID, isCorrect)

    def perceptron_update(self, prevFrameID, flag):
        """Nudge the classifier weights using the stored frame ``prevFrameID``.

        ``flag`` True reinforces the stored label.  ``flag`` False penalises
        it; when the frame had no positive label, a digit key press names
        the correct 1-based label, which is reinforced instead.
        """
        weights = self.classifier.coef_
        data = self.prevStates[prevFrameID]
        label = self.prevLabels[prevFrameID]
        # The stored scores are deliberately not read: they are unused and
        # are a plain 0 for slots that were never filled.
        step = (self.learningRate / self.numReinforce) * data
        if flag:
            if label > 0:
                weights[label - 1] = np.add(weights[label - 1], step)
        elif label > 0:
            weights[label - 1] = np.subtract(weights[label - 1], step)
        else:
            k = cv2.waitKey(-1)
            rightLabel = k - 48  # ASCII digit -> label number
            if 0 < rightLabel <= weights.shape[0]:
                weights[rightLabel - 1] = np.add(weights[rightLabel - 1], step)
        self.classifier.coef_ = weights
def __init__(self, mode, data_type, log_csv_path, feature_path, debug_limit):
    """Forward every argument, unchanged and in order, to FeatureExtractor.__init__."""
    base_args = (mode, data_type, log_csv_path, feature_path, debug_limit)
    FeatureExtractor.__init__(self, *base_args)
def __init__(self, k, similarity_function):
    """Store ``k`` and the similarity function, then run the base initialiser.

    The vocabulary starts out as None.
    """
    self.vocabulary = None
    self.k = k
    self.sim_function = similarity_function
    FeatureExtractor.__init__(self)
    # NOTE(review): bare attribute access with no assignment.  It only
    # raises AttributeError if the base initialiser did not create
    # ``featuresVec``; confirm whether an assignment was intended.
    self.featuresVec
class Analyzer:
    """Fetch per-zone point time series, normalize them and cluster zones.

    The class wraps a ``BDWrapper`` data source (``self.bdm``), a
    ``FeatureExtractor`` (``self.feater``) and a ``Clusterer``
    (``self.clust``). Time series are put on a regular grid whose step is
    ``timeGran``.
    """

    # Point types whose (min, max) range is fixed rather than measured.
    _FIXED_RANGES = {
        'Occupied Command': (1, 3),
        'Cooling Command': (0, 100),
        'Heating Command': (0, 100),
        'Temp Occ Sts': (0, 1),
        'Reheat Valve Command': (0, 100),
        'Damper Position': (0, 100),
    }
    # Point types whose range is the min/max observed over the short window.
    _SHORT_WINDOW_POINTS = (
        'Occupied Clg Min', 'Occupied Htg Flow', 'Cooling Max Flow')
    # Point types whose range is 0 .. first sample of the short window.
    _FLOW_POINTS = ('Actual Supply Flow', 'Actual Sup Flow SP')

    bdm = None
    expLogColl = None
    # Step of the regular time grid used by every normalize_* method.
    timeGran = timedelta(minutes=2)
    actuNames = None
    sensorNames = None
    zonelist = None
    feater = None
    clust = None

    def __init__(self):
        """Create the data-source wrappers and load the zone list from CSV."""
        self.actuNames = ActuatorNames()
        self.sensorNames = SensorNames()
        self.bdm = BDWrapper()
        self.expLogColl = CollectionWrapper('experience_log')
        self.zonelist = self.csv2list('metadata/zonelist.csv')
        self.feater = FeatureExtractor()
        self.clust = Clusterer()

    def csv2list(self, filename):
        """Return the first column of the comma-separated file ``filename``.

        Blank rows are skipped instead of raising ``IndexError``.
        """
        with open(filename, 'r') as fp:
            return [row[0] for row in csv.reader(fp, delimiter=',') if row]

    def get_actuator_uuid(self, zone=None, actuType=None):
        """Return the single uuid matching ``zone`` and/or ``actuType``.

        Raises:
            QRError: if zero or more than one uuid matches the context.
        """
        context = dict()
        if zone is not None:
            context['room'] = zone
        if actuType is not None:
            context['template'] = actuType
        uuids = self.bdm.get_sensor_uuids(context)
        if len(uuids) > 1:
            raise QRError('Many uuids are found', context)
        if len(uuids) == 0:
            raise QRError('No uuid is found', context)
        return uuids[0]

    @staticmethod
    def _bracket(rawData, tp):
        """Return the samples bracketing ``tp`` as ``(left, right)``.

        Each element is an ``(index, value)`` pair or ``None``. ``left`` is
        the last sample at or before ``tp``; ``right`` is the first sample at
        or after ``tp`` (label slicing is inclusive on both sides).
        """
        leftSeries = rawData[:tp]
        rightSeries = rawData[tp:]
        left = None
        right = None
        if len(leftSeries) > 0:
            left = (leftSeries.index[-1], leftSeries.iloc[-1])
        if len(rightSeries) > 0:
            right = (rightSeries.index[0], rightSeries.iloc[0])
        return left, right

    def normalize_data_avg(self, rawData, beginTime, endTime):
        """Resample ``rawData`` onto the ``timeGran`` grid by interpolation.

        The first output point is ``beginTime`` with the first raw value.
        Every later grid point takes the value of an exactly matching sample
        if one exists, the linear interpolation of its two neighbours if it
        lies between samples, or the nearest available neighbour otherwise.

        NOTE: as in the original implementation the grid runs one step past
        ``endTime`` (the loop tests ``tp <= endTime`` before advancing).

        Returns:
            pandas.Series indexed by grid time.
        """
        times = [beginTime]
        values = [float(rawData.iloc[0])]
        tp = beginTime
        while tp <= endTime:
            tp = tp + self.timeGran
            left, right = self._bracket(rawData, tp)
            if left is None and right is None:
                print("ERROR: no data found in raw data")
                newVal = None
            elif right is None:
                newVal = left[1]
            elif left is None:
                newVal = right[1]
            elif tp == left[0]:
                newVal = left[1]
            elif tp == right[0]:
                newVal = right[1]
            else:
                # Weight each neighbour by its distance to the *other* side.
                leftDist = (tp - left[0]).total_seconds()
                rightDist = (right[0] - tp).total_seconds()
                newVal = ((left[1] * rightDist + right[1] * leftDist)
                          / (rightDist + leftDist))
            times.append(tp)
            values.append(newVal)
        # Build the Series once; appending point by point is quadratic and
        # Series.append no longer exists in current pandas.
        return pd.Series(values, index=times)

    def normalize_data_nextval_deprecated(self, rawData, beginTime, endTime):
        """Resample ``rawData`` onto the ``timeGran`` grid using the next value.

        Each grid point takes the first sample at or after it, falling back
        to the last sample before it when no later sample exists. The grid
        is the same as in ``normalize_data_avg`` (one step past ``endTime``).
        """
        times = [beginTime]
        values = [float(rawData.iloc[0])]
        tp = beginTime
        while tp <= endTime:
            tp = tp + self.timeGran
            left, right = self._bracket(rawData, tp)
            if right is not None:
                newVal = right[1]
            elif left is not None:
                newVal = left[1]
            else:
                newVal = None
            times.append(tp)
            values.append(newVal)
        return pd.Series(values, index=times)

    def normalize_data(self, rawData, beginTime, endTime, normType):
        """Put ``rawData`` on a regular ``timeGran`` grid over the window.

        The series is cut to ``[beginTime, endTime]``; if either bound has no
        sample, the nearest in-window value is copied onto it so the grid
        spans the whole window.

        Args:
            normType: ``'avg'`` averages the samples of each grid bin (bins
                without samples are NaN); ``'nextval'`` does the same and
                then forward-fills the empty bins. Any other value yields
                ``None``.

        Raises:
            IndexError: if the window contains no samples.
        """
        # Work on a copy so the boundary samples are never written into the
        # caller's series.
        rawData = rawData[beginTime:endTime].copy()
        if beginTime not in rawData.index:
            rawData[beginTime] = rawData.iloc[0]
            rawData = rawData.sort_index()
        if endTime not in rawData.index:
            rawData[endTime] = rawData.iloc[-1]
            rawData = rawData.sort_index()
        if normType not in ('nextval', 'avg'):
            return None
        # groupby + Grouper behaves the same on old and new pandas, unlike
        # resample(how=..., fill_method=...) whose keywords were removed.
        procData = rawData.groupby(pd.Grouper(freq=self.timeGran)).mean()
        if normType == 'nextval':
            procData = procData.ffill()
        return procData

    def receive_a_sensor(self, zone, actuType, beginTime, endTime, normType):
        """Fetch one point of ``zone`` and return it normalized.

        ``-1`` samples are patched with ``remove_negativeone`` for every
        point type except the damper command.
        """
        print("%s %s" % (zone, actuType))
        uuid = self.get_actuator_uuid(zone, actuType)
        rawData = self.bdm.get_sensor_ts(uuid, 'PresentValue', beginTime, endTime)
        if actuType != self.actuNames.damperCommand:
            rawData = self.remove_negativeone(rawData)
        return self.normalize_data(rawData, beginTime, endTime, normType)

    def receive_entire_sensors_notstore(self, beginTime, endTime, normType,
                                        exceptZoneList=None):
        """Return ``{zone: {pointType: series}}`` for all non-excluded zones."""
        # TODO: Should be parallelized here
        excluded = exceptZoneList if exceptZoneList is not None else []
        dataDict = dict()
        for zone in self.zonelist:
            if zone not in excluded:
                dataDict[zone] = self.receive_zone_sensors(
                    zone, beginTime, endTime, normType)
        return dataDict

    def receive_entire_sensors(self, beginTime, endTime, filename, normType,
                               exceptZoneList=None):
        """Fetch every zone's data and pickle the resulting dict to ``filename``."""
        dataDict = self.receive_entire_sensors_notstore(
            beginTime, endTime, normType, exceptZoneList=exceptZoneList)
        with open(filename, 'wb') as fp:
            pickle.dump(dataDict, fp)

    def clustering(self, inputData, dataDict):
        """Build the per-zone feature lists and run k-means on them.

        Each zone's list holds its FFT, min/max and DTW features, in that
        order.
        """
        fftFeat = self.feater.get_fft_features(inputData, dataDict)
        minmaxFeat = self.feater.get_minmax_features(dataDict)
        dtwFeat = self.feater.get_dtw_features(inputData, dataDict)
        # Frequency features are not part of the feature list yet; the call
        # is kept in case the extractor relies on it having been made.
        self.feater.get_freq_features(inputData, dataDict)
        featDict = dict()
        for zone in self.zonelist:
            featDict[zone] = [fftFeat[zone], minmaxFeat[zone], dtwFeat[zone]]
        # Debug output for one reference zone; guarded so zone lists that do
        # not contain it no longer fail with KeyError.
        if 'RM-4132' in featDict:
            print(featDict['RM-4132'])
        return self.clust.cluster_kmeans(featDict)

    def remove_negativeone(self, data):
        """Replace every ``-1`` in ``data`` with the last preceding valid value.

        Runs of consecutive ``-1`` all receive the value that precedes the
        run. Leading ``-1`` samples have no predecessor and are left as is.
        The series is modified in place and also returned.
        """
        values = data.values
        invalid = values == -1
        if not invalid.any():
            return data
        positions = np.arange(len(values))
        # For each position, the position of the latest valid sample so far
        # (-1 while no valid sample has been seen yet).
        source = np.maximum.accumulate(np.where(invalid, -1, positions))
        fixable = invalid & (source >= 0)
        if fixable.any():
            data.iloc[np.flatnonzero(fixable)] = values[source[fixable]]
        return data

    def receive_zone_sensors(self, zone, beginTime, endTime, normType):
        """Return ``{pointType: normalized series}`` for one zone.

        Point types for which the zone has no unique uuid are skipped.
        """
        zoneDict = dict()
        for actuType in self.actuNames.nameList + self.sensorNames.nameList:
            try:
                # Existence check only; receive_a_sensor resolves the uuid
                # again itself.
                self.get_actuator_uuid(zone, actuType)
            except QRError:
                continue
            zoneDict[actuType] = self.receive_a_sensor(
                zone, actuType, beginTime, endTime, normType)
        return zoneDict

    def store_zone_sensors(self, zone, beginTime, endTime, normType, filename):
        """Append every point series of ``zone`` to the CSV file ``filename``."""
        data = self.receive_zone_sensors(zone, beginTime, endTime, normType)
        for key, val in data.items():
            # Write to the requested file; the path used to be hard-coded.
            val.to_csv(filename, header=key, mode='a')

    def _point_range(self, zone, pointType, longWindow, shortWindow):
        """Return ``(min, max)`` for one point of one zone.

        ``longWindow`` and ``shortWindow`` are ``(begin, end)`` pairs passed
        to the data source for measured ranges.
        """
        # NOTE(review): the property is spelled 'Presentvalue' here but
        # 'PresentValue' in receive_a_sensor -- confirm which one the data
        # source expects.
        if pointType in self._FIXED_RANGES:
            return self._FIXED_RANGES[pointType]
        uuid = self.get_actuator_uuid(zone, pointType)
        if pointType in self._SHORT_WINDOW_POINTS:
            data = self.bdm.get_sensor_ts(
                uuid, 'Presentvalue', shortWindow[0], shortWindow[1])
            return min(data), max(data)
        if pointType in self._FLOW_POINTS:
            data = self.bdm.get_sensor_ts(
                uuid, 'Presentvalue', shortWindow[0], shortWindow[1])
            return 0, data[0]
        if pointType == 'Damper Command':
            data = self.bdm.get_sensor_ts(
                uuid, 'Presentvalue', shortWindow[0], shortWindow[1])
            meanData = np.mean(data)
            stdData = np.std(data)
            return meanData - 2 * stdData, meanData + 2 * stdData
        data = self.bdm.get_sensor_ts(
            uuid, 'Presentvalue', longWindow[0], longWindow[1])
        return min(data), max(data)

    def store_minmax_dict(self):
        """Compute per-zone, per-point min/max and pickle both dicts.

        Writes ``metadata/mindict.pkl`` and ``metadata/maxdict.pkl``. A point
        whose range cannot be determined is reported and skipped so that one
        failure does not abort the whole run.
        """
        minDict = defaultdict(dict)
        maxDict = defaultdict(dict)
        longWindow = (datetime(2015, 2, 1), datetime(2015, 9, 1))
        shortWindow = (datetime(2015, 8, 1), datetime(2015, 8, 2))
        pointTypes = self.actuNames.nameList + self.sensorNames.nameList
        for zone in self.zonelist:
            for pointType in pointTypes:
                try:
                    low, high = self._point_range(
                        zone, pointType, longWindow, shortWindow)
                except Exception as err:
                    # Best effort, but no longer swallows KeyboardInterrupt
                    # or SystemExit, and says which point failed.
                    print("Something is wrong: %s / %s: %r"
                          % (zone, pointType, err))
                    continue
                minDict[zone][pointType] = low
                maxDict[zone][pointType] = high
        with open('metadata/mindict.pkl', 'wb') as fp:
            pickle.dump(minDict, fp)
        with open('metadata/maxdict.pkl', 'wb') as fp:
            pickle.dump(maxDict, fp)