def _get_training_data(self):
    """Extract, combine and scale vehicle/non-vehicle features.

    Returns:
        (features, labels): scaled feature matrix and a label vector with
        1 for vehicle rows and 0 for non-vehicle rows.
    """
    print('Extracting vehicle features ...')
    t = time.time()
    vehicle_features = FeatureExtractor.extract_features_for_multiple_images(
        ClassifierTrainer._get_vehicle_img_paths(), cspace='YUV'
    )
    t2 = time.time()
    # print() calls for Python 3 compatibility (these were Python 2 print
    # statements while the rest of the method already used print()).
    print(round(t2 - t, 2), 'Seconds to extract vehicle features')
    print_feature_info("vehicle_features", vehicle_features)

    print('Extracting non-vehicle features ...')
    t = time.time()
    non_vehicle_features = FeatureExtractor.extract_features_for_multiple_images(
        ClassifierTrainer._get_non_vehicle_img_paths(), cspace='YUV'
    )
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to extract non-vehicle features')
    print_feature_info("non_vehicle_features", non_vehicle_features)

    # Combine and scale the vehicle and non-vehicle features.
    combined = np.vstack((vehicle_features, non_vehicle_features)).astype(np.float64)
    print_feature_info("combined", combined)
    # Fit a per-column scaler only on the first call; reuse it afterwards.
    if not self.feature_scaler:
        self.feature_scaler = StandardScaler().fit(combined)
    # Apply the scaler to X
    features = self.feature_scaler.transform(combined)
    # Define the labels vector: 1 = vehicle, 0 = non-vehicle.
    labels = np.hstack(
        (np.ones(len(vehicle_features)), np.zeros(len(non_vehicle_features)))
    )
    return features, labels
def __init__(self, parent, **kw):
    """Build the Cinder main window: model, header, buttons and result area.

    parent: the Tk root window this frame attaches to.
    kw: extra keyword arguments forwarded to the ttk.Frame constructor.
    """
    super().__init__(parent, **kw)
    self.parent = parent
    # Classification threshold constant — presumably the malware score cut-off;
    # TODO confirm against where self.threshold is consumed.
    self.threshold = THRESHOLD
    # Pre-trained LightGBM booster loaded from disk.
    self.model = lgb.Booster(model_file=MODEL_PATH)
    self.extractor = FeatureExtractor()
    # --- root window chrome: fixed-size 800x600, no menu tear-off ---
    parent.title("Cinder")
    parent.geometry("800x600")
    parent.resizable(width=False, height=False)
    parent.grid_columnconfigure(0, weight=1)
    parent.grid_rowconfigure(0, weight=1)
    parent.option_add('*tearOff', 'FALSE')
    # --- this frame fills the root window ---
    self.grid(column=0, row=0, sticky='nsew')
    self.grid_columnconfigure(0, weight=1)
    # Header label spanning both button columns.
    label = ttk.Label(
        self, text="Cinder - A tiny Machine learning-based Malware Detector",
        font='Arial 24 bold')
    label.grid(row=0, column=0, columnspan=2, sticky='nsew')
    label.configure(anchor="center")
    # Scan / Clear buttons wired to the corresponding handlers.
    btn_scan = ttk.Button(self, text='Scan', command=self.scan)
    btn_scan.grid(row=1, column=0, sticky='ew')
    btn_reset = ttk.Button(self, text='Clear', command=self.clear)
    btn_reset.grid(row=1, column=1, sticky='ew')
    # Container for the scan results table.
    self.table_result = ttk.Frame(self)
    self.table_result.grid(row=2, column=0, columnspan=2)
    # Uniform padding for every direct child of this frame.
    for child in self.winfo_children():
        child.grid_configure(padx=10, pady=5)
def call_extract_features(arg_json, arg_nlp, arg_templates, arg_parameters):
    """Build a feature extractor from labeled examples and print the features
    of one fully-filled derivation (debug driver)."""
    examples = LabeledExample.read(arg_json)
    indices = [example.index for example in examples.itervalues()]
    natural_language = {idx: NLP.read(arg_nlp, idx) for idx in indices}
    word_problems = [WordProblem(examples[idx], natural_language[idx])
                     for idx in indices]

    with open(arg_templates, 'rt') as template_file:
        parsed = json.loads(template_file.read())
    unique_templates = [Template.from_json(blob) for blob in parsed['templates']]

    # TODO(Eric): using only 2 word problems for testing
    unique_templates = unique_templates[:2]
    word_problems = word_problems[:2]

    feature_extractor = FeatureExtractor(unique_templates, word_problems)
    derivations = initialize_partial_derivations_for_all_templates(
        word_problems[0], unique_templates)
    # Greedily fill every slot of the first partial derivation.
    derivation = derivations[0]
    while not derivation.is_complete():
        derivation = derivation.all_ways_to_fill_next_slot()[0]
    print(feature_extractor.extract(derivation))
    print(derivation)
def predict(window):
    """
    Given a window of audio data, predict the speaker. Then use the
    onSpeakerDetected(speaker) method to notify the Android application.

    You must use the same feature extraction method that you used to train
    the model.
    """
    extractor = FeatureExtractor(debug=False)
    # The classifier expects a 1 x d matrix, so reshape the feature vector.
    features = np.reshape(extractor.extract_features(window), (1, -1))
    # Restore the scaler that was fitted during training.
    with open(os.path.join(output_dir, 'scaler.pickle'), 'rb') as pickle_file:
        scaler = pickle.load(pickle_file)
    # predict() returns a vector; index 0 holds the single predicted label,
    # which is forwarded to the UI via onSpeakerDetected.
    onSpeakerDetected(classifier.predict(scaler.transform(features))[0])
def run_instance(n_components, max_iter, emphasis_coefficient, energy_multiplier,
                 energy_range, n_ccs, win_len, win_step, frame_length, frame_skip,
                 top_db):
    """Train a GMM diariser on per-speaker directories under profile_data/
    and return the mean 5-fold cross-validation accuracy.

    Numeric hyper-parameters arrive as floats (e.g. from a tuner) and are cast
    to int where the underlying APIs require integers.
    """
    data_directory = 'profile_data/'
    X_train = []
    y_train = []
    # Instantiate model and feature extractor. Backslash continuations were
    # redundant inside the bracketed dict literals (implicit continuation).
    d_params = {
        'n_components': int(n_components),
        'max_iter': int(max_iter),
    }
    diarizer = GMMDiariser(d_params)
    f_params = {
        'emphasis_coefficient': emphasis_coefficient,
        'energy_multiplier': energy_multiplier,
        'energy_range': int(energy_range),
        'n_ccs': int(n_ccs),
        'win_len': win_len,
        'win_step': win_step,
        'frame_length': int(frame_length),
        'frame_skip': int(frame_skip),
        'top_db': int(top_db),
    }
    extractor = FeatureExtractor(f_params)
    # Init diarizer with classes based on filesystem layout: one dir per label.
    classes = os.listdir(data_directory)
    diarizer.init_profiles(labels=classes)
    D = len(classes)
    # Grab training data. This only works when we concatenate data; if we
    # don't, we have to do a little extra.
    for label in classes:
        class_dir = os.path.join(data_directory, label)
        X_class = extractor.extract_features_dir(dir=class_dir, concatenate=True)
        N_train = X_class.shape[0]
        y_class = diarizer.label_to_vector(label=label, N=N_train, D=D)
        X_train.append(X_class)
        y_train.append(y_class)
    # Flatten per-class data into single arrays.
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)
    # TODO: Remove this for bayesian tuning...
    X_train, y_train = diarizer.shuffle_data(X_train, y_train)
    # Do cross-validation
    accuracies = diarizer.cross_validate(X=X_train, y=y_train, n_folds=5,
                                         shuffle=False)
    return np.mean(accuracies)
def vectorize_data(arg):
    """Vectorise one raw-feature record into shared memmapped X and y arrays.

    arg is a 5-tuple: (row index, raw JSON string, path of the X memmap,
    path of the y memmap, total number of rows).
    """
    row, raw_data, x_path, y_path, n_rows = arg
    extractor = FeatureExtractor()
    dim = FeatureExtractor.dim
    parsed = json.loads(raw_data)
    # Write the label into the shared y array.
    labels = np.memmap(y_path, dtype=np.float32, mode="r+", shape=n_rows)
    labels[row] = parsed["label"]
    # Write the processed feature vector into the shared X matrix.
    vector = extractor.process_raw_features(parsed)
    matrix = np.memmap(x_path, dtype=np.float32, mode="r+", shape=(n_rows, dim))
    matrix[row] = vector
def make_data(self, external_collector, run_numbers):
    """Run the full data pipeline for the given runs:
    metadata -> ROOT conversion -> feature extraction -> training data."""
    self.make_meta(external_collector)
    converter = Root2Py(self, external_collector, self.njobs)
    converter.process_runs(run_numbers)
    extractor = FeatureExtractor(self, self.njobs)
    extractor.make_features(run_numbers)
    train_maker = TrainDataMaker(self)
    train_maker.make_train_data(run_numbers, extractor)
def extraction_process_(paths, n_frames, n_blocks):
    """A single process of feature extraction.

    Returns a dict mapping each file path in `paths` to the feature vector
    extracted from it.
    """
    extractor = FeatureExtractor(n_frames, n_blocks)
    path_feature_map = {}
    # The previous enumerate() index was never used; iterate directly.
    for filepath in paths:
        path_feature_map[filepath] = extractor.extract(filepath)
    return path_feature_map
def __init__(self, tdm):
    """Set up feature extractors for edges, nodes and target-side (RHS) contexts."""
    self.tdm = tdm
    # Edge-local features.
    self.local_fe = FeatureExtractor(get_all_feature_classes(LocalFeatureClass))
    # Node-local features.
    self.node_fe = FeatureExtractor(get_all_feature_classes(NodeFeatureClass))
    # RHS features: only those local feature classes that look at the target side.
    target_side_classes = [
        feature_class
        for feature_class in get_all_feature_classes(LocalFeatureClass)
        if feature_class.feature_side & LocalContext.TARGET
    ]
    self.rhs_fe = FeatureExtractor(target_side_classes)
def __init__(self, points_on_normal=6, search_points_on_normal=5):
    """Initialise per-pyramid-level model containers and the feature extractor.

    points_on_normal: profile points sampled on each side during training.
    search_points_on_normal: extra points searched on each side at fit time.
    """
    super().__init__()
    # Literal [] is the idiomatic (and faster) spelling of an empty list.
    self.cov_mat_pyr_inv = []
    self.mean_vec_pyr = []
    self.pca_shape_pyr = []
    self.eigenvectors_pyr = []
    self.eigenvalues_pyr = []
    self.sigma2_pyr = []
    self.points_on_normal = points_on_normal
    self.search_points_on_normal = search_points_on_normal
    # self.pyramid_level is presumably set by the base-class __init__ —
    # TODO confirm; it is read here before any assignment in this class.
    self.feature_extractor = FeatureExtractor(self.pyramid_level,
                                              self.points_on_normal,
                                              self.search_points_on_normal)
    self.params_limits = None
def __init__(self, name, regularization_C, percentile=None):
    """Initialise the relation-scorer shell; the heavy model is built later.

    name: base model name; the relscorer suffix is appended.
    regularization_C: regularisation strength for the underlying model.
    percentile: optional top-percentile cut for ngram features.
    """
    name += self.get_relscorer_suffix()
    MLModel.__init__(self, name, None)
    # Note: The model is lazily loaded when needed.
    self.model = None
    self.regularization_C = regularization_C
    self.top_percentile = percentile
    self.label_encoder = None
    self.dict_vec = None
    self.scaler = None
    # The index of the correct label.
    self.correct_index = -1
    # Positional flags select which feature groups are extracted —
    # TODO confirm their meaning against FeatureExtractor's signature.
    self.feature_extractor = FeatureExtractor(False, True, entity_features=False)
def __init__(self, name, rel_score_model):
    """Initialise the candidate-pruner shell; the heavy model is built later.

    name: base model name; the pruner suffix is appended.
    rel_score_model: relation scoring model supplied to the feature extractor.
    """
    name += self.get_pruner_suffix()
    MLModel.__init__(self, name, None)
    # Note: The model is lazily loaded when needed.
    self.model = None
    self.label_encoder = None
    self.dict_vec = None
    self.scaler = None
    # The index of the correct label.
    self.correct_index = -1
    # Positional flags select which feature groups are extracted —
    # TODO confirm their meaning against FeatureExtractor's signature.
    self.feature_extractor = FeatureExtractor(
        True, False,
        relation_score_model=rel_score_model,
        entity_features=True)
def extract_all_to_csv(cut_res, obj, outfile, cut_requirement=-1):
    """Extract features for every event matching cut_requirement and write a CSV.

    cut_res: dict mapping SNID -> info dict with 'cut' and 'lightcurve' keys
             (Python 2: iterated with iteritems()).
    obj: object type written into the OBJ column of every row.
    outfile: path of the output CSV.
    cut_requirement: only events whose info['cut'] equals this are extracted.
    """
    feature_extractor = FeatureExtractor()
    data = []
    # Dead commented-out progress-bar code removed.
    for snid, info in cut_res.iteritems():
        # extract features only when the cut requirement is met
        if info['cut'] == cut_requirement:
            flts = np.unique(info['lightcurve']['FLT'].values)
            data_dict = extract(info['lightcurve'], flts, feature_extractor)
            data_dict['SNID'] = snid
            data_dict['OBJ'] = obj
            data.append(data_dict)
    df = pd.DataFrame(data)
    # drop rows where every value is NaN (events with no good observations)
    df = df.dropna(how='all')
    # replace remaining NaNs with 'N' to be consistent
    df = df.fillna('N')
    df.to_csv(outfile, index=False)
def main(args):
    """Extract backbone features for the requested dataset splits into an HDF5 file.

    args: parsed CLI namespace with splits, cfg, n_workers, batch_size and
    features_hdf attributes.
    """
    logging.basicConfig(level=logging.DEBUG)
    if args.splits is None:
        print("At least one dataset split must be specified using --split")
        import sys
        sys.exit(1)
    cfg = RGBConfig(**load_jsonnet(args.cfg))
    model: AggregatedBackboneModel = cfg.get_model()
    # Only the backbone is needed for feature extraction; switch to eval mode.
    backbone = model.backbone.eval()
    device = torch.device("cuda")
    backbone = torch.nn.DataParallel(backbone).to(device)
    datasets = get_datasets(cfg, args.splits)
    dataloaders = {
        name: DataLoader(
            dataset,
            # sadly since we are dealing with tensors of variable size we have to set
            # batch size to 1 unless we wish to deal with packing and unpacking which
            # is a massive pain.
            batch_size=1,
            pin_memory=True,
            shuffle=False,
            num_workers=args.n_workers,
        )
        for name, dataset in datasets.items()
    }
    feature_extractor = FeatureExtractor(
        backbone_2d=backbone, device=device, frame_batch_size=args.batch_size
    )
    total_instances = extract_features_to_hdf(
        dataloaders, feature_extractor, args.features_hdf, cfg.model.backbone_dim
    )
    print(f"Extracted {total_instances} features.")
class FeatureAdder(object):
    """Attaches local edge and node feature vectors to a translation forest."""

    def __init__(self, tdm):
        self.tdm = tdm
        # Local (edges)
        self.local_fe = FeatureExtractor(get_all_feature_classes(LocalFeatureClass))
        # Local Node (nodes)
        self.node_fe = FeatureExtractor(get_all_feature_classes(NodeFeatureClass))
        # Local RHS: only local feature classes that look at the target side.
        rhs_fc = [fc for fc in get_all_feature_classes(LocalFeatureClass)
                  if fc.feature_side & LocalContext.TARGET
                  ]
        self.rhs_fe = FeatureExtractor(rhs_fc)

    def add_features(self, tforest, just_list = False):
        """Attach feature vectors to every node and edge of tforest.

        When just_list is True, nothing is attached; instead the set of
        distinct feature names (text before '=') seen across the forest is
        collected and returned. The set is returned in both modes but is
        empty when just_list is False.
        """
        allfeats = set()
        for node in tforest.nodes.values():
            node_local_context = LocalNodeContext(node, tforest.sent)
            node_features = self.node_fe.extract_all(node_local_context)
            if just_list:
                allfeats |= set([f.split('=')[0] for f in node_features])
            else:
                #node.fvector = " ".join([f for f in node_features])
                node.fvector = Vector(" ".join([f for f in node_features]))
            for edge in node.edges:
                local_context = LocalContext(node, edge, edge.rule, tforest.sent)
                features = self.local_fe.extract_all(local_context)
                # RHS/cluster feature extraction kept disabled:
                #local_context.set_cluster_level(self.tdm, 4)
                #features.extend(self.rhs_fe.extract_all(local_context))
                #local_context.set_cluster_level(self.tdm, 6)
                #features.extend(self.rhs_fe.extract_all(local_context))
                if just_list:
                    allfeats |= set([f.split('=')[0] for f in features])
                else:
                    # hack, add in features
                    # edge.fvector = " ".join([f for f in features])
                    edge.fvector = Vector(" ".join([f for f in features]))
                    edge.rule.fields = Vector(" ".join(features))
        return allfeats
def process_batch_frame(idx: int, file_paths: list, output_path: str,
                        extractor: ft.FeatureExtractor):
    """Extract the feature vector for the idx-th frame of a batch.

    Out-of-range indices or unreadable frames are replaced by a black
    850x850 placeholder so every batch entry yields a feature vector.
    (The annotation on file_paths was corrected from str to list — it is
    indexed and len()-ed below.)
    """
    path = file_paths[idx] if idx < len(file_paths) else ''
    frame = cv2.imread(join(output_path, SUBFOLDER_FRAMES, path))
    frame = preprocess_frames(frame, output_path, idx)
    # Use a deterministic zero placeholder (matching process_video_frame)
    # instead of random noise, so repeated runs produce identical features.
    if frame is None or frame.size == 0:
        frame = np.zeros((850, 850, 3), dtype=np.uint8)
    feature_vec = extractor.extract_single_feature_vector(
        frame, extractor.haralick_dist, extractor.hist_size, extractor.clip_limit)
    return feature_vec
def eval_options(board, depth, max_depth=1, successor_number=0):
    """Expectimax-style evaluation of the four moves available from `board`.

    Returns (mean score over sampled successors, action index of the best
    sampled score). Action 4 is a sentinel for "no move evaluated".
    Actions: Left = 0, Down = 1, Right = 2, Up = 3.
    """
    if depth == 1:
        print(f"{successor_number}")
    if depth == max_depth:
        # Depth cut-off: neutral score, sentinel action.
        return 0, 4
    # With a depth of 2:
    # - Maxes at 512 with decay >= 0.8
    # - Pretty consistent min of 1024 for decay <= 0.7
    # - Sometimes reaches 2048 with 0.7
    decay = 0.7
    scores = []
    actions = []
    successors = []
    seen = []
    #Left = 0, Down = 1, Right = 2, Up = 3
    #for each possible action:
    for i in range(4):
        successors = board_generator(board.copy(), i)
        counter = 0
        # Sample at most 15 successors per action to bound the branching factor.
        num_samples = min(len(successors), 15)
        random_sample = random.sample(range(len(successors)), num_samples)
        if depth == 0:
            print(
                f"Depth: {depth}, Direction: {i}, Number of successors: {len(successors)}, Number of samples: {num_samples}"
            )
        for index in random_sample:
            new_board = successors[index]
            # Skip boards already evaluated at this level; scores/actions stay
            # aligned because both appends below are skipped together.
            if in_seen(seen, new_board):
                continue
            seen.append(new_board)
            #extract features of result
            f = FeatureExtractor(new_board)
            #get score
            score = calculate_score(f.getfeatures())
            # Recurse one level deeper, discounting future scores by `decay`.
            scores.append(score + decay * eval_options(new_board, depth + 1,
                                                       max_depth, counter)[0])
            counter += 1
            actions.append(i)
    if not scores:
        # No legal move produced successors: neutral score, sentinel action.
        return 0, 4
    max_score = max(scores)
    return statistics.mean(scores), actions[scores.index(max_score)]
def __init__(self, image_names=[], f=800, mode='Spherical'):
    """Load the input images and pre-warp them according to the stitching mode.

    image_names: paths of the input images.
    f: focal length used for the cylindrical/spherical warps.
    mode: 'Cylindrical', 'Spherical' or 'Flat'.

    NOTE(review): the mutable default `image_names=[]` is shared across calls;
    it is only iterated here so it is harmless, but None would be safer.
    """
    self.__images = []
    self.__image_masks = []
    self.__features = []
    self.__image_pairs = []
    self.__start = None
    self.extractor = FeatureExtractor()
    self.matcher = FeatureMatcher()
    for image_name in image_names:
        img = cv2.imread(image_name)
        if mode=='Cylindrical':
            self.__transform_method = 'affine'
            #### convert rectangular image to cylindrical coordinates
            h,w = img.shape[:2]
            start=timer()
            K = np.array([[f, 0, w/2], [0, f, h/2], [0, 0, 1]]) # mock calibration matrix
            cylindrical_img, cylindrical_mask = cylindricalWarpImage(img, K)
            end=timer()
            print("convert time", end-start)
            print("converting", image_name, "to cylindrical ... ")
            # print("cylindrical mask dtype:", cylindrical_mask.dtype)
            # print("cylindrical mask shape:", cylindrical_mask.shape)
            self.__images.append(cylindrical_img)
            self.__image_masks.append(cylindrical_mask)
        if mode=='Spherical':
            self.__transform_method = 'affine'
            #### convert rectangular image to spherical coordinates
            # f = 3000
            start=timer()
            spherical_img, spherical_mask = warpSpherical(img, f)
            end=timer()
            print("convert time", end-start)
            print("converting", image_name, "to spherical ... ")
            self.__images.append(spherical_img)
            self.__image_masks.append(spherical_mask)
        if mode=='Flat':
            self.__transform_method = 'homography'
            #### flat: keep the image as-is with a full (all-255) mask
            self.__images.append(img)
            self.__image_masks.append(np.ones(img.shape[:2], dtype=np.uint8)*255)
def extract_features(
        file_path: str,
        save_dir: str,
        selector: KeyframeSelector,
        extractor: FeatureExtractor,
        force=False
):
    """
    Extracts features for the video and saves them in the given dir.

    :param file_path: video path.
    :param save_dir: directory to save the features.
    :param selector: keyframe selector that picks which frames to describe.
    :param extractor: feature extractor applied to the selected keyframes.
    :param force: when True, calculates features even if it was done previously.
    """
    # video name = path stripped of directory and extension
    video_name = re.split('[/.]', file_path)[-2]
    save_path_feats = f'{save_dir}/{video_name}-feats.npy'
    save_path_tags = f'{save_dir}/{video_name}-tags.npy'

    # skip already processed videos
    if not force and os.path.isfile(save_path_feats) and os.path.isfile(save_path_tags):
        print(f'Skipping video {video_name}')
        return

    print(f'Extracting features from video {video_name}')

    # obtain keyframes
    t0 = time.time()
    keyframes, timestamps, total_frames = selector.select_keyframes(file_path)
    selection = time.time() - t0
    print(f'selected {len(keyframes)} of {total_frames} frames in {selection:.1f} secs')

    # log selection time
    log_persistent(f'{len(timestamps)}\t{selection:.2f}\n', f'{save_dir}/selection_log.txt')

    # measure time
    t0 = time.time()

    # extract features and save
    features = extractor.extract_features(keyframes)
    np.save(save_path_feats, features)

    # generate tags ("<video> # <timestamp> # <index>" per keyframe) and save
    tags = np.empty(timestamps.shape[0], dtype='<U30')
    for i in range(timestamps.shape[0]):
        tags[i] = f'{video_name} # {timestamps[i]:.2f} # {i + 1}'
    np.save(save_path_tags, tags)

    extraction = time.time() - t0
    print(f'feature extraction for {len(timestamps)} frames took {extraction:.2f} seconds\n')

    # log extraction time
    log_persistent(f'{len(timestamps)}\t{extraction:.2f}\n', f'{save_dir}/extraction_log.txt')
    return
def __init__(self, src=None, _dtype="video"):
    """Load the media source, grab the first frame and set calibration constants.

    src: path to a video file or image sequence (required).
    _dtype: media type understood by Loader; "video" by default.
    Raises InputError when src is missing.
    """
    # Identity comparison with None is the idiomatic check (was `src == None`).
    if src is None:
        raise InputError("No source to video/image seq found!")
    self.loader = Loader(src, _dtype)
    self.loader._load_media_instance()
    self.scale = 1.0
    self.feature_extractor = FeatureExtractor()
    # load the first frame
    self.prev_frame = self.loader.load_frame()
    # given in kitti dataset / sequence under consideration
    self.focal = 718.8560
    self.camera_coords = (607.1928, 185.2157)
    # This is somewhat problematic because we do not know the initial pose.
    # Also these cannot stay 0: they become a matrix and a vector respectively.
    self.R_pos = 0
    self.t_pos = 0
def process_video_frame(output_path, img_path: str, extractor: ft.FeatureExtractor):
    """Extract the feature vector for one video frame image.

    Unreadable or non-array frames are replaced by a black 850x850 placeholder
    so a feature vector is always produced.
    """
    frame = cv2.imread(join(output_path, SUBFOLDER_FRAMES, img_path))
    frame = preprocess_frames(frame, output_path, img_path)
    # isinstance() replaces the exact-type checks; np.memmap subclasses
    # np.ndarray, so one subclass-aware test covers both accepted types.
    if frame is None or frame.size == 0 or not isinstance(frame, np.ndarray):
        frame = np.zeros((850, 850, 3), dtype=np.uint8)
    feature_vec = extractor.extract_single_feature_vector(
        frame, extractor.haralick_dist, extractor.hist_size, extractor.clip_limit)
    return feature_vec
def _single_img_features(self, img, cspace='RGB', orient=9, pix_per_cell=8,
                         cell_per_block=2):
    """Thin wrapper: delegate single-image feature extraction to FeatureExtractor."""
    return FeatureExtractor.extract_features_for_img(img=img,
                                                     cspace=cspace,
                                                     orient=orient,
                                                     pix_per_cell=pix_per_cell,
                                                     cell_per_block=cell_per_block)
def board_generator(board, direction):
    """Generate all successor boards after moving `board` in `direction`.

    Returns an empty set when the move changes nothing, [moved board] when
    the board is full, otherwise every board obtained by spawning a 2 or a 4
    in each empty cell of the moved board.
    """
    basic_board = move(board.copy(), direction)
    # The move changed nothing -> no legal successors in this direction.
    if np.array_equal(board, basic_board):
        return set()
    f = FeatureExtractor(basic_board)
    try:
        # A full board has no empty cell for a new tile to spawn in.
        if f.getfeatures()["empty_tiles"] == 0:
            return [basic_board]
    except (KeyError, IndexError):
        # BUG FIX: dict subscripting raises KeyError, not IndexError, when the
        # "empty_tiles" feature is missing — the old handler could never fire.
        print(":(")
    successors = []
    for r in range(4):
        for c in range(4):
            if basic_board[r][c] == 0:
                # Spawn both a 2 and a 4 in this empty cell.
                basic_board[r][c] = 2
                successors.append(basic_board.copy())
                basic_board[r][c] = 4
                successors.append(basic_board.copy())
                basic_board[r][c] = 0
    return successors
def group_features(selector: KeyframeSelector, extractor: FeatureExtractor,
                   force: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    """
    Groups all the features and tags in a directory and saves them in a file each.

    :param selector: keyframe selector used to locate the features directory.
    :param extractor: feature extractor used to locate the features directory.
    :param force: when True, groups features even if it was done previously.
    :return: (all_tags, all_features) arrays.
    """
    # full path to the features directory
    feats_dir = get_features_dir(selector=selector, extractor=extractor)

    # reload files if grouping was already done
    if os.path.isfile(f'{feats_dir}/{FEATURES_FILE}.npy') \
            and os.path.isfile(f'{feats_dir}/{TAGS_FILE}.npy') \
            and not force:
        print(f'Grouping already done for {feats_dir}')
        all_features = np.load(f'{feats_dir}/{FEATURES_FILE}.npy')
        all_tags = np.load(f'{feats_dir}/{TAGS_FILE}.npy')
        return all_tags, all_features

    # obtain all videos
    videos = os.listdir(get_videos_dir())

    # BUG FIX: np.str was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin str is the supported spelling.
    all_tags = np.empty(0, dtype=str)
    all_features = np.empty((0, extractor.descriptor_size()), dtype='int8')

    i = 0
    # reads all the features files and groups them in one
    for video in videos:
        if video.endswith('.mp4'):
            video_name = video.split('.')[0]
            tags, features = read_features(video_name, feats_dir)
            all_tags = np.concatenate((all_tags, tags))
            all_features = np.concatenate((all_features, features))
            i += 1

    print(f'{all_features.shape[0]:,d} feats read in {i} file{"s" if i > 1 else ""}')
    assert all_features.shape[0] == all_tags.shape[0], \
        'features and tags length must match'

    # save files
    np.save(f'{feats_dir}/{FEATURES_FILE}.npy', all_features)
    np.save(f'{feats_dir}/{TAGS_FILE}.npy', all_tags)
    return all_tags, all_features
class MonoVO:
    """Monocular visual odometry over a video or image sequence."""

    def __init__(self, src=None, _dtype="video"):
        """Load the media source, grab the first frame and set calibration constants.

        src: path to a video file or image sequence (required).
        _dtype: media type understood by Loader; "video" by default.
        Raises InputError when src is missing.
        """
        # Identity comparison with None is the idiomatic check (was `src == None`).
        if src is None:
            raise InputError("No source to video/image seq found!")
        self.loader = Loader(src, _dtype)
        self.loader._load_media_instance()
        self.scale = 1.0
        self.feature_extractor = FeatureExtractor()
        # load the first frame
        self.prev_frame = self.loader.load_frame()
        # given in kitti dataset / sequence under consideration
        self.focal = 718.8560
        self.camera_coords = (607.1928, 185.2157)
        # This is somewhat problematic because we do not know the initial pose.
        # Also these cannot stay 0: they become a matrix and a vector respectively.
        self.R_pos = 0
        self.t_pos = 0

    def _getFeatures(self):
        """Advance one frame, match features and fold (R, t) into the trajectory."""
        # keep a tracker to the previous frame and next frame
        self.curr_frame = self.loader.load_frame()
        prev_frame_kps, prev_frame_des = self.feature_extractor.find_keypoints(
            self.prev_frame)
        curr_frame_kps, curr_frame_des = self.feature_extractor.find_keypoints(
            self.curr_frame)
        R, t = self.feature_extractor.match_points_and_find_E(prev_frame_des,
                                                              curr_frame_des,
                                                              self.focal,
                                                              self.camera_coords)
        self.construct_trajectory(R, t)

    def construct_trajectory(self, R, t):
        """Accumulate the relative pose (R, t) into the global pose estimate."""
        # The following equations come from the homogeneous coordinate system.
        # BUG FIX: `R_pos` was an unbound name (NameError at runtime); the
        # accumulated rotation lives on self.R_pos.
        self.R_pos = np.dot(R, self.R_pos)
        # NOTE(review): `t` is unused — the translation update likely should be
        # self.t_pos + self.scale * np.dot(self.R_pos, t); confirm intended
        # formula before changing behavior.
        self.t_pos = self.t_pos + self.R_pos
        #TODO: write a unit test
def get_extracted_features(feature_type, train, test):
    """
    Extracts specified features from training and testing set

    parameters
    ----------
    feature_type: array of feature string (valid values: word, wordcont, char)
    train: the training set
    test : the testing set

    Returns:
    -------
    tuple: train, test features extracted from training and testing
    """
    extractor = FeatureExtractor()
    train_features = extractor.get_features(train, feature_type)
    test_features = extractor.get_features(test, feature_type)
    return train_features, test_features
def __init__(self, name, train_dataset, top_ngram_percentile=5,
             rel_regularization_C=None, **kwargs):
    """Initialise the ranker shell; heavyweight models are loaded lazily.

    name: base model name.
    train_dataset: dataset used to train the ranker.
    top_ngram_percentile: top-percentile cut for ngram features.
    rel_regularization_C: regularisation strength for the relation scorer.
    kwargs: forwarded to Ranker.__init__ and kept for later use.
    """
    MLModel.__init__(self, name, train_dataset)
    Ranker.__init__(self, name, **kwargs)
    # Note: The model is lazily loaded when score is called.
    self.model = None
    self.label_encoder = None
    self.dict_vec = None
    # The index of the correct label.
    self.correct_index = -1
    # Cache for pairwise comparison results.
    self.cmp_cache = dict()
    self.relation_scorer = None
    self.pruner = None
    self.scaler = None
    self.kwargs = kwargs
    self.top_ngram_percentile = top_ngram_percentile
    self.rel_regularization_C = rel_regularization_C
    # Only extract ngram features.
    self.feature_extractor = FeatureExtractor(True, False, None)
def visualize_tags(data_path, classes):
    """Plot the per-class frequency (%) of each PoS tag as a bar chart."""
    sents, labels, ids = load_data(data_path)
    extractor = FeatureExtractor(bow=False, negation=False, emoji=False,
                                 senti_words=False, emoticon=False,
                                 postag=True, verbose=False)
    extractor.make_bow(sents)
    tag_matrix = extractor.get_representation(sents)
    df = pd.DataFrame(tag_matrix, index=ids, columns=['N', 'ADV', 'ADJ', 'V'])
    df['label'] = [classes[l] for l in labels]
    # Normalise tag counts per class into percentages.
    counts = df.groupby('label').sum()
    counts = 100 * counts.div(counts.sum(axis=1), axis=0)
    counts.plot.bar(rot=0)
    plt.xlabel('Class')
    plt.ylabel('Frequency of PoS tag (%)')
    plt.show()
def __init__(self, name, rel_score_model):
    """Initialise the candidate-pruner shell; the heavy model is built later.

    name: base model name; the pruner suffix is appended.
    rel_score_model: relation scoring model supplied to the feature extractor.
    """
    name += self.get_pruner_suffix()
    MLModel.__init__(self, name, None)
    # Note: The model is lazily loaded when needed.
    self.model = None
    self.label_encoder = None
    self.dict_vec = None
    self.scaler = None
    # The index of the correct label.
    self.correct_index = -1
    # Positional flags select which feature groups are extracted —
    # TODO confirm their meaning against FeatureExtractor's signature.
    self.feature_extractor = FeatureExtractor(True, False,
                                              relation_score_model=rel_score_model,
                                              entity_features=True)
def extract_features_to_hdf(
    dataloaders: Dict[str, DataLoader],
    feature_extractor: FeatureExtractor,
    features_path: Path,
    feature_dim: int,
):
    """Run the feature extractor over every dataloader, persisting results to
    one HDF5 file (one group per dataset).

    Returns the total number of instances written across all datasets.
    """
    total_instances = 0
    with h5py.File(features_path, mode="w", swmr=True, libver="latest") as root_group:
        for dataset_name, loader in dataloaders.items():
            writer = HdfFeatureWriter(
                root_group.create_group(dataset_name),
                len(loader.dataset),
                feature_dim,
            )
            total_instances += feature_extractor.extract(loader, writer)
    return total_instances
def __init__(self, name, regularization_C, percentile=None):
    """Initialise the relation-scorer shell; the heavy model is built later.

    name: base model name; the relscorer suffix is appended.
    regularization_C: regularisation strength for the underlying model.
    percentile: optional top-percentile cut for ngram features.
    """
    name += self.get_relscorer_suffix()
    MLModel.__init__(self, name, None)
    # Note: The model is lazily loaded when needed.
    self.model = None
    self.regularization_C = regularization_C
    self.top_percentile = percentile
    self.label_encoder = None
    self.dict_vec = None
    self.scaler = None
    # The index of the correct label.
    self.correct_index = -1
    # Positional flags select which feature groups are extracted —
    # TODO confirm their meaning against FeatureExtractor's signature.
    self.feature_extractor = FeatureExtractor(False, True, entity_features=False)
def call_fold(arg_testfold, arg_numfolds, arg_foldoutput, arg_json, arg_nlp,
              arg_templates, arg_parameters):
    """Train on every fold except arg_testfold, persist the learned parameters,
    and print accuracy on the held-out fold.

    NOTE(review): arg_foldoutput is accepted but never used here.
    """
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()][:5]  # TODO just 5 for testing
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i]) for i in indices]
    # Split problem indices into folds; arg_testfold is held out for testing.
    fold_indices = make_fold_indices(arg_numfolds, len(word_problems))
    test_indices = fold_indices.pop(arg_testfold)
    train_indices = list()
    for per_fold in fold_indices:
        train_indices.extend(per_fold)
    with open(arg_templates, 'rt') as f_handle:
        raw = f_handle.read()
    parsed = json.loads(raw)
    unique_templates = [Template.from_json(j) for j in parsed['templates']]
    # Map word-problem index -> template index (JSON keys arrive as strings).
    wp_template_map = {int(k): v
                       for k, v in parsed['wp_template_map'].iteritems()}
    train_wps = [word_problems[i] for i in train_indices]
    train_templates_indices = list({wp_template_map[wp.labeled_example.index]
                                    for wp in train_wps})
    # Re-number templates so the classifier sees a dense 0..N-1 label space.
    remap_templates = {wp.labeled_example.index: train_templates_indices.index(
        wp_template_map[wp.labeled_example.index]) for wp in train_wps}
    train_templates = [unique_templates[i] for i in train_templates_indices]
    feature_extractor = FeatureExtractor(train_templates, train_wps)
    classifier = optimize_parameters(feature_extractor, train_wps,
                                     train_templates, remap_templates)
    # Persist the learned parameters as JSON.
    with open(arg_parameters, 'wt') as f_handle:
        f_handle.write(json.dumps(classifier.to_json()))
    correct = 0
    for test_i in test_indices:
        test_wp = word_problems[test_i]
        correct += classifier.solve(test_wp)
    print('{} correct out of {}'.format(correct, len(test_indices)))
def split_song(original_file_name, folder_path, sz_limit):
    """
    Load a song, split it into equal parts smaller than sz_limit samples,
    and export each part as an independent file.

    :param original_file_name: string
    :param folder_path: pathlib.Path
    :param sz_limit: int, maximum number of samples allowed per part
    :return: list of exported file names (just the original name if no split
             was needed)
    """
    wav, sr = librosa.load(str(folder_path / original_file_name), sr=None)
    ext = original_file_name.split('.')[-1]
    # wav is np.ndarray [shape=(n,) or (2, n)]
    if len(wav.shape) == 1:
        # mono: add a dummy channel dimension
        wav = np.expand_dims(wav, axis=0)
    size = wav.shape[1]
    ctr = 0
    # count how many binary halvings are needed until parts fit sz_limit
    while size >= sz_limit:
        size = size // 2
        ctr += 1
    if ctr == 0:
        return [original_file_name]
    # BUG FIX: halving ctr times yields 2**ctr parts, not 2*ctr; the old
    # formula produced parts larger than sz_limit once ctr > 2.
    parts = 2 ** ctr
    part_size = wav.shape[1] // parts
    file_names = []
    for part_idx in range(parts):
        wav_part = wav[:, part_idx * part_size:(part_idx + 1) * part_size]
        n_file_name = original_file_name.replace(ext, '{}.{}'.format(part_idx, ext))
        n_file_name = FeatureExtractor.save_mp3(wav_part.swapaxes(0, 1), sr, None,
                                                folder_path, None, None, None,
                                                n_file_name)
        file_names.append(n_file_name)
    return file_names
def run(self):
    """Run one SARSA episode of Easy21, updating weights after every step."""
    self.reset()
    game = Easy21()
    S = game.state()
    A = self.epsilon_greedy_action(S)
    while not game.isTerminal():
        Aprime = None
        game, R = game.step(A)
        Sprime = game.state()
        # Initialize Q to zero for "all" states
        # Our lookup table is only for interesting states
        # So we hack around by putting Q = 0
        if game.isTerminal():
            Q = 0
        else:
            Aprime = self.epsilon_greedy_action(Sprime)
            Q = self.Q(Sprime, Aprime)
        """
        This is our TD error
        """
        delta = R + Sarsa2.GAMMA * Q - self.Q(S, A)
        # Update weights along the feature vector of the (S, A) pair.
        features = FeatureExtractor(S, A).features()
        self.update(delta, features)
        S = Sprime
        A = Aprime
def __init__(self, name, train_dataset, top_ngram_percentile=5,
             rel_regularization_C=None, **kwargs):
    """Initialise the ranker shell; heavyweight models are loaded lazily.

    name: base model name.
    train_dataset: dataset used to train the ranker.
    top_ngram_percentile: top-percentile cut for ngram features.
    rel_regularization_C: regularisation strength for the relation scorer.
    kwargs: forwarded to Ranker.__init__ and kept for later use.
    """
    MLModel.__init__(self, name, train_dataset)
    Ranker.__init__(self, name, **kwargs)
    # Note: The model is lazily loaded when score is called.
    self.model = None
    self.label_encoder = None
    self.dict_vec = None
    # The index of the correct label.
    self.correct_index = -1
    # Cache for pairwise comparison results.
    self.cmp_cache = dict()
    self.relation_scorer = None
    self.pruner = None
    self.scaler = None
    self.kwargs = kwargs
    self.top_ngram_percentile = top_ngram_percentile
    self.rel_regularization_C = rel_regularization_C
    # Only extract ngram features.
    self.feature_extractor = FeatureExtractor(True, False, None)
send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) send_socket.connect(("none.cs.umass.edu", 9999)) # Load the classifier: output_dir = 'training_output' classifier_filename = 'classifier.pickle' with open(os.path.join(output_dir, classifier_filename), 'rb') as f: classifier = pickle.load(f) if classifier == None: print("Classifier is null; make sure you have trained it!") sys.exit() feature_extractor = FeatureExtractor(debug=False) def onSpeakerDetected(speaker): """ Notifies the client of the current speaker """ print("Speaker is {}.".format(speaker)) sys.stdout.flush() send_socket.send( json.dumps({ 'user_id': user_id, 'sensor_type': 'SENSOR_SERVER_MESSAGE', 'message': 'SPEAKER_DETECTED', 'data': { 'speaker': speaker
bit_representation = {'g': 1, 'r': 2, 'i': 4, 'z': 8} band_bits = bit_representation[band_set[0]] + bit_representation[band_set[1]] band_bit_dict = {3: 'gr', 6: 'ri', 12: 'iz', 5: 'gi', 9: 'gz', 10: 'rz'} pairs.append([band_bit_dict[band_bits]]) lc_pairs = pairs[np.argmax(np.array([len(x) for x in pairs]))] else: lc_pairs = [] ### Summary # Available filter pairs are listed in lc_pairs # Available filters are listed in lc_bands # Available number of observations is lc_nobs #based on lc_pairs, lc_bands, and lc_nobs, find the meaningful features FExtractor = FeatureExtractor() #['nobs_brighter_than', 'slope', 'same_nite_color_diff', 'total_color_diff', 'snr', 'flat', 'half', 'mag'] good_families = FExtractor.families if len(lc_pairs) == 0: good_families.remove('same_nite_color_diff') if len(lc_bands) < 2: good_families.remove('total_color_diff') if lc_nobs < 6: good_families.remove('half') if lc_nobs < 3: good_families.remove('flat') if lc_nobs < 2: good_families.remove('slope') useable_features = []
def create_fingerprint_from_capture_data(name, capture_data):
    """Build a named Fingerprint from recorded keystroke capture data."""
    assert isinstance(capture_data, KeystrokeCaptureData)
    extractor = FeatureExtractor()
    # Replay the captured keystrokes through the extractor, then fingerprint
    # the resulting feature set.
    capture_data.feed(extractor)
    return Fingerprint.from_features(name, extractor.extract_features())
x, y = circle_perimeter(pupil[0], pupil[1], pupil[2]) rgb[x,y] = 255 #ex, ey = ellipse.center #major, minor = ellipse.axes #orientation = ellipse.orientation #imshow(rgb) #x, y = ellipse_perimeter(int(ex), int(ey), int(major), int(minor), orientation) #rgb[x,y] = (220, 40, 40) #imshow(rgb) return e, image def get_rect(path): e, img = detect(path) img = cv2.linearPolar(img.T, (e.center[0], e.center[1]), 80, cv2.WARP_FILL_OUTLIERS).T imshow(img) img = img[0:23, :] return img def feature(path, f): img = get_rect(path) #imshow(img) feature = f.extract(img) return feature if __name__ == "__main__": d = Dataset('./data', suffix='.jpg') img = get_rect(d.images[1]) imshow(img) f = FeatureExtractor(set(['daisy', 'hog'])) feature = f.extract(img)
class AccuModel(MLModel, Ranker):
    """Performs a pair-wise transform to learn a ranking.

    It always compares two candidates and makes a classification decision
    using a random forest to decide which one should be ranked higher.
    """

    def score(self, candidate):
        # Ranking is purely comparison-based; no per-candidate score.
        pass

    def __init__(self, name, train_dataset,
                 top_ngram_percentile=5,
                 rel_regularization_C=None,
                 **kwargs):
        MLModel.__init__(self, name, train_dataset)
        Ranker.__init__(self, name, **kwargs)
        # Note: The model is lazily loaded when score is called.
        self.model = None
        self.label_encoder = None
        self.dict_vec = None
        # The index of the correct label.
        self.correct_index = -1
        # Cache of pre-computed pair-wise comparison results
        # (filled by _precompute_cmp, cleared after each ranking).
        self.cmp_cache = dict()
        self.relation_scorer = None
        self.pruner = None
        self.scaler = None
        self.kwargs = kwargs
        self.top_ngram_percentile = top_ngram_percentile
        self.rel_regularization_C = rel_regularization_C
        # Only extract ngram features.
        self.feature_extractor = FeatureExtractor(True, False, None)

    def load_model(self):
        """Load ranker, relation scorer and pruner from disk.

        Raises IOError (re-raised) if the model file cannot be read.
        """
        model_file = self.get_model_filename()
        try:
            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.scaler = scaler
            relation_scorer = RelationNgramScorer(
                self.get_model_name(),
                self.rel_regularization_C,
                percentile=self.top_ngram_percentile)
            relation_scorer.load_model()
            self.feature_extractor.relation_score_model = relation_scorer
            pruner = CandidatePruner(self.get_model_name(),
                                     relation_scorer)
            pruner.load_model()
            self.pruner = pruner
            self.dict_vec = dict_vec
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded."
                        % model_file)
            raise

    def learn_rel_score_model(self, queries):
        """Train and return a RelationNgramScorer on *queries*."""
        rel_model = RelationNgramScorer(self.get_model_name(),
                                        self.rel_regularization_C,
                                        percentile=self.top_ngram_percentile)
        rel_model.learn_model(queries)
        return rel_model

    def learn_prune_model(self, labels, features):
        """Train and return a CandidatePruner on single-candidate examples."""
        prune_model = CandidatePruner(self.get_model_name(),
                                      self.relation_scorer)
        prune_model.learn_model(labels, features)
        return prune_model

    def learn_model(self, train_queries, n_folds=6):
        # Split the training queries into folds. For each fold:
        # train a relation scorer on the train split, apply it as a
        # feature on the test split, and extract pair-wise and single
        # examples from the test split. Collect all test folds, then:
        # - train the pair (ranking) classifier on the pair examples,
        # - train the pruner on the single examples,
        # - train the final relation scorer on ALL queries.
        kf = KFold(len(train_queries), n_folds=n_folds, shuffle=True,
                   random_state=999)
        num_fold = 1
        pair_features = []  # used by learn_ranking_model
        pair_labels = []    # used by learn_ranking_model
        features = []       # used by learn_prune_model
        labels = []         # used by learn_prune_model
        # A relation scoring model is created and applied per split so the
        # relation-score feature on each test fold is out-of-fold.
        for train, test in kf:
            logger.info("Training relation score model on fold %s/%s" % (
                num_fold, n_folds))
            test_fold = [train_queries[i] for i in test]
            train_fold = [train_queries[i] for i in train]
            rel_model = self.learn_rel_score_model(train_fold)
            # The feature extractor uses the relation score as a feature.
            self.feature_extractor.relation_score_model = rel_model
            logger.info("Applying relation score model.")
            # Pair-wise examples for the ranking classifier.
            testfoldpair_features, testfoldpair_labels = construct_pair_examples(
                test_fold, self.feature_extractor)
            # Single-candidate examples for the pruner.
            testfold_features, testfold_labels = construct_examples(
                test_fold, self.feature_extractor)
            features.extend(testfold_features)
            labels.extend(testfold_labels)
            pair_features.extend(testfoldpair_features)
            pair_labels.extend(testfoldpair_labels)
            num_fold += 1
            logger.info("Done collecting features for fold.")
        logger.info("Training final relation scorer.")
        rel_model = self.learn_rel_score_model(train_queries)
        self.feature_extractor.relation_score_model = rel_model
        self.relation_scorer = rel_model
        self.pruner = self.learn_prune_model(labels, features)
        self.learn_ranking_model(pair_features, pair_labels)

    def learn_ranking_model(self, features, labels):
        """Train the random-forest pair classifier on pair-wise examples."""
        logger.info("Training tree classifier for ranking.")
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        labels = label_encoder.fit_transform(labels)
        vec = DictVectorizer(sparse=False)
        X = vec.fit_transform(features)
        X, labels = utils.shuffle(X, labels, random_state=999)
        decision_tree = RandomForestClassifier(class_weight='auto',
                                               random_state=999,
                                               n_jobs=6,
                                               n_estimators=90)
        logger.info("Training random forest...")
        decision_tree.fit(X, labels)
        logger.info("Done.")
        self.model = decision_tree
        self.dict_vec = vec
        self.label_encoder = label_encoder
        self.correct_index = label_encoder.transform([1])[0]

    def store_model(self):
        """Persist ranker, relation scorer and pruner to disk."""
        logger.info("Writing model to %s." % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler],
                    self.get_model_filename())
        self.relation_scorer.store_model()
        self.pruner.store_model()
        logger.info("Done.")

    def compare_pair(self, x_candidate, y_candidate):
        """Compare two candidates.

        Return 1 if x_candidate > y_candidate, else return -1.

        :param x_candidate:
        :param y_candidate:
        :return:
        """
        if not self.model:
            self.load_model()
        # BUGFIX: 'res' was only initialized on the branch where the model
        # was already loaded (dangling else), so the first comparison after
        # a fresh load raised NameError. Always initialize it.
        res = None
        # Use the precomputed preferences for sorting when available.
        if (x_candidate, y_candidate) in self.cmp_cache:
            res = self.cmp_cache[(x_candidate, y_candidate)]
        if res is None:
            x_features = self.feature_extractor.extract_features(
                x_candidate)
            y_features = self.feature_extractor.extract_features(
                y_candidate)
            diff = feature_diff(x_features, y_features)
            X = self.dict_vec.transform(diff)
            if self.scaler:
                X = self.scaler.transform(X)
            # Single example: predict single-threaded.
            self.model.n_jobs = 1
            p = self.model.predict(X)
            c = self.label_encoder.inverse_transform(p)
            res = c[0]
        if res == 1:
            return 1
        else:
            return -1

    def _precompute_cmp(self, candidates, max_cache_candidates=300):
        """Pre-compute comparisons.

        The main overhead is calling the classification routine.
        Therefore, pre-computing all O(n^2) comparisons (which can be
        done with a single classification call) is actually faster up
        to a limit.

        :param candidates:
        :param max_cache_candidates:
        :return:
        """
        if not self.model:
            self.load_model()
        self.cmp_cache = dict()
        pairs = []
        pair_features = []
        features = []
        if len(candidates) > max_cache_candidates:
            logger.info("Cannot precompute for all of %s candidates."
                        % len(candidates))
            return
        start = time.time()
        for c in candidates[:max_cache_candidates]:
            f = self.feature_extractor.extract_features(c)
            features.append(f)
        duration = (time.time() - start) * 1000
        logger.debug("FExtract took %s ms" % duration)
        start = time.time()
        for i, x in enumerate(candidates[:max_cache_candidates]):
            x_f = features[i]
            for j, y in enumerate(candidates[:max_cache_candidates]):
                # BUGFIX: was 'if i == y:' — comparing a loop index to a
                # candidate object, so self-pairs were never skipped.
                if i == j:
                    continue
                y_f = features[j]
                diff = feature_diff(x_f, y_f)
                pair_features.append(diff)
                pairs.append((x, y))
        duration = (time.time() - start) * 1000
        logger.debug("FDiff for %s took %s ms" % (len(pairs), duration))
        if len(pairs) > 0:
            X = self.dict_vec.transform(pair_features)
            if self.scaler:
                X = self.scaler.transform(X)
            self.model.n_jobs = 1
            start = time.time()
            p = self.model.predict(X)
            duration = (time.time() - start) * 1000
            logger.debug("Predict for %s took %s ms"
                         % (len(pairs), duration))
            c = self.label_encoder.inverse_transform(p)
            # Remember the #of wins/losses for each candidate.
            for (x, y), s in zip(pairs, c):
                self.cmp_cache[(x, y)] = s

    def rank_query_candidates(self, query_candidates, key=lambda x: x):
        """Rank query candidates by scoring and then sorting them.

        :param query_candidates:
        :return:
        """
        if not self.model:
            self.load_model()
        query_candidates = shuffle_candidates(query_candidates, key)
        num_candidates = len(query_candidates)
        logger.debug("Pruning %s candidates" % num_candidates)
        query_candidates = self.prune_candidates(query_candidates, key)
        logger.debug("%s of %s candidates remain" % (len(query_candidates),
                                                     num_candidates))
        start = time.time()
        candidates = [key(q) for q in query_candidates]
        self._precompute_cmp(candidates)
        # Python 2 comparison-based sort driven by the pair classifier.
        ranked_candidates = sorted(query_candidates,
                                   cmp=self.compare_pair,
                                   key=key,
                                   reverse=True)
        # Drop the cache; it is only valid for this candidate set.
        self.cmp_cache = dict()
        if len(query_candidates) > 0:
            duration = (time.time() - start) * 1000
            logger.debug(
                "Sorting %s candidates took %s ms. "
                "%s ms per candidate"
                % (len(query_candidates), duration,
                   float(duration) / len(query_candidates)))
        return ranked_candidates

    def prune_candidates(self, query_candidates, key):
        """Delegate pruning to the learned pruner; empty input stays empty."""
        remaining = []
        if len(query_candidates) > 0:
            remaining = self.pruner.prune_candidates(query_candidates, key)
        return remaining
class CandidatePruner(MLModel):
    """Learns a recall-optimized pruning model."""

    def __init__(self, name, rel_score_model):
        name += self.get_pruner_suffix()
        MLModel.__init__(self, name, None)
        # Note: The model is lazily loaded when needed.
        self.model = None
        self.label_encoder = None
        self.dict_vec = None
        self.scaler = None
        # The index of the correct label.
        self.correct_index = -1
        self.feature_extractor = FeatureExtractor(True, False,
                                                  relation_score_model=rel_score_model,
                                                  entity_features=True)

    def get_pruner_suffix(self):
        # Suffix appended to the model name to distinguish pruner files.
        return "_Pruner"

    def print_model(self, n_top=30):
        """Log the intercept and the n_top largest-magnitude feature weights."""
        dict_vec = self.dict_vec
        classifier = self.model
        logger.info("Printing top %s weights." % n_top)
        logger.info("intercept: %.4f" % classifier.intercept_[0])
        feature_weights = []
        for name, index in dict_vec.vocabulary_.iteritems():
            feature_weights.append((name, classifier.coef_[0][index]))
        # Sort by absolute weight so strong negative features show too.
        feature_weights = sorted(feature_weights, key=lambda x: math.fabs(x[1]),
                                 reverse=True)
        for name, weight in feature_weights[:n_top]:
            logger.info("%s: %.4f" % (name, weight))

    def learn_model(self, labels, features):
        """Fit a class-weighted logistic regression for pruning.

        :param labels: iterable of 0/1 labels (1 = keep candidate).
        :param features: list of feature dicts, one per candidate.
        """
        logger.info("Learning prune classifier.")
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        num_labels = float(len(labels))
        num_pos_labels = sum(labels)
        num_neg_labels = num_labels - num_pos_labels
        # Inverse-frequency class weights; positives get an extra 2x
        # boost to bias the model toward recall.
        pos_class_weight = num_labels / num_pos_labels
        neg_class_weight = num_labels / num_neg_labels
        pos_class_boost = 2.0
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        vec = DictVectorizer(sparse=False)
        X = vec.fit_transform(features)
        labels = label_encoder.fit_transform(labels)
        self.label_encoder = label_encoder
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(X)
        X, labels = utils.shuffle(X, labels, random_state=999)
        class_weights = {1: pos_class_weight * pos_class_boost,
                         0: neg_class_weight}
        logger.info(class_weights)
        logreg_cv = LogisticRegressionCV(Cs=20,
                                         class_weight=class_weights,
                                         cv=6,
                                         solver='lbfgs',
                                         n_jobs=6,
                                         # max_iter=40,
                                         verbose=True)
        logreg_cv.fit(X, labels)
        self.model = logreg_cv
        # Report training-set fit (optimistic; no held-out evaluation here).
        pred = self.model.predict(X)
        logger.info("F-1 score on train: %.4f" % metrics.f1_score(labels, pred, pos_label=1))
        logger.info("Classification report:\n" + classification_report(labels, pred))
        self.dict_vec = vec
        self.label_encoder = label_encoder
        self.print_model()
        logger.info("Done learning prune classifier.")

    def load_model(self):
        """Load the pruning model from disk; re-raises IOError on failure."""
        model_file = self.get_model_filename()
        try:
            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.dict_vec = dict_vec
            self.scaler = scaler
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded."
                        % model_file)
            raise

    def store_model(self):
        """Persist model, encoder, vectorizer and scaler to disk."""
        logger.info("Writing model to %s." % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler],
                    self.get_model_filename())
        logger.info("Done.")

    def prune_candidates(self, query_candidates, key):
        """Return only the query candidates the classifier predicts to keep.

        :param query_candidates: candidates to filter.
        :param key: callable mapping a query candidate to the object the
            feature extractor operates on.
        """
        remaining = []
        candidates = [key(q) for q in query_candidates]
        features = []
        for c in candidates:
            c_features = self.feature_extractor.extract_features(c)
            features.append(c_features)
        X = self.dict_vec.transform(features)
        X = self.scaler.transform(X)
        p = self.model.predict(X)
        # c = self.prune_label_encoder.inverse_transform(p)
        for candidate, predict in zip(query_candidates, p):
            if predict == 1:
                remaining.append(candidate)
        return remaining
class RelationNgramScorer(MLModel):
    """Learns a scoring based on question ngrams."""

    def __init__(self, name, regularization_C, percentile=None):
        name += self.get_relscorer_suffix()
        MLModel.__init__(self, name, None)
        # Note: The model is lazily loaded when needed.
        self.model = None
        # Regularization strength; None triggers a grid search in learn_model.
        self.regularization_C = regularization_C
        # If set, only the top chi2 percentile of n-gram features is kept.
        self.top_percentile = percentile
        self.label_encoder = None
        self.dict_vec = None
        self.scaler = None
        # The index of the correct label.
        self.correct_index = -1
        self.feature_extractor = FeatureExtractor(False, True,
                                                  entity_features=False)

    def get_relscorer_suffix(self):
        # Suffix appended to the model name for relation-scorer files.
        return "_RelScore"

    def load_model(self):
        """Load the relation scorer from disk; re-raises IOError on failure."""
        model_file = self.get_model_filename()
        try:
            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.dict_vec = dict_vec
            self.scaler = scaler
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded."
                        % model_file)
            raise

    def learn_model(self, train_queries):
        """Train an SGD logistic model on n-gram features from *train_queries*."""
        if self.top_percentile:
            # Optional chi2-based n-gram feature selection.
            logger.info("Collecting frequent n-gram features...")
            n_grams_dict = get_top_chi2_candidate_ngrams(train_queries,
                                                         self.feature_extractor,
                                                         percentile=self.top_percentile)
            logger.info("Collected %s n-gram features" % len(n_grams_dict))
            self.feature_extractor.ngram_dict = n_grams_dict
        features, labels = construct_examples(train_queries,
                                              self.feature_extractor)
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        labels = label_encoder.fit_transform(labels)
        vec = DictVectorizer(sparse=True)
        # Sparse input: scale without centering to keep sparsity.
        scaler = StandardScaler(with_mean=False)
        X = vec.fit_transform(features)
        X = scaler.fit_transform(X)
        X, labels = utils.shuffle(X, labels, random_state=999)
        logger.info("#Features: %s" % len(vec.vocabulary_))
        # Perform grid search or use provided C.
        if self.regularization_C is None:
            logger.info("Performing grid search.")
            relation_scorer = SGDClassifier(loss='log', class_weight='auto',
                                            n_iter=np.ceil(
                                                10 ** 6 / len(labels)),
                                            random_state=999)
            cv_params = [{"alpha": [10.0, 5.0, 2.0, 1.5, 1.0, 0.5, 0.1,
                                    0.01, 0.001, 0.0001]}]
            grid_search_cv = grid_search.GridSearchCV(relation_scorer,
                                                      cv_params,
                                                      n_jobs=8,
                                                      verbose=1,
                                                      cv=8,
                                                      refit=True)
            grid_search_cv.fit(X, labels)
            logger.info("Best score: %.5f" % grid_search_cv.best_score_)
            logger.info("Best params: %s" % grid_search_cv.best_params_)
            self.model = grid_search_cv.best_estimator_
        else:
            logger.info("Learning relation scorer with C: %s."
                        % self.regularization_C)
            relation_scorer = SGDClassifier(loss='log', class_weight='auto',
                                            n_iter=np.ceil(
                                                10 ** 6 / len(labels)),
                                            alpha=self.regularization_C,
                                            random_state=999)
            relation_scorer.fit(X, labels)
            logger.info("Done.")
            self.model = relation_scorer
        self.dict_vec = vec
        self.scaler = scaler
        self.label_encoder = label_encoder
        self.correct_index = label_encoder.transform([1])[0]
        self.print_model()

    def print_model(self, n_top=20):
        """Log the intercept and the n_top largest-magnitude feature weights."""
        dict_vec = self.dict_vec
        classifier = self.model
        logger.info("Printing top %s weights." % n_top)
        logger.info("intercept: %.4f" % classifier.intercept_[0])
        feature_weights = []
        for name, index in dict_vec.vocabulary_.iteritems():
            feature_weights.append((name, classifier.coef_[0][index]))
        # Sort by absolute weight so strong negative features show too.
        feature_weights = sorted(feature_weights, key=lambda x: math.fabs(x[1]),
                                 reverse=True)
        for name, weight in feature_weights[:n_top]:
            logger.info("%s: %.4f" % (name, weight))

    def store_model(self):
        """Persist model, encoder, vectorizer and scaler to disk."""
        logger.info("Writing model to %s."
                    % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler],
                    self.get_model_filename())
        logger.info("Done.")

    def score(self, candidate):
        """Return a RankScore: the predicted probability of the correct class."""
        if not self.model:
            self.load_model()
        features = self.feature_extractor.extract_features(candidate)
        X = self.dict_vec.transform(features)
        X = self.scaler.transform(X)
        prob = self.model.predict_proba(X)
        # Prob is an array of n_examples, n_classes
        score = prob[0][self.correct_index]
        return RankScore(score)