Example #1
    def _get_training_data(self):
        print('Extracting vehicle features ...')
        t = time.time()
        vehicle_features = FeatureExtractor.extract_features_for_multiple_images(
            ClassifierTrainer._get_vehicle_img_paths(), cspace='YUV'
        )
        t2 = time.time()
        print(round(t2 - t, 2), 'Seconds to extract vehicle features')
        print_feature_info("vehicle_features", vehicle_features)

        print('Extracting non-vehicle features ...')
        t = time.time()
        non_vehicle_features = FeatureExtractor.extract_features_for_multiple_images(
            ClassifierTrainer._get_non_vehicle_img_paths(), cspace='YUV'
        )
        t2 = time.time()
        print(round(t2 - t, 2), 'Seconds to extract non-vehicle features')
        print_feature_info("non_vehicle_features", non_vehicle_features)

        # combine and scale the vehicle and non-vehicle features
        combined = np.vstack((vehicle_features, non_vehicle_features)).astype(np.float64)
        print_feature_info("combined", combined)
        # Fit a per-column scaler
        if not self.feature_scaler:
            self.feature_scaler = StandardScaler().fit(combined)
        # Apply the scaler to X
        features = self.feature_scaler.transform(combined)

        # Define the labels vector
        labels = np.hstack(
            (np.ones(len(vehicle_features)), np.zeros(len(non_vehicle_features)))
        )

        return features, labels
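
The features and labels returned here are ready for a scikit-learn classifier. A minimal follow-up sketch, assuming a LinearSVC and an 80/20 hold-out split (neither appears in the original trainer):

import time

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def train_classifier(features, labels):
    # Hypothetical next step: split the scaled features and fit a linear SVM.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)
    clf = LinearSVC()
    t = time.time()
    clf.fit(X_train, y_train)
    print(round(time.time() - t, 2), 'Seconds to train the classifier')
    print('Test accuracy:', round(clf.score(X_test, y_test), 4))
    return clf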
Example #2
    def __init__(self, parent, **kw):
        super().__init__(parent, **kw)
        self.parent = parent
        self.threshold = THRESHOLD
        self.model = lgb.Booster(model_file=MODEL_PATH)
        self.extractor = FeatureExtractor()

        parent.title("Cinder")
        parent.geometry("800x600")
        parent.resizable(width=False, height=False)
        parent.grid_columnconfigure(0, weight=1)
        parent.grid_rowconfigure(0, weight=1)
        parent.option_add('*tearOff', 'FALSE')
        self.grid(column=0, row=0, sticky='nsew')

        self.grid_columnconfigure(0, weight=1)
        label = ttk.Label(
            self,
            text="Cinder - A tiny Machine learning-based Malware Detector",
            font='Arial 24 bold')
        label.grid(row=0, column=0, columnspan=2, sticky='nsew')
        label.configure(anchor="center")
        btn_scan = ttk.Button(self, text='Scan', command=self.scan)
        btn_scan.grid(row=1, column=0, sticky='ew')
        btn_reset = ttk.Button(self, text='Clear', command=self.clear)
        btn_reset.grid(row=1, column=1, sticky='ew')
        self.table_result = ttk.Frame(self)
        self.table_result.grid(row=2, column=0, columnspan=2)

        for child in self.winfo_children():
            child.grid_configure(padx=10, pady=5)
Example #3
def call_extract_features(arg_json, arg_nlp, arg_templates, arg_parameters):
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()]
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i])
                     for i in indices]

    with open(arg_templates, 'rt') as f_handle:
        raw = f_handle.read()

    parsed = json.loads(raw)
    unique_templates = [Template.from_json(j) for j in parsed['templates']]
    # TODO(Eric): using only 2 word problems for testing
    unique_templates = unique_templates[:2]
    word_problems = word_problems[:2]

    feature_extractor = FeatureExtractor(unique_templates, word_problems)
    derivations = initialize_partial_derivations_for_all_templates(
        word_problems[0], unique_templates)
    derivation = derivations[0]
    while not derivation.is_complete():
        derivation = derivation.all_ways_to_fill_next_slot()[0]

    print(feature_extractor.extract(derivation))
    print(derivation)
Example #4
def predict(window):
    """
    Given a window of audio data, predict the speaker.
    Then use the onSpeakerDetected(speaker) method to notify the
    Android application. You must use the same feature
    extraction method that you used to train the model.
    """

    # TODO: Extract features and predict class label

    # You may need to reshape your feature vector into a 1 X d matrix as follows:
    # X = np.reshape(X,(1,-1))

    # Create a feature extractor
    feature_extractor = FeatureExtractor(debug=False)

    # Extract features from the window
    X = feature_extractor.extract_features(window)

    # Reshape features into matrix for prediction
    X = np.reshape(X, (1, -1))

    # When you get a label, send it to the UI by calling onSpeakerDetected:
    # onSpeakerDetected(speaker)

    # Load the pickle file of the scaler
    with open(os.path.join(output_dir, 'scaler.pickle'), 'rb') as f:
        # Initialize the scaler
        scaler = pickle.load(f)

        # Send the prediction, the 0 index is because the prediction will be a vector
        onSpeakerDetected(classifier.predict(scaler.transform(X))[0])

    return
Example #5
def run_instance(n_components, max_iter, emphasis_coefficient,
                 energy_multiplier, energy_range, n_ccs, win_len, win_step,
                 frame_length, frame_skip, top_db):
    data_directory = 'profile_data/'
    X_train = []
    y_train = []

    # Instantiate model and feature extractor
    d_params = {'n_components': int(n_components),
                'max_iter': int(max_iter)}
    diarizer = GMMDiariser(d_params)

    f_params = {'emphasis_coefficient': emphasis_coefficient,
                'energy_multiplier': energy_multiplier,
                'energy_range': int(energy_range),
                'n_ccs': int(n_ccs),
                'win_len': win_len,
                'win_step': win_step,
                'frame_length': int(frame_length),
                'frame_skip': int(frame_skip),
                'top_db': int(top_db)}
    extractor = FeatureExtractor(f_params)

    # Init diarizer with classes based on filesystem
    classes = os.listdir(data_directory)
    diarizer.init_profiles(labels=classes)
    D = len(classes)

    # Grab training and testing data
    # This only works when we concatenate data, if we don't we have to do a little extra
    for label in classes:
        class_dir = os.path.join(data_directory, label)

        X_class = extractor.extract_features_dir(dir=class_dir,
                                                 concatenate=True)
        N_train = X_class.shape[0]
        y_class = diarizer.label_to_vector(label=label, N=N_train, D=D)

        X_train.append(X_class)
        y_train.append(y_class)

    # Flatten data
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)

    # TODO: Remove this for bayesian tuning...
    X_train, y_train = diarizer.shuffle_data(X_train, y_train)

    # Do cross-validation
    accuracies = diarizer.cross_validate(X=X_train,
                                         y=y_train,
                                         n_folds=5,
                                         shuffle=False)
    return np.mean(accuracies)
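
Because run_instance returns a single mean cross-validation accuracy, it can serve directly as the objective of a Bayesian hyperparameter search, which the TODO above hints at. A minimal sketch using the third-party bayes_opt package; the package choice and the bounds below are assumptions, not part of the original code:

from bayes_opt import BayesianOptimization

# Hypothetical search bounds; the real ranges depend on the audio data.
pbounds = {
    'n_components': (1, 8),
    'max_iter': (50, 500),
    'emphasis_coefficient': (0.90, 0.99),
    'energy_multiplier': (0.01, 0.10),
    'energy_range': (50, 200),
    'n_ccs': (10, 20),
    'win_len': (0.02, 0.05),
    'win_step': (0.005, 0.02),
    'frame_length': (1, 5),
    'frame_skip': (1, 3),
    'top_db': (20, 60),
}

optimizer = BayesianOptimization(f=run_instance, pbounds=pbounds, random_state=1)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)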
Example #6
def vectorize_data(arg):
    row, raw_data, x_path, y_path, n_rows = arg
    extractor = FeatureExtractor()
    dim = FeatureExtractor.dim
    raw_features = json.loads(raw_data)
    y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=n_rows)
    y[row] = raw_features["label"]
    feature_vector = extractor.process_raw_features(raw_features)
    x = np.memmap(x_path, dtype=np.float32, mode="r+", shape=(n_rows, dim))
    x[row] = feature_vector
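
vectorize_data opens both memmaps in "r+" mode, so the backing files must already exist with the right shape. A minimal driver sketch, assuming the raw features sit in JSON-lines files and a multiprocessing pool is acceptable (none of this is shown in the original):

import multiprocessing

import numpy as np

def vectorize_subset(x_path, y_path, raw_feature_paths, n_rows):
    # Pre-allocate the memmap files so every worker can open them in "r+" mode.
    dim = FeatureExtractor.dim
    np.memmap(x_path, dtype=np.float32, mode="w+", shape=(n_rows, dim))
    np.memmap(y_path, dtype=np.float32, mode="w+", shape=n_rows)

    def tasks():
        # One (row, raw_json, ...) tuple per sample, in file order.
        row = 0
        for path in raw_feature_paths:
            with open(path) as f:
                for line in f:
                    yield row, line, x_path, y_path, n_rows
                    row += 1

    with multiprocessing.Pool() as pool:
        pool.map(vectorize_data, tasks())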
Example #7
    def make_data(self, external_collector, run_numbers):
        self.make_meta(external_collector)

        root2py = Root2Py(self, external_collector, self.njobs)
        root2py.process_runs(run_numbers)

        feature_extractor = FeatureExtractor(self, self.njobs)
        feature_extractor.make_features(run_numbers)

        train_data_maker = TrainDataMaker(self)
        train_data_maker.make_train_data(run_numbers, feature_extractor)
Example #8
def extraction_process_(paths, n_frames, n_blocks):
    """
    A single process of feature extraction.
    """

    extractor = FeatureExtractor(n_frames, n_blocks) 

    path_feature_map = {}
    for i, filepath in enumerate(paths):
        feature_vector = extractor.extract(filepath)
        path_feature_map[filepath] = feature_vector
    return path_feature_map
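
The trailing underscore suggests this is the worker half of a parallel extraction job. A minimal dispatcher sketch, assuming the standard concurrent.futures module and an interleaved split of the paths (the actual driver is not shown here):

from concurrent.futures import ProcessPoolExecutor

def extract_features_parallel(paths, n_frames, n_blocks, n_workers=4):
    # Give each worker process an interleaved slice of the file paths.
    chunks = [paths[i::n_workers] for i in range(n_workers)]

    path_feature_map = {}
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(extraction_process_, chunk, n_frames, n_blocks)
                   for chunk in chunks]
        # Merge the per-process maps back into a single dictionary.
        for future in futures:
            path_feature_map.update(future.result())
    return path_feature_map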
Example #9
  def __init__(self, tdm):
    self.tdm = tdm

    # Local (edges)
    self.local_fe = FeatureExtractor(get_all_feature_classes(LocalFeatureClass))


    # Local Node (nodes)
    self.node_fe = FeatureExtractor(get_all_feature_classes(NodeFeatureClass))

    # Local RHS
    rhs_fc  = [fc for fc in get_all_feature_classes(LocalFeatureClass) if fc.feature_side & LocalContext.TARGET ]
    self.rhs_fe = FeatureExtractor(rhs_fc)
Example #10
 def __init__(self, points_on_normal=6, search_points_on_normal=5):
     super().__init__()
     self.cov_mat_pyr_inv = list()
     self.mean_vec_pyr = list()
     self.pca_shape_pyr = list()
     self.eigenvectors_pyr = list()
     self.eigenvalues_pyr = list()
     self.sigma2_pyr = list()
     self.points_on_normal = points_on_normal
     self.search_points_on_normal = search_points_on_normal
     self.feature_extractor = FeatureExtractor(self.pyramid_level,
                                               self.points_on_normal,
                                               self.search_points_on_normal)
     self.params_limits = None
Example #11
 def __init__(self, name, regularization_C, percentile=None):
     name += self.get_relscorer_suffix()
     MLModel.__init__(self, name, None)
     # Note: The model is lazily loaded when needed.
     self.model = None
     self.regularization_C = regularization_C
     self.top_percentile = percentile
     self.label_encoder = None
     self.dict_vec = None
     self.scaler = None
     # The index of the correct label.
     self.correct_index = -1
     self.feature_extractor = FeatureExtractor(False,
                                               True,
                                               entity_features=False)
Example #12
 def __init__(self, name, rel_score_model):
     name += self.get_pruner_suffix()
     MLModel.__init__(self, name, None)
     # Note: The model is lazily loaded when needed.
     self.model = None
     self.label_encoder = None
     self.dict_vec = None
     self.scaler = None
     # The index of the correct label.
     self.correct_index = -1
     self.feature_extractor = FeatureExtractor(
         True,
         False,
         relation_score_model=rel_score_model,
         entity_features=True)
Example #13
def extract_all_to_csv(cut_res, obj, outfile, cut_requirement=-1):
    FExtractor = FeatureExtractor()

    #Track progress
    #total = float(len(list(cut_res.keys())))
    #counter = 0.0

    data = []
    for snid, info in cut_res.iteritems():
        
        #output progress
        #counter += 1
        #progress = counter / total * 100.0
        #sys.stdout.write('\rProgress:  %.2f %%' %progress)
        #sys.stdout.flush()

        #extract features if cut_requirement is met
        if info['cut'] == cut_requirement:
            flts = np.unique(info['lightcurve']['FLT'].values)
            data_dict = extract(info['lightcurve'], flts, FExtractor)
            data_dict['SNID'] = snid
            data_dict['OBJ'] = obj
            data.append(data_dict)

    df = pd.DataFrame(data)
    #drop rows where every value is NaN (these were events with no good observations)
    df = df.dropna(how='all')

    #replace NaNs with 'N' to be consistent
    df = df.fillna('N')

    df.to_csv(outfile, index=False)
    return
Example #14
def main(args):
    logging.basicConfig(level=logging.DEBUG)
    if args.splits is None:
        print("At least one dataset split must be specified using --split")
        import sys
        sys.exit(1)
    cfg = RGBConfig(**load_jsonnet(args.cfg))
    model: AggregatedBackboneModel = cfg.get_model()
    backbone = model.backbone.eval()

    device = torch.device("cuda")
    backbone = torch.nn.DataParallel(backbone).to(device)

    datasets = get_datasets(cfg, args.splits)
    dataloaders = {
        name: DataLoader(
            dataset,
            # sadly since we are dealing with tensors of variable size we have to set
            # batch size to 1 unless we wish to deal with packing and unpacking which
            # is a massive pain.
            batch_size=1,
            pin_memory=True,
            shuffle=False,
            num_workers=args.n_workers,
        )
        for name, dataset in datasets.items()
    }

    feature_extractor = FeatureExtractor(
        backbone_2d=backbone, device=device, frame_batch_size=args.batch_size
    )
    total_instances = extract_features_to_hdf(
        dataloaders, feature_extractor, args.features_hdf, cfg.model.backbone_dim
    )
    print(f"Extracted {total_instances} features.")
Example #15
class FeatureAdder(object):
  def __init__(self, tdm):
    self.tdm = tdm

    # Local (edges)
    self.local_fe = FeatureExtractor(get_all_feature_classes(LocalFeatureClass))


    # Local Node (nodes)
    self.node_fe = FeatureExtractor(get_all_feature_classes(NodeFeatureClass))

    # Local RHS
    rhs_fc  = [fc for fc in get_all_feature_classes(LocalFeatureClass) if fc.feature_side & LocalContext.TARGET ]
    self.rhs_fe = FeatureExtractor(rhs_fc)


  def add_features(self, tforest, just_list = False):
    allfeats = set()
    for node in tforest.nodes.values():
      node_local_context = LocalNodeContext(node, tforest.sent)
      node_features = self.node_fe.extract_all(node_local_context)

      if just_list:
        allfeats |= set([f.split('=')[0] for f in node_features])        
      else:
        #node.fvector = " ".join([f for f in node_features])
        node.fvector = Vector(" ".join([f for f in node_features]))
            
      for edge in node.edges:
        local_context = LocalContext(node, edge, edge.rule, tforest.sent)
        features = self.local_fe.extract_all(local_context)

        #local_context.set_cluster_level(self.tdm, 4)
        #features.extend(self.rhs_fe.extract_all(local_context))
        
        #local_context.set_cluster_level(self.tdm, 6)
        #features.extend(self.rhs_fe.extract_all(local_context))

        if just_list:
          allfeats |= set([f.split('=')[0] for f in features])
        else:
          # hack, add in features
          # edge.fvector = " ".join([f for f in features])
          edge.fvector = Vector(" ".join([f for f in features]))
          edge.rule.fields = Vector(" ".join(features))
    return allfeats  
Example #16
def process_batch_frame(idx: int, file_paths: list, output_path: str, extractor: ft.FeatureExtractor):
    path = file_paths[idx] if idx < len(file_paths) else ''
    frame = cv2.imread(join(output_path, SUBFOLDER_FRAMES, path))
    frame = preprocess_frames(frame, output_path, idx)
    frame = np.random.randint(0, 256, (850, 850, 3), dtype=np.uint8) if frame is None or frame.size == 0 else frame
    feature_vec = extractor.extract_single_feature_vector(frame, extractor.haralick_dist, extractor.hist_size, extractor.clip_limit)

    return feature_vec
Example #17
def eval_options(board, depth, max_depth=1, successor_number=0):
    if depth == 1:
        print(f"{successor_number}")
    #Given the board, board
    if depth == max_depth:
        return 0, 4

    # With a depth of 2:
    #   - Maxes at 512 with decay >= 0.8
    #   - Pretty consistent min of 1024 for decay <= 0.7
    #   - Sometimes reaches 2048 with 0.7
    decay = 0.7

    scores = []
    actions = []
    successors = []
    seen = []
    #Left = 0, Down = 1, Right = 2, Up = 3
    #for each possible action:
    for i in range(4):
        successors = board_generator(board.copy(), i)
        counter = 0
        num_samples = min(len(successors), 15)
        random_sample = random.sample(range(len(successors)), num_samples)
        if depth == 0:
            print(
                f"Depth: {depth}, Direction: {i}, Number of successors: {len(successors)}, Number of samples: {num_samples}"
            )
        for index in random_sample:
            new_board = successors[index]
            if in_seen(seen, new_board):
                continue
            seen.append(new_board)
            #extract features of result
            f = FeatureExtractor(new_board)
            #get score
            score = calculate_score(f.getfeatures())
            scores.append(score +
                          decay * eval_options(new_board, depth +
                                               1, max_depth, counter)[0])
            counter += 1
            actions.append(i)
    if not scores:
        return 0, 4
    max_score = max(scores)
    return statistics.mean(scores), actions[scores.index(max_score)]
Example #18
    def __init__(self, image_names=[], f=800, mode='Spherical'):
        self.__images = []
        self.__image_masks = []
        self.__features = []
        self.__image_pairs = []
        self.__start = None
        self.extractor = FeatureExtractor()
        self.matcher = FeatureMatcher()

        for image_name in image_names:
            img = cv2.imread(image_name)

            if mode=='Cylindrical':
                self.__transform_method = 'affine'
                #### convert rectangular to cylindrical
                h,w = img.shape[:2]
                start=timer()
                K = np.array([[f, 0, w/2], [0, f, h/2], [0, 0, 1]]) # mock calibration matrix
                cylindrical_img, cylindrical_mask = cylindricalWarpImage(img, K)
                end=timer()
                print("convert time", end-start)
                print("converting", image_name, "to cylindrical ... ")
                # print("cylindrical mask dtype:", cylindrical_mask.dtype)
                # print("cylindrical mask shape:", cylindrical_mask.shape)

                self.__images.append(cylindrical_img)
                self.__image_masks.append(cylindrical_mask)

            if mode=='Spherical':
                self.__transform_method = 'affine'
                #### convert rectangular to spherical
                # f = 3000
                start=timer()
                spherical_img, spherical_mask = warpSpherical(img, f)
                end=timer()
                print("convert time", end-start)
                print("converting", image_name, "to spherical ... ")

                self.__images.append(spherical_img)
                self.__image_masks.append(spherical_mask)

            if mode=='Flat':
                self.__transform_method = 'homography'
                #### flat
                self.__images.append(img)
                self.__image_masks.append(np.ones(img.shape[:2], dtype=np.uint8)*255)
Example #19
def extract_features(
        file_path: str,
        save_dir: str,
        selector: KeyframeSelector,
        extractor: FeatureExtractor,
        force=False
):
    """
    Extracts features for the video and saves them in the given dir.

    :param file_path: video path.
    :param save_dir: directory to save the features.
    :param selector: keyframe selector used to pick the frames.
    :param extractor: feature extractor applied to the selected keyframes.
    :param force: when True, calculates features even if it was done previously.
    """

    video_name = re.split('[/.]', file_path)[-2]
    save_path_feats = f'{save_dir}/{video_name}-feats.npy'
    save_path_tags = f'{save_dir}/{video_name}-tags.npy'

    # skip already processed videos
    if not force and os.path.isfile(save_path_feats) and os.path.isfile(save_path_tags):
        print(f'Skipping video {video_name}')
        return

    print(f'Extracting features from video {video_name}')

    # obtain keyframes
    t0 = time.time()
    keyframes, timestamps, total_frames = selector.select_keyframes(file_path)

    selection = time.time() - t0
    print(f'selected {len(keyframes)} of {total_frames} frames in {selection:.1f} secs')

    # log selection time
    log_persistent(f'{len(timestamps)}\t{selection:.2f}\n', f'{save_dir}/selection_log.txt')

    # measure time
    t0 = time.time()

    # extract features and save
    features = extractor.extract_features(keyframes)
    np.save(save_path_feats, features)

    # generate tags and save
    tags = np.empty(timestamps.shape[0], dtype='<U30')
    for i in range(timestamps.shape[0]):
        tags[i] = f'{video_name} # {timestamps[i]:.2f} # {i + 1}'
    np.save(save_path_tags, tags)

    extraction = time.time() - t0
    print(f'feature extraction for {len(timestamps)} frames took {extraction:.2f} seconds\n')

    # log extraction time
    log_persistent(f'{len(timestamps)}\t{extraction:.2f}\n', f'{save_dir}/extraction_log.txt')
    return
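
A driver for this function only needs to walk the video directory and call it once per file; skipping of already-processed videos is handled inside. A minimal sketch, assuming selector and extractor are already constructed and the videos are .mp4 files in video_dir (a hypothetical path):

import os

def extract_all_videos(video_dir, save_dir, selector, extractor, force=False):
    # Process every .mp4 in the directory; extract_features skips finished videos itself.
    for name in sorted(os.listdir(video_dir)):
        if name.endswith('.mp4'):
            extract_features(
                file_path=f'{video_dir}/{name}',
                save_dir=save_dir,
                selector=selector,
                extractor=extractor,
                force=force)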
Example #20
    def __init__(self, src=None, _dtype="video"):
        if (src == None):
            raise InputError("No source to video/image seq found!")
        else:
            self.loader = Loader(src, _dtype)
            self.loader._load_media_instance()
        self.scale = 1.0

        self.feature_extractor = FeatureExtractor()
        # load the first frame
        self.prev_frame = self.loader.load_frame()
        # given in kitti dataset / sequence under consideration
        self.focal = 718.8560
        self.camera_coords = (607.1928, 185.2157)

        # This is somewhat problematic because we do not know the initial pose
        # Also it cannot be 0 since it's a matrix and a vector respectively
        self.R_pos = 0
        self.t_pos = 0
Example #21
def process_video_frame(output_path, img_path: str,
                        extractor: ft.FeatureExtractor):
    frame = cv2.imread(join(output_path, SUBFOLDER_FRAMES, img_path))
    frame = preprocess_frames(frame, output_path, img_path)
    frame = np.zeros(
        (850, 850, 3), dtype=np.uint8) if frame is None or frame.size == 0 or (
            type(frame) != np.ndarray and type(frame) != np.memmap) else frame
    feature_vec = extractor.extract_single_feature_vector(
        frame, extractor.haralick_dist, extractor.hist_size,
        extractor.clip_limit)
    return feature_vec
Example #22
 def _single_img_features(self,
                          img,
                          cspace='RGB',
                          orient=9,
                          pix_per_cell=8,
                          cell_per_block=2):
     return FeatureExtractor.extract_features_for_img(
         img=img,
         cspace=cspace,
         orient=orient,
         pix_per_cell=pix_per_cell,
         cell_per_block=cell_per_block)
Example #23
def board_generator(board, direction):
    basic_board = move(board.copy(), direction)
    if np.array_equal(board, basic_board):
        return set()
    f = FeatureExtractor(basic_board)
    try:
        if f.getfeatures()["empty_tiles"] == 0:
            return [basic_board]
    except IndexError:
        print(":(")

    successors = []
    for r in range(4):
        for c in range(4):
            if basic_board[r][c] == 0:
                basic_board[r][c] = 2
                successors.append(basic_board.copy())
                basic_board[r][c] = 4
                successors.append(basic_board.copy())
                basic_board[r][c] = 0
    return successors
Example #24
def group_features(selector: KeyframeSelector,
                   extractor: FeatureExtractor,
                   force: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    """
    Groups all the features and tags in a directory and saves them in a file each.

    :param selector: keyframe selector whose saved features are being grouped.
    :param extractor: feature extractor whose saved features are being grouped.
    :param force: when True, groups features even if it was done previously.
    """
    # full path to the features directory
    feats_dir = get_features_dir(selector=selector, extractor=extractor)

    # reload files if grouping was already done
    if os.path.isfile(f'{feats_dir}/{FEATURES_FILE}.npy') \
            and os.path.isfile(f'{feats_dir}/{TAGS_FILE}.npy') \
            and not force:
        print(f'Grouping already done for {feats_dir}')
        all_features = np.load(f'{feats_dir}/{FEATURES_FILE}.npy')
        all_tags = np.load(f'{feats_dir}/{TAGS_FILE}.npy')

        return all_tags, all_features

    # obtain all videos
    videos = os.listdir(get_videos_dir())

    all_tags = np.empty(0, dtype=np.str)
    all_features = np.empty((0, extractor.descriptor_size()), dtype='int8')

    i = 0

    # reads all the features files and groups them in one
    for video in videos:
        if video.endswith('.mp4'):
            video_name = video.split('.')[0]
            tags, features = read_features(video_name, feats_dir)

            all_tags = np.concatenate((all_tags, tags))
            all_features = np.concatenate((all_features, features))

            i += 1
            print(
                f'{all_features.shape[0]:,d} feats read in {i} file{"s" if i > 1 else ""}'
            )

    assert all_features.shape[0] == all_tags.shape[
        0], 'features and tags length must match'

    # save files
    np.save(f'{feats_dir}/{FEATURES_FILE}.npy', all_features)
    np.save(f'{feats_dir}/{TAGS_FILE}.npy', all_tags)

    return all_tags, all_features
Example #25
class MonoVO:
    def __init__(self, src=None, _dtype="video"):
        if (src == None):
            raise InputError("No source to video/image seq found!")
        else:
            self.loader = Loader(src, _dtype)
            self.loader._load_media_instance()
        self.scale = 1.0

        self.feature_extractor = FeatureExtractor()
        # load the first frame
        self.prev_frame = self.loader.load_frame()
        # given in kitti dataset / sequence under consideration
        self.focal = 718.8560
        self.camera_coords = (607.1928, 185.2157)

        # This is somewhat problematic because we do not know the initial pose
        # Also it cannot be 0 since it's a matrix and a vector respectively
        self.R_pos = 0
        self.t_pos = 0

    def _getFeatures(self):
        # keep a tracker to the previous frame and next frame
        self.curr_frame = self.loader.load_frame()
        prev_frame_kps, prev_frame_des = self.feature_extractor.find_keypoints(
            self.prev_frame)
        curr_frame_kps, curr_frame_des = self.feature_extractor.find_keypoints(
            self.curr_frame)
        R,t = self.feature_extractor.match_points_and_find_E(prev_frame_des, \
                                                                            curr_frame_des, self.focal, \
                                                                            self.camera_coords)
        self.construct_trajectory(R, t)

    def construct_trajectory(self, R, t):
        # the following equations come from the homogeneous coordinate system
        self.R_pos = np.dot(R, self.R_pos)
        self.t_pos = self.t_pos + self.R_pos


#TODO: write a unit test
Example #26
def get_extracted_features(feature_type, train, test):
    """
    Extracts specified features from training and testing set

    parameters
    ----------

    feature_type: array of feature strings (valid values: word, wordcont, char)
    train: the training set
    test : the testing set

    Returns:
    -------
        tuple: train, test
            features extracted from training and testing

    """

    fx = FeatureExtractor()
    meta_train = fx.get_features(train, feature_type)
    meta_test = fx.get_features(test, feature_type)
    return meta_train, meta_test
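
A hypothetical call, assuming train and test are the text collections this project's FeatureExtractor expects and that word plus character features are wanted:

# 'word' and 'char' are among the valid feature_type values listed in the docstring.
meta_train, meta_test = get_extracted_features(['word', 'char'], train, test)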
Example #27
 def __init__(self,
              name,
              train_dataset,
              top_ngram_percentile=5,
              rel_regularization_C=None,
              **kwargs):
     MLModel.__init__(self, name, train_dataset)
     Ranker.__init__(self, name, **kwargs)
     # Note: The model is lazily loaded when score is called.
     self.model = None
     self.label_encoder = None
     self.dict_vec = None
     # The index of the correct label.
     self.correct_index = -1
     self.cmp_cache = dict()
     self.relation_scorer = None
     self.pruner = None
     self.scaler = None
     self.kwargs = kwargs
     self.top_ngram_percentile = top_ngram_percentile
     self.rel_regularization_C = rel_regularization_C
     # Only extract ngram features.
     self.feature_extractor = FeatureExtractor(True, False, None)
Example #28
def visualize_tags(data_path, classes):
    sents, labels, ids = load_data(data_path)

    feats = FeatureExtractor(bow=False,
                             negation=False,
                             emoji=False,
                             senti_words=False,
                             emoticon=False,
                             postag=True,
                             verbose=False)
    feats.make_bow(sents)
    tags = feats.get_representation(sents)

    df = pd.DataFrame(tags, index=ids, columns=['N', 'ADV', 'ADJ', 'V'])
    df['label'] = [classes[l] for l in labels]

    counts = df.groupby('label').sum()
    counts = counts.div(counts.sum(axis=1), axis=0)
    counts *= 100
    counts.plot.bar(rot=0)
    plt.xlabel('Class')
    plt.ylabel('Frequency of PoS tag (%)')
    plt.show()
Example #29
 def __init__(self,
              name,
              rel_score_model):
     name += self.get_pruner_suffix()
     MLModel.__init__(self, name, None)
     # Note: The model is lazily loaded when needed.
     self.model = None
     self.label_encoder = None
     self.dict_vec = None
     self.scaler = None
     # The index of the correct label.
     self.correct_index = -1
     self.feature_extractor = FeatureExtractor(True,
                                               False,
                                               relation_score_model=rel_score_model,
                                               entity_features=True)
Example #30
def extract_features_to_hdf(
    dataloaders: Dict[str, DataLoader],
    feature_extractor: FeatureExtractor,
    features_path: Path,
    feature_dim: int,
):
    total_instances = 0
    with h5py.File(features_path, mode="w", swmr=True, libver="latest") as root_group:
        for dataset_name, dataloader in dataloaders.items():
            n_examples = len(dataloader.dataset)
            feature_writer = HdfFeatureWriter(
                root_group.create_group(dataset_name),
                n_examples,
                feature_dim,
            )
            total_instances += feature_extractor.extract(dataloader, feature_writer)
    return total_instances
Example #31
 def __init__(self,
              name,
              regularization_C,
              percentile=None):
     name += self.get_relscorer_suffix()
     MLModel.__init__(self, name, None)
     # Note: The model is lazily loaded when needed.
     self.model = None
     self.regularization_C = regularization_C
     self.top_percentile = percentile
     self.label_encoder = None
     self.dict_vec = None
     self.scaler = None
     # The index of the correct label.
     self.correct_index = -1
     self.feature_extractor = FeatureExtractor(False,
                                               True,
                                               entity_features=False)
Example #32
def call_fold(arg_testfold, arg_numfolds, arg_foldoutput,
              arg_json, arg_nlp, arg_templates, arg_parameters):
    examples = LabeledExample.read(arg_json)
    indices = [e.index for e in examples.itervalues()][:5]  # TODO just 5 for testing
    natural_language = {i: NLP.read(arg_nlp, i) for i in indices}
    word_problems = [WordProblem(examples[i], natural_language[i])
                     for i in indices]

    fold_indices = make_fold_indices(arg_numfolds, len(word_problems))
    test_indices = fold_indices.pop(arg_testfold)
    train_indices = list()
    for per_fold in fold_indices:
        train_indices.extend(per_fold)

    with open(arg_templates, 'rt') as f_handle:
        raw = f_handle.read()

    parsed = json.loads(raw)
    unique_templates = [Template.from_json(j) for j in parsed['templates']]
    wp_template_map = {int(k): v
                       for k, v in parsed['wp_template_map'].iteritems()}

    train_wps = [word_problems[i] for i in train_indices]
    train_templates_indices = list({wp_template_map[wp.labeled_example.index]
                                    for wp in train_wps})
    remap_templates = {wp.labeled_example.index:
                       train_templates_indices.index(
                           wp_template_map[wp.labeled_example.index])
                       for wp in train_wps}
    train_templates = [unique_templates[i] for i in train_templates_indices]

    feature_extractor = FeatureExtractor(train_templates, train_wps)
    classifier = optimize_parameters(feature_extractor, train_wps,
                                     train_templates, remap_templates)
    with open(arg_parameters, 'wt') as f_handle:
        f_handle.write(json.dumps(classifier.to_json()))

    correct = 0
    for test_i in test_indices:
        test_wp = word_problems[test_i]
        correct += classifier.solve(test_wp)
    print('{} correct out of {}'.format(correct, len(test_indices)))
Example #33
def split_song(original_file_name, folder_path, sz_limit):
    """
    Load a song, split in halves and export as independent files.
    :param original_file_name: string
    :param folder_path: pathlib.Path
    :param sz_limit: int, sample-count threshold above which the song is split
    :return: list of the exported file names
    """
    wav, sr = librosa.load(str(folder_path / original_file_name), sr=None)
    ext = original_file_name.split('.')[-1]

    # wav is np.ndarray [shape=(n,) or (2, n)]
    if len(wav.shape) == 1:
        # if is mono, add dummy dim
        wav = np.expand_dims(wav, axis=0)

    size = wav.shape[1]
    ctr = 0
    while size >= sz_limit:
        # count how many binary (halving) splits are needed to get parts below the size limit
        size = size // 2
        ctr += 1

    if ctr == 0:
        return [original_file_name]

    parts = 2 * ctr
    part_size = wav.shape[1] // parts
    file_names = []

    for part_idx in range(parts):
        wav_part = wav[:, part_idx * part_size:(part_idx + 1) * part_size]
        n_file_name = original_file_name.replace(ext,
                                                 '{}.{}'.format(part_idx, ext))
        n_file_name = FeatureExtractor.save_mp3(wav_part.swapaxes(0, 1), sr,
                                                None, folder_path, None, None,
                                                None, n_file_name)
        file_names.append(n_file_name)

    return file_names
Example #34
 def run(self):
     self.reset()
     game = Easy21()
     S = game.state()
     A = self.epsilon_greedy_action(S)
     while not game.isTerminal():
         Aprime = None
         game, R = game.step(A)
         Sprime = game.state()
         # Initialize Q to zero for "all" states
         # Our lookup table is only for interesting states
         # So we hack around by putting Q = 0
         if game.isTerminal():
             Q = 0
         else:
             Aprime = self.epsilon_greedy_action(Sprime)
             Q = self.Q(Sprime, Aprime)
         """ This is our TD error """
         delta = R + Sarsa2.GAMMA * Q - self.Q(S, A)
         features = FeatureExtractor(S, A).features()
         self.update(delta, features)
         S = Sprime
         A = Aprime
Example #35
 def __init__(self, name,
              train_dataset,
              top_ngram_percentile=5,
              rel_regularization_C=None,
              **kwargs):
     MLModel.__init__(self, name, train_dataset)
     Ranker.__init__(self, name, **kwargs)
     # Note: The model is lazily loaded when score is called.
     self.model = None
     self.label_encoder = None
     self.dict_vec = None
     # The index of the correct label.
     self.correct_index = -1
     self.cmp_cache = dict()
     self.relation_scorer = None
     self.pruner = None
     self.scaler = None
     self.kwargs = kwargs
     self.top_ngram_percentile = top_ngram_percentile
     self.rel_regularization_C = rel_regularization_C
     # Only extract ngram features.
     self.feature_extractor = FeatureExtractor(True,
                                               False,
                                               None)
Example #36
send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
send_socket.connect(("none.cs.umass.edu", 9999))

# Load the classifier:

output_dir = 'training_output'
classifier_filename = 'classifier.pickle'

with open(os.path.join(output_dir, classifier_filename), 'rb') as f:
    classifier = pickle.load(f)

if classifier == None:
    print("Classifier is null; make sure you have trained it!")
    sys.exit()

feature_extractor = FeatureExtractor(debug=False)


def onSpeakerDetected(speaker):
    """
    Notifies the client of the current speaker
    """
    print("Speaker is {}.".format(speaker))
    sys.stdout.flush()
    send_socket.send(
        json.dumps({
            'user_id': user_id,
            'sensor_type': 'SENSOR_SERVER_MESSAGE',
            'message': 'SPEAKER_DETECTED',
            'data': {
                'speaker': speaker
Example #37
            bit_representation = {'g': 1, 'r': 2, 'i': 4, 'z': 8}
            band_bits = bit_representation[band_set[0]] + bit_representation[band_set[1]]
            band_bit_dict = {3: 'gr', 6: 'ri', 12: 'iz', 5: 'gi', 9: 'gz', 10: 'rz'}
            pairs.append([band_bit_dict[band_bits]])
    lc_pairs = pairs[np.argmax(np.array([len(x) for x in pairs]))]
else:
    lc_pairs = []


### Summary
# Available filter pairs are listed in lc_pairs
# Available filters are listed in lc_bands
# Available number of observations is lc_nobs

#based on lc_pairs, lc_bands, and lc_nobs, find the meaningful features
FExtractor = FeatureExtractor()

#['nobs_brighter_than', 'slope', 'same_nite_color_diff', 'total_color_diff', 'snr', 'flat', 'half', 'mag']
good_families = FExtractor.families
if len(lc_pairs) == 0:
    good_families.remove('same_nite_color_diff')
if len(lc_bands) < 2:
    good_families.remove('total_color_diff')
if lc_nobs < 6:
    good_families.remove('half')
if lc_nobs < 3:
    good_families.remove('flat')
if lc_nobs < 2:
    good_families.remove('slope')

useable_features = []
Example #38
def create_fingerprint_from_capture_data(name, capture_data):
    assert isinstance(capture_data, KeystrokeCaptureData)
    fe = FeatureExtractor()
    capture_data.feed(fe)
    features = fe.extract_features()
    return Fingerprint.from_features(name, features)
Example #39
    x, y = circle_perimeter(pupil[0], pupil[1], pupil[2])
    rgb[x,y] = 255
    #ex, ey = ellipse.center
    #major, minor = ellipse.axes
    #orientation = ellipse.orientation
    #imshow(rgb)
    #x, y = ellipse_perimeter(int(ex), int(ey), int(major), int(minor), orientation)
    #rgb[x,y] = (220, 40, 40)
    #imshow(rgb)
    return e, image

def get_rect(path):
    e, img = detect(path)
    img = cv2.linearPolar(img.T, (e.center[0], e.center[1]), 80, cv2.WARP_FILL_OUTLIERS).T
    imshow(img)
    img = img[0:23, :]
    return img

def feature(path, f):
    img = get_rect(path)
    #imshow(img)
    feature = f.extract(img)
    return feature

if __name__ == "__main__":
    d = Dataset('./data', suffix='.jpg')
    img = get_rect(d.images[1])
    imshow(img)
    f = FeatureExtractor(set(['daisy', 'hog']))
    feature = f.extract(img)
Example #40
class AccuModel(MLModel, Ranker):
    """Performs a pair-wise transform to learn a ranking.

     It always compares two candidates and makes a classification decision
     using a random forest to decide which one should be ranked higher.
    """

    def score(self, candidate):
        pass

    def __init__(self, name,
                 train_dataset,
                 top_ngram_percentile=5,
                 rel_regularization_C=None,
                 **kwargs):
        MLModel.__init__(self, name, train_dataset)
        Ranker.__init__(self, name, **kwargs)
        # Note: The model is lazily loaded when score is called.
        self.model = None
        self.label_encoder = None
        self.dict_vec = None
        # The index of the correct label.
        self.correct_index = -1
        self.cmp_cache = dict()
        self.relation_scorer = None
        self.pruner = None
        self.scaler = None
        self.kwargs = kwargs
        self.top_ngram_percentile = top_ngram_percentile
        self.rel_regularization_C = rel_regularization_C
        # Only extract ngram features.
        self.feature_extractor = FeatureExtractor(True,
                                                  False,
                                                  None)

    def load_model(self):
        model_file = self.get_model_filename()
        try:

            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.scaler = scaler
            relation_scorer = RelationNgramScorer(self.get_model_name(),
                                                  self.rel_regularization_C,
                                                  percentile=self.top_ngram_percentile)
            relation_scorer.load_model()
            self.feature_extractor.relation_score_model = relation_scorer
            pruner = CandidatePruner(self.get_model_name(),
                                     relation_scorer)
            pruner.load_model()
            self.pruner = pruner
            self.dict_vec = dict_vec
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded." % model_file)
            raise

    def learn_rel_score_model(self, queries):
        rel_model = RelationNgramScorer(self.get_model_name(),
                                        self.rel_regularization_C,
                                        percentile=self.top_ngram_percentile)
        rel_model.learn_model(queries)
        return rel_model

    def learn_prune_model(self, labels, features):
        prune_model = CandidatePruner(self.get_model_name(),
                                      self.relation_scorer)
        prune_model.learn_model(labels, features)
        return prune_model

    def learn_model(self, train_queries, n_folds=6):
        # split the training queries into folds
        # for each fold extract n-gram features (and select best ones) # myahya: where does this selection happen?
        # also extract regular features
        # learn the relation classifier and score the "test" fold
        # add the score as feature in the test-fold
        # collect all test-folds
        # train the treepair classifier on the collected test-folds
        # train the relation classifier on the all relation-features

        kf = KFold(len(train_queries), n_folds=n_folds, shuffle=True,
                   random_state=999)
        num_fold = 1
        pair_features = [] # myahya: used for learn_ranking_model
        pair_labels = []   # myahya: used for learn_ranking_model
        features = []      # myahya: used for learn_prune_model
        labels = []        # myahya: used for learn_prune_model
        # myahya: So a bunch of relation scoring models are created and applied, one for each split. 
        
        for train, test in kf:
            logger.info("Training relation score model on fold %s/%s" % (
                num_fold, n_folds))
            test_fold = [train_queries[i] for i in test]
            train_fold = [train_queries[i] for i in train]
            # myahya: train of training fold
            rel_model = self.learn_rel_score_model(train_fold)
            self.feature_extractor.relation_score_model = rel_model # myahya: feature extractor uses relation score as a feature
            logger.info("Applying relation score model.")
            # myahya: extract PAIR features/labels for test fold
            testfoldpair_features, testfoldpair_labels = construct_pair_examples(
                test_fold,
                self.feature_extractor)
            # myahya: extract UNI features/labels for test fold
            testfold_features, testfold_labels = construct_examples(
                test_fold,
                self.feature_extractor)
            # myahya: add features from test fold (which features are these exactly?)
            features.extend(testfold_features)
            # myahya: add test fold labels UNI
            labels.extend(testfold_labels)
            # myahya: add pair features from fold
            pair_features.extend(testfoldpair_features)
            # myahya: add test fold labels PAIR
            pair_labels.extend(testfoldpair_labels)
            num_fold += 1
            logger.info("Done collecting features for fold.")
        logger.info("Training final relation scorer.")
        rel_model = self.learn_rel_score_model(train_queries)
        self.feature_extractor.relation_score_model = rel_model
        self.relation_scorer = rel_model
        self.pruner = self.learn_prune_model(labels, features)
        self.learn_ranking_model(pair_features, pair_labels)

    def learn_ranking_model(self, features, labels):
        logger.info("Training tree classifier for ranking.")
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        labels = label_encoder.fit_transform(labels)
        vec = DictVectorizer(sparse=False)
        X = vec.fit_transform(features)
        X, labels = utils.shuffle(X, labels, random_state=999)
        decision_tree = RandomForestClassifier(class_weight='auto',
                                               random_state=999,
                                               n_jobs=6,
                                               n_estimators=90)
        logger.info("Training random forest...")
        decision_tree.fit(X, labels)
        logger.info("Done.")
        self.model = decision_tree
        self.dict_vec = vec
        self.label_encoder = label_encoder
        self.correct_index = label_encoder.transform([1])[0]

    def store_model(self):
        logger.info("Writing model to %s." % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler],
                    self.get_model_filename())
        self.relation_scorer.store_model()
        self.pruner.store_model()
        logger.info("Done.")

    def compare_pair(self, x_candidate, y_candidate):
        """Compare two candidates.

        Return 1 if x_candidate > y_candidate, else return -1.
        :param x_candidate:
        :param y_candidate:
        :return:
        """
        if not self.model:
            self.load_model()
        # Use the preferences for sorting.
        else:
            res = None
            if (x_candidate, y_candidate) in self.cmp_cache:
                res = self.cmp_cache[(x_candidate, y_candidate)]
            if res is None:
                x_features = self.feature_extractor.extract_features(
                    x_candidate)
                y_features = self.feature_extractor.extract_features(
                    y_candidate)
                diff = feature_diff(x_features, y_features)
                X = self.dict_vec.transform(diff)
                if self.scaler:
                    X = self.scaler.transform(X)
                self.model.n_jobs = 1
                p = self.model.predict(X)
                c = self.label_encoder.inverse_transform(p)
                res = c[0]
            if res == 1:
                return 1
            else:
                return -1

    def _precompute_cmp(self, candidates, max_cache_candidates=300):
        """Pre-compute comparisons.

        The main overhead is calling the classification routine. Therefore,
        pre-computing all O(n^2) comparisons (which can be done with a single
        classification call) is actually faster up to a limit.

        :param candidates:
        :param max_cache_candidates:
        :return:
        """
        if not self.model:
            self.load_model()
        self.cmp_cache = dict()
        pairs = []
        pair_features = []
        features = []
        if len(candidates) > max_cache_candidates:
            logger.info("Cannot precoumpte for  all of %s candidates."
                        % len(candidates))
            return
        start = time.time()
        for c in candidates[:max_cache_candidates]:
            f = self.feature_extractor.extract_features(c)
            features.append(f)
        duration = (time.time() - start) * 1000
        logger.debug("FExtract took %s ms" % duration)
        start = time.time()
        for i, x in enumerate(candidates[:max_cache_candidates]):
            x_f = features[i]
            for j, y in enumerate(candidates[:max_cache_candidates]):
                if i == j:
                    continue
                y_f = features[j]
                diff = feature_diff(x_f, y_f)
                pair_features.append(diff)
                pairs.append((x, y))
        duration = (time.time() - start) * 1000
        logger.debug("FDiff for %s took %s ms" % (len(pairs), duration))
        if len(pairs) > 0:
            X = self.dict_vec.transform(pair_features)
            if self.scaler:
                X = self.scaler.transform(X)
            self.model.n_jobs = 1
            start = time.time()
            p = self.model.predict(X)
            duration = (time.time() - start) * 1000
            logger.debug("Predict for %s took %s ms" % (len(pairs), duration))
            self.model.n_jobs = 1
            c = self.label_encoder.inverse_transform(p)
            # Remember the #of wins/losses for each candidate.
            for (x, y), s in zip(pairs, c):
                self.cmp_cache[(x, y)] = s

    def rank_query_candidates(self, query_candidates, key=lambda x: x):
        """Rank query candidates by scoring and then sorting them.

        :param query_candidates:
        :return:
        """
        if not self.model:
            self.load_model()
        query_candidates = shuffle_candidates(query_candidates, key)
        num_candidates = len(query_candidates)
        logger.debug("Pruning %s candidates" % num_candidates)
        query_candidates = self.prune_candidates(query_candidates, key)
        logger.debug("%s of %s candidates remain" % (len(query_candidates),
                                                    num_candidates))
        start = time.time()
        candidates = [key(q) for q in query_candidates]
        self._precompute_cmp(candidates)
        ranked_candidates = sorted(query_candidates,
                                   cmp=self.compare_pair,
                                   key=key,
                                   reverse=True)
        self.cmp_cache = dict()
        if len(query_candidates) > 0:
            duration = (time.time() - start) * 1000
            logger.debug(
                "Sorting %s candidates took %s ms. %s ms per candidate" %
                (len(query_candidates), duration,
                 float(duration) / len(query_candidates)))
        return ranked_candidates

    def prune_candidates(self, query_candidates, key):
        remaining = []
        if len(query_candidates) > 0:
            remaining = self.pruner.prune_candidates(query_candidates, key)
        return remaining
Example #41
class CandidatePruner(MLModel):
    """Learns a recall-optimized pruning model."""

    def __init__(self,
                 name,
                 rel_score_model):
        name += self.get_pruner_suffix()
        MLModel.__init__(self, name, None)
        # Note: The model is lazily loaded when needed.
        self.model = None
        self.label_encoder = None
        self.dict_vec = None
        self.scaler = None
        # The index of the correct label.
        self.correct_index = -1
        self.feature_extractor = FeatureExtractor(True,
                                                  False,
                                                  relation_score_model=rel_score_model,
                                                  entity_features=True)

    def get_pruner_suffix(self):
        return "_Pruner"

    def print_model(self, n_top=30):
        dict_vec = self.dict_vec
        classifier = self.model
        logger.info("Printing top %s weights." % n_top)
        logger.info("intercept: %.4f" % classifier.intercept_[0])
        feature_weights = []
        for name, index in dict_vec.vocabulary_.iteritems():
            feature_weights.append((name, classifier.coef_[0][index]))
        feature_weights = sorted(feature_weights, key=lambda x: math.fabs(x[1]),
                                 reverse=True)
        for name, weight in feature_weights[:n_top]:
            logger.info("%s: %.4f" % (name, weight))

    def learn_model(self, labels, features):
        logger.info("Learning prune classifier.")
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        num_labels = float(len(labels))
        num_pos_labels = sum(labels)
        num_neg_labels = num_labels - num_pos_labels
        pos_class_weight = num_labels / num_pos_labels
        neg_class_weight = num_labels / num_neg_labels
        pos_class_boost = 2.0
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        vec = DictVectorizer(sparse=False)
        X = vec.fit_transform(features)
        labels = label_encoder.fit_transform(labels)
        self.label_encoder = label_encoder
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(X)
        X, labels = utils.shuffle(X, labels, random_state=999)
        class_weights = {1: pos_class_weight * pos_class_boost,
                         0: neg_class_weight}
        logger.info(class_weights)
        logreg_cv = LogisticRegressionCV(Cs=20,
                                         class_weight=class_weights,
                                         cv=6,
                                         solver='lbfgs',
                                         n_jobs=6,
                                         # max_iter=40,
                                         verbose=True)
        logreg_cv.fit(X, labels)
        self.model = logreg_cv
        pred = self.model.predict(X)
        logger.info("F-1 score on train: %.4f" % metrics.f1_score(labels, pred,
                                                                  pos_label=1))
        logger.info("Classification report:\n"
                    + classification_report(labels, pred))
        self.dict_vec = vec
        self.label_encoder = label_encoder
        self.print_model()
        logger.info("Done learning prune classifier.")

    def load_model(self):
        model_file = self.get_model_filename()
        try:
            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.dict_vec = dict_vec
            self.scaler = scaler
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded." % model_file)
            raise

    def store_model(self):
        logger.info("Writing model to %s." % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler], self.get_model_filename())
        logger.info("Done.")

    def prune_candidates(self, query_candidates, key):
        remaining = []
        candidates = [key(q) for q in query_candidates]
        features = []
        for c in candidates:
            c_features = self.feature_extractor.extract_features(c)
            features.append(c_features)
        X = self.dict_vec.transform(features)
        X = self.scaler.transform(X)
        p = self.model.predict(X)
        # c = self.prune_label_encoder.inverse_transform(p)
        for candidate, predict in zip(query_candidates, p):
            if predict == 1:
                remaining.append(candidate)
        return remaining
Example #42
class RelationNgramScorer(MLModel):
    """Learns a scoring based on question ngrams."""

    def __init__(self,
                 name,
                 regularization_C,
                 percentile=None):
        name += self.get_relscorer_suffix()
        MLModel.__init__(self, name, None)
        # Note: The model is lazily loaded when needed.
        self.model = None
        self.regularization_C = regularization_C
        self.top_percentile = percentile
        self.label_encoder = None
        self.dict_vec = None
        self.scaler = None
        # The index of the correct label.
        self.correct_index = -1
        self.feature_extractor = FeatureExtractor(False,
                                                  True,
                                                  entity_features=False)

    def get_relscorer_suffix(self):
        return "_RelScore"

    def load_model(self):
        model_file = self.get_model_filename()
        try:
            [model, label_enc, dict_vec, scaler] \
                = joblib.load(model_file)
            self.model = model
            self.dict_vec = dict_vec
            self.scaler = scaler
            self.label_encoder = label_enc
            self.correct_index = label_enc.transform([1])[0]
            logger.info("Loaded scorer model from %s" % model_file)
        except IOError:
            logger.warn("Model file %s could not be loaded." % model_file)
            raise

    def learn_model(self, train_queries):
        if self.top_percentile:
            logger.info("Collecting frequent n-gram features...")
            n_grams_dict = get_top_chi2_candidate_ngrams(train_queries,
                                                         self.feature_extractor,
                                                         percentile=self.top_percentile)
            logger.info("Collected %s n-gram features" % len(n_grams_dict))
            self.feature_extractor.ngram_dict = n_grams_dict
        features, labels = construct_examples(train_queries,
                                              self.feature_extractor)
        logger.info("#of labeled examples: %s" % len(features))
        logger.info("#labels non-zero: %s" % sum(labels))
        label_encoder = LabelEncoder()
        logger.info(features[-1])
        labels = label_encoder.fit_transform(labels)
        vec = DictVectorizer(sparse=True)
        scaler = StandardScaler(with_mean=False)
        X = vec.fit_transform(features)
        X = scaler.fit_transform(X)
        X, labels = utils.shuffle(X, labels, random_state=999)
        logger.info("#Features: %s" % len(vec.vocabulary_))
        # Perform grid search or use provided C.
        if self.regularization_C is None:
            logger.info("Performing grid search.")
            relation_scorer = SGDClassifier(loss='log', class_weight='auto',
                                            n_iter=np.ceil(
                                                10 ** 6 / len(labels)),
                                            random_state=999)
            cv_params = [{"alpha": [10.0, 5.0, 2.0, 1.5, 1.0, 0.5,
                                    0.1, 0.01, 0.001, 0.0001]}]
            grid_search_cv = grid_search.GridSearchCV(relation_scorer,
                                                      cv_params,
                                                      n_jobs=8,
                                                      verbose=1,
                                                      cv=8,
                                                      refit=True)
            grid_search_cv.fit(X, labels)
            logger.info("Best score: %.5f" % grid_search_cv.best_score_)
            logger.info("Best params: %s" % grid_search_cv.best_params_)
            self.model = grid_search_cv.best_estimator_
        else:
            logger.info("Learning relation scorer with C: %s."
                        % self.regularization_C)
            relation_scorer = SGDClassifier(loss='log', class_weight='auto',
                                            n_iter=np.ceil(
                                                10 ** 6 / len(labels)),
                                            alpha=self.regularization_C,
                                            random_state=999)
            relation_scorer.fit(X, labels)
            logger.info("Done.")
            self.model = relation_scorer
        self.dict_vec = vec
        self.scaler = scaler
        self.label_encoder = label_encoder
        self.correct_index = label_encoder.transform([1])[0]
        self.print_model()

    def print_model(self, n_top=20):
        dict_vec = self.dict_vec
        classifier = self.model
        logger.info("Printing top %s weights." % n_top)
        logger.info("intercept: %.4f" % classifier.intercept_[0])
        feature_weights = []
        for name, index in dict_vec.vocabulary_.iteritems():
            feature_weights.append((name, classifier.coef_[0][index]))
        feature_weights = sorted(feature_weights, key=lambda x: math.fabs(x[1]),
                                 reverse=True)
        for name, weight in feature_weights[:n_top]:
            logger.info("%s: %.4f" % (name, weight))

    def store_model(self):
        logger.info("Writing model to %s." % self.get_model_filename())
        joblib.dump([self.model, self.label_encoder,
                     self.dict_vec, self.scaler], self.get_model_filename())
        logger.info("Done.")

    def score(self, candidate):
        if not self.model:
            self.load_model()
        features = self.feature_extractor.extract_features(candidate)
        X = self.dict_vec.transform(features)
        X = self.scaler.transform(X)
        prob = self.model.predict_proba(X)
        # Prob is an array of n_examples, n_classes
        score = prob[0][self.correct_index]
        return RankScore(score)