Example #1
    def scale_features(self, X, final_model=False, pkl_filename='scaler.pkl'):
        """
        Scale the independent variables.

        Inputs:
            X: (DataFrame) Independent variables.
            final_model: (bool) If True, load the previously pickled
                scaler instead of fitting a new one.
            pkl_filename: (str) Filename of the pickled scaler.

        Returns:
            pd_new_X: (DataFrame) Scaled independent variables.
        """
        if final_model:
            scaler = load_pkl(os.path.join(self.model_path, pkl_filename))
        else:
            if self.scale_mode.lower() == 'standard':
                scaler = StandardScaler().fit(X)
            elif self.scale_mode.lower() == 'minmax':
                scaler = MinMaxScaler().fit(X)
            elif self.scale_mode.lower() == 'robust':
                scaler = RobustScaler().fit(X)
            else:
                raise ValueError('Invalid scaling mode: {}'.format(self.scale_mode))

            if self.model_path:
                save_pkl(scaler, os.path.join(self.model_path, pkl_filename))

        pd_new_X = pd.DataFrame(
            scaler.transform(X),
            index=X.index,
            columns=X.columns)

        return pd_new_X
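A minimal, self-contained sketch of the same scaler dispatch, runnable outside the class; demo_scale and the toy DataFrame are illustrative, not part of the original code:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler


def demo_scale(X, scale_mode='standard'):
    """Fit the scaler named by scale_mode and return X scaled."""
    scalers = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'robust': RobustScaler,
    }
    if scale_mode.lower() not in scalers:
        raise ValueError('Invalid scaling mode: {}'.format(scale_mode))
    scaler = scalers[scale_mode.lower()]().fit(X)
    return pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)


X = pd.DataFrame({'height': [1.6, 1.7, 1.8], 'weight': [55.0, 70.0, 80.0]})
print(demo_scale(X, 'minmax'))  # each column rescaled to [0, 1]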
Example #2
    def get_all_classes_dict(self):
        """
        Get all candidate classes.
        """
        log.info('Generating dictionary of all classes.')

        if file_exists(self.full_ontology_pkl) and not self.overwrite_pkl:
            log.info('Using pre-generated full classes dictionary file.')
            return load_pkl(self.full_ontology_pkl)

        full_classes_dict = {}
        for class_label in self.all_classes:
            pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] ==
                                            class_label]
            children = pd_match['Child'].tolist()
            children_entities = [c for c in children if c in self.all_entities]

            node_from = self.graph_dict['foodon product type']
            node_to = self.graph_dict[class_label]

            paths = []
            if class_label == 'foodon product type':
                paths.append(tuple(['foodon product type']))
            else:
                for path in nx.all_simple_paths(self.foodon_graph,
                                                source=node_from,
                                                target=node_to):
                    translated_path = [self.graph_dict_flip[p] for p in path]
                    paths.append(tuple(translated_path[::-1]))

            full_classes_dict[class_label] = (paths, children_entities)

        save_pkl(full_classes_dict, self.full_ontology_pkl)

        return full_classes_dict
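The path enumeration above can be exercised in isolation. In this sketch a toy graph stands in for self.foodon_graph, with string labels used directly instead of the graph_dict node IDs; all node names are illustrative:

import networkx as nx

# toy stand-in for self.foodon_graph: edges go from parent class to child
graph = nx.DiGraph()
graph.add_edges_from([
    ('foodon product type', 'plant food product'),
    ('plant food product', 'fruit product'),
    ('foodon product type', 'processed food product'),
    ('processed food product', 'fruit product'),
])

# enumerate every simple path from the root down to a class, then reverse
# each one so it reads class -> ... -> root, as in get_all_classes_dict()
paths = []
for path in nx.all_simple_paths(graph,
                                source='foodon product type',
                                target='fruit product'):
    paths.append(tuple(path[::-1]))

print(paths)
# [('fruit product', 'plant food product', 'foodon product type'),
#  ('fruit product', 'processed food product', 'foodon product type')]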
Example #3
    def get_seeded_skeleton(self, candidate_classes_dict):
        """
        Randomly select seed entities for each candidate class and collect
        the held-out entities as candidates to populate.
        """
        log.info('Generating dictionary of skeleton candidate classes.')

        if file_exists(
                self.skeleton_and_entities_pkl) and not self.overwrite_pkl:
            log.info('Using pickled skeleton file: %s',
                     self.skeleton_and_entities_pkl)
            return load_pkl(self.skeleton_and_entities_pkl)

        skeleton_candidate_classes_dict = {}
        candidate_entities = []
        for candidate_class in candidate_classes_dict.keys():
            entities = candidate_classes_dict[candidate_class][1]

            if len(entities) <= self.num_seeds:
                temp_num_seeds = len(
                    entities) - self.num_min_extracted_entities

                if temp_num_seeds > 0:
                    seeds = random.sample(entities, temp_num_seeds)
                    candidate_entities.extend(list(set(entities) - set(seeds)))
                else:
                    seeds = entities.copy()
            else:
                seeds = random.sample(entities, self.num_seeds)
                candidate_entities.extend(list(set(entities) - set(seeds)))

            skeleton_candidate_classes_dict[candidate_class] = (
                candidate_classes_dict[candidate_class][0], seeds)

        candidate_entities = list(set(candidate_entities))
        candidate_entities.sort()

        log.info(
            'Found %d candidate entities to populate out of %d all entities.',
            len(candidate_entities), len(self.all_entities))

        return_value = (skeleton_candidate_classes_dict, candidate_entities)
        save_pkl(return_value, self.skeleton_and_entities_pkl)

        return return_value
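The seed/hold-out split reduces to the following standalone sketch; split_seeds and the parameter values are illustrative assumptions, not project code:

import random


def split_seeds(entities, num_seeds=2, num_min_extracted_entities=1):
    """Return (seeds kept in the class, entities held out for population)."""
    if len(entities) <= num_seeds:
        # small class: keep at least num_min_extracted_entities held out
        temp_num_seeds = len(entities) - num_min_extracted_entities
        if temp_num_seeds > 0:
            seeds = random.sample(entities, temp_num_seeds)
        else:
            seeds = entities.copy()  # too small to hold anything out
    else:
        seeds = random.sample(entities, num_seeds)
    held_out = sorted(set(entities) - set(seeds))
    return seeds, held_out


print(split_seeds(['apple', 'pear', 'plum', 'fig']))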
Example #4
def save_models(classifier,
                pre_built_models_dir,
                main_config,
                model_manager,
                num_classifiers):
    """
    Train num_classifiers calibrated models for the given classifier
    type and pickle each one to pre_built_models_dir.
    """
    log.info('Pre-built model directory specified for %s does not exist.',
             classifier)
    log.info('Building models again.')

    # create directory
    create_dir(pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(main_config.get_str('preprocess_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config, section=classifier)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    # train multiple classifiers
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%d', i + 1, num_classifiers)

        cmanager = ClassifierManager(classifier_config)
        clf = CalibratedClassifierCV(cmanager.get_classifier(), method='sigmoid', cv=5)
        clf.fit(X, y)

        save_pkl(
            clf,
            os.path.join(pre_built_models_dir, 'model_{}.pkl'.format(i)))
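A minimal, runnable example of the calibration step on synthetic data; RandomForestClassifier is only a stand-in for whatever cmanager.get_classifier() returns:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# wrap the base classifier in sigmoid (Platt) calibration with 5-fold CV,
# mirroring the call in save_models()
clf = CalibratedClassifierCV(RandomForestClassifier(random_state=0),
                             method='sigmoid', cv=5)
clf.fit(X, y)
print(clf.predict_proba(X[:3]))  # calibrated class probabilities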
Example #5
    def run_iteration(self):
        """
        Iteratively populate the candidate classes: in each iteration, map
        the highest-scoring candidate entities to classes, then update the
        affected siblings scores.
        """
        if file_exists(self.pairs_filepath) and file_exists(
                self.populated_filepath):
            log.info('Pre-calculated iterations found.')
            iteration_pairs = load_pkl(self.pairs_filepath)
            iteration_populated_dict = load_pkl(self.populated_filepath)
            return iteration_pairs, iteration_populated_dict

        num_iterations = math.floor(self.num_candidate_entities /
                                    self.num_mapping_per_iteration)
        iteration_pairs = {}
        iteration_populated_dict = {}

        iteration = 0
        while len(self.candidate_entities) > 0:
            log.info('Updating scores. Iteration: %d/%d', iteration + 1,
                     num_iterations)
            t1 = time()

            # blended score: alpha weights the siblings score against
            # the parents score
            pd_scores = self.alpha * self.pd_siblings_scores + (
                1 - self.alpha) * self.pd_parents_scores

            # find top N unique entities with highest score
            num_scores = pd_scores.shape[0] * pd_scores.shape[1]
            pd_top_scores = pd_scores.stack().nlargest(
                num_scores).reset_index()
            pd_top_scores.columns = [
                'candidate class', 'candidate entity', 'score'
            ]
            pd_top_scores.drop_duplicates(subset='candidate entity',
                                          inplace=True)

            log.debug('Top scores: \n%s', str(pd_top_scores.head()))

            top_n_scores = list(
                zip(pd_top_scores['candidate class'],
                    pd_top_scores['candidate entity']))
            top_n_scores = top_n_scores[0:self.num_mapping_per_iteration]

            # populate skeleton using selected entity
            for (candidate_class, candidate_entity) in top_n_scores:
                self.candidate_classes_info[candidate_class][1].append(
                    candidate_entity)

            # save progress; copy the per-class entity lists so that later
            # mutations of candidate_classes_info do not alter the snapshot
            iteration_pairs[iteration] = top_n_scores.copy()
            iteration_populated_dict[iteration] = {
                k: (v[0], v[1].copy())
                for k, v in self.candidate_classes_info.items()}

            if len(self.candidate_entities) <= self.num_mapping_per_iteration:
                break

            classes_to_update = list(set([x[0] for x in top_n_scores]))
            entities_to_remove = list(set([x[1] for x in top_n_scores]))

            # remove selected entities from candidate entities and scores
            self.candidate_entities = list(
                set(self.candidate_entities) - set(entities_to_remove))
            self.pd_siblings_scores = self.pd_siblings_scores.drop(
                labels=entities_to_remove, axis=1)
            self.pd_parents_scores = self.pd_parents_scores.drop(
                labels=entities_to_remove, axis=1)

            # if alpha is 0, no need to update siblings score
            if self.alpha == 0.0:
                log.info('Skipping siblings score update since alpha is 0.')
            else:
                # update siblings score
                entity_class_pairs = list(
                    itertools.product(classes_to_update,
                                      self.candidate_entities))

                results = []
                for pair in entity_class_pairs:
                    results.append(self._calculate_siblings_score(pair))

                results = np.array(results).reshape(
                    len(classes_to_update), len(self.candidate_entities))

                pd_siblings_to_update = pd.DataFrame(
                    results,
                    index=classes_to_update,
                    columns=self.candidate_entities)

                self.pd_siblings_scores.update(pd_siblings_to_update)

            t2 = time()
            log.info('Elapsed time for updating scores: %.2f minutes',
                     (t2 - t1) / 60)

            iteration += 1

        save_pkl(iteration_pairs, self.pairs_filepath)
        save_pkl(iteration_populated_dict, self.populated_filepath)

        return iteration_pairs, iteration_populated_dict
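The ranking step can be verified on a toy score matrix; this sketch reproduces the stack/nlargest/drop_duplicates pattern from run_iteration (class and entity names are illustrative):

import pandas as pd

# toy score matrix: rows are candidate classes, columns candidate entities
pd_scores = pd.DataFrame(
    [[0.9, 0.2, 0.5],
     [0.8, 0.7, 0.1]],
    index=['class A', 'class B'],
    columns=['apple', 'pear', 'plum'])

# flatten to (class, entity, score) rows sorted by score, then keep only
# the best-scoring class per entity
num_scores = pd_scores.shape[0] * pd_scores.shape[1]
pd_top = pd_scores.stack().nlargest(num_scores).reset_index()
pd_top.columns = ['candidate class', 'candidate entity', 'score']
pd_top.drop_duplicates(subset='candidate entity', inplace=True)

print(pd_top)
# apple -> class A (0.9), pear -> class B (0.7), plum -> class A (0.5)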