def scale_features(self, X, final_model=False, pkl_filename='scaler.pkl'):
    """
    Scale the independent variables.

    Inputs:
        X: (DataFrame) Independent variables.
        final_model: (bool) If True, load and reuse the previously saved scaler
            instead of fitting a new one.
        pkl_filename: (str) Filename of the pickled scaler.

    Returns:
        pd_new_X: (DataFrame) Scaled independent variables.
    """
    if final_model:
        # reuse the scaler fitted during training
        scaler = load_pkl(os.path.join(self.model_path, pkl_filename))
    else:
        if self.scale_mode.lower() == 'standard':
            scaler = StandardScaler().fit(X)
        elif self.scale_mode.lower() == 'minmax':
            scaler = MinMaxScaler().fit(X)
        elif self.scale_mode.lower() == 'robust':
            scaler = RobustScaler().fit(X)
        else:
            raise ValueError('Invalid scaling mode: {}'.format(self.scale_mode))

        if self.model_path:
            save_pkl(scaler, os.path.join(self.model_path, pkl_filename))

    pd_new_X = pd.DataFrame(
        scaler.transform(X),
        index=X.index,
        columns=X.columns)

    return pd_new_X
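# Minimal usage sketch (the preprocessor instance 'prep' and the DataFrames are
# hypothetical, not from the source): fit and pickle the scaler while training,
# then reuse the saved scaler when scoring with the final model.
X_train_scaled = prep.scale_features(X_train)                    # fits scaler, saves scaler.pkl
X_test_scaled = prep.scale_features(X_test, final_model=True)    # loads the saved scaler.pkl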
def get_all_classes_dict(self):
    """
    Get all candidate classes.
    """
    log.info('Generating dictionary of all classes.')

    if file_exists(self.full_ontology_pkl) and not self.overwrite_pkl:
        log.info('Using pre-generated full classes dictionary file.')
        return load_pkl(self.full_ontology_pkl)

    full_classes_dict = {}
    for class_label in self.all_classes:
        pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] == class_label]
        children = pd_match['Child'].tolist()
        children_entities = [c for c in children if c in self.all_entities]

        node_from = self.graph_dict['foodon product type']
        node_to = self.graph_dict[class_label]

        paths = []
        if class_label == 'foodon product type':
            paths.append(tuple(['foodon product type']))
        else:
            for path in nx.all_simple_paths(self.foodon_graph, source=node_from, target=node_to):
                translated_path = [self.graph_dict_flip[p] for p in path]
                paths.append(tuple(translated_path[::-1]))

        full_classes_dict[class_label] = (paths, children_entities)

    save_pkl(full_classes_dict, self.full_ontology_pkl)

    return full_classes_dict
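# Illustrative sketch (made-up class and entity labels, not actual FoodOn data) of the
# structure returned by get_all_classes_dict(): each class label maps to a tuple of
# (all simple paths from the class up to 'foodon product type', child entities).
example_full_classes_dict = {
    'dairy food product': (
        [('dairy food product', 'food product by organism', 'foodon product type')],
        ['cheddar cheese (whole)', 'whole milk'],
    ),
}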
def get_seeded_skeleton(self, candidate_classes_dict):
    """
    Seed each candidate class with a random subset of its entities.

    Inputs:
        candidate_classes_dict: (dict) Class label -> (paths to root, entities).

    Returns:
        (skeleton_candidate_classes_dict, candidate_entities) where the skeleton
        dictionary maps each class label to (paths to root, seed entities) and
        candidate_entities are the held-out entities left to populate.
    """
    log.info('Generating dictionary of skeleton candidate classes.')

    if file_exists(self.skeleton_and_entities_pkl) and not self.overwrite_pkl:
        log.info('Using pickled skeleton file: %s', self.skeleton_and_entities_pkl)
        return load_pkl(self.skeleton_and_entities_pkl)

    skeleton_candidate_classes_dict = {}
    candidate_entities = []
    for candidate_class in candidate_classes_dict.keys():
        entities = candidate_classes_dict[candidate_class][1]

        if len(entities) <= self.num_seeds:
            # not enough entities: keep them all as seeds unless some can still
            # be spared for the candidate pool
            temp_num_seeds = len(entities) - self.num_min_extracted_entities
            if temp_num_seeds > 0:
                seeds = random.sample(entities, temp_num_seeds)
                candidate_entities.extend(list(set(entities) - set(seeds)))
            else:
                seeds = entities.copy()
        else:
            seeds = random.sample(entities, self.num_seeds)
            candidate_entities.extend(list(set(entities) - set(seeds)))

        skeleton_candidate_classes_dict[candidate_class] = (
            candidate_classes_dict[candidate_class][0], seeds)

    candidate_entities = list(set(candidate_entities))
    candidate_entities.sort()

    log.info(
        'Found %d candidate entities to populate out of %d all entities.',
        len(candidate_entities), len(self.all_entities))

    return_value = (skeleton_candidate_classes_dict, candidate_entities)
    save_pkl(return_value, self.skeleton_and_entities_pkl)

    return return_value
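# Illustrative sketch (made-up labels, not from the source) of the tuple returned by
# get_seeded_skeleton(): the skeleton keeps only the sampled seed entities per class,
# and every held-out entity becomes a candidate for the iterative population step.
example_skeleton, example_candidates = (
    {'dairy food product': ([('dairy food product', 'foodon product type')],
                            ['whole milk'])},        # paths unchanged, seeds only
    ['cheddar cheese (whole)'],                      # held-out entities to populate
)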
def save_models(classifier, pre_built_models_dir, main_config, model_manager, num_classifiers):
    """
    Build and pickle `num_classifiers` calibrated models for the given classifier.
    """
    log.info('Pre-built model directory specified for %s does not exist.', classifier)
    log.info('Building models again.')

    # create directory
    create_dir(pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(main_config.get_str('preprocess_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config, section=classifier)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    # train multiple classifiers
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%d', i + 1, num_classifiers)

        cmanager = ClassifierManager(classifier_config)
        clf = CalibratedClassifierCV(cmanager.get_classifier(), method='sigmoid', cv=5)
        clf.fit(X, y)

        save_pkl(
            clf,
            os.path.join(pre_built_models_dir, 'model_{}.pkl'.format(i)))
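# Hypothetical driver sketch; the file names, the 'ModelManager' constructor, and the
# classifier key 'randomforest' are assumptions for illustration, not from the source.
main_config = ConfigParser('main.ini')
model_manager = ModelManager(main_config)
save_models(
    classifier='randomforest',
    pre_built_models_dir='./pre_built_models/randomforest',
    main_config=main_config,
    model_manager=model_manager,
    num_classifiers=10)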
def run_iteration(self):
    if file_exists(self.pairs_filepath) and file_exists(self.populated_filepath):
        log.info('Pre-calculated iterations found.')
        iteration_pairs = load_pkl(self.pairs_filepath)
        iteration_populated_dict = load_pkl(self.populated_filepath)
        return iteration_pairs, iteration_populated_dict

    num_iterations = math.floor(
        self.num_candidate_entities / self.num_mapping_per_iteration)

    iteration_pairs = {}
    iteration_populated_dict = {}
    iteration = 0
    while len(self.candidate_entities) > 0:
        log.info('Updating scores. Iteration: %d/%d', iteration, num_iterations)
        t1 = time()

        # calculate score
        pd_scores = self.alpha * self.pd_siblings_scores + (
            1 - self.alpha) * self.pd_parents_scores

        # find top N unique entities with highest score
        num_scores = pd_scores.shape[0] * pd_scores.shape[1]
        pd_top_scores = pd_scores.stack().nlargest(num_scores).reset_index()
        pd_top_scores.columns = ['candidate class', 'candidate entity', 'score']
        pd_top_scores.drop_duplicates(subset='candidate entity', inplace=True)
        log.debug('Top scores: \n%s', str(pd_top_scores.head()))

        top_n_scores = list(
            zip(pd_top_scores['candidate class'], pd_top_scores['candidate entity']))
        top_n_scores = top_n_scores[0:self.num_mapping_per_iteration]

        # populate skeleton using selected entity
        for (candidate_class, candidate_entity) in top_n_scores:
            self.candidate_classes_info[candidate_class][1].append(candidate_entity)

        # save progress
        iteration_pairs[iteration] = top_n_scores.copy()
        iteration_populated_dict[iteration] = self.candidate_classes_info.copy()

        if len(self.candidate_entities) <= self.num_mapping_per_iteration:
            break

        classes_to_update = list(set([x[0] for x in top_n_scores]))
        entities_to_remove = list(set([x[1] for x in top_n_scores]))

        # remove selected entities from candidate entities and scores
        self.candidate_entities = list(
            set(self.candidate_entities) - set(entities_to_remove))
        self.pd_siblings_scores = self.pd_siblings_scores.drop(
            labels=entities_to_remove, axis=1)
        self.pd_parents_scores = self.pd_parents_scores.drop(
            labels=entities_to_remove, axis=1)

        # if alpha is 0, no need to update siblings score
        if self.alpha == 0.0:
            log.info('Skipping siblings score update since alpha is 0.')
        else:
            # update siblings score
            entity_class_pairs = list(
                itertools.product(classes_to_update, self.candidate_entities))

            results = []
            for pair in entity_class_pairs:
                results.append(self._calculate_siblings_score(pair))

            results = np.array(results).reshape(
                len(classes_to_update), len(self.candidate_entities))

            pd_siblings_to_update = pd.DataFrame(
                results,
                index=classes_to_update,
                columns=self.candidate_entities)

            self.pd_siblings_scores.update(pd_siblings_to_update)

        t2 = time()
        log.info('Elapsed time for updating scores: %.2f minutes', (t2 - t1) / 60)

        iteration += 1

    save_pkl(iteration_pairs, self.pairs_filepath)
    save_pkl(iteration_populated_dict, self.populated_filepath)

    return iteration_pairs, iteration_populated_dict
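# Small illustration (made-up labels and scores, not from the source) of the score blend
# computed at the top of each iteration: score = alpha * siblings + (1 - alpha) * parents.
import pandas as pd

alpha = 0.7
siblings = pd.DataFrame([[0.9, 0.2]], index=['dairy food product'],
                        columns=['whole milk', 'rye bread'])
parents = pd.DataFrame([[0.6, 0.1]], index=['dairy food product'],
                       columns=['whole milk', 'rye bread'])
blended = alpha * siblings + (1 - alpha) * parents   # 0.81 for 'whole milk', 0.17 for 'rye bread'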