import json
import os
import sys

import pandas as pd

# Problem (the ORM model) and compute_dataset_hash come from elsewhere in
# this codebase; their import paths are omitted here.


def _verify_dataset_integrity(self):
    """Re-hash the dataset and reload it from disk if it has been mutated."""
    new_hash = compute_dataset_hash(self.dataset)
    if self.__dataset_hash != new_hash:
        print("Old hash: {}".format(self.__dataset_hash), file=sys.stderr)
        print("New hash: {}".format(new_hash), file=sys.stderr)
        # TODO exception handling
        self._reload_dataset()
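
# The compute_dataset_hash helper used above is defined elsewhere in the
# codebase. As a hedged illustration only, a minimal version could hash each
# table's contents deterministically, so that any in-place modification of a
# table changes the result, which is the property _verify_dataset_integrity
# relies on. The sketch below is an assumption, not the project's actual
# helper.
import hashlib  # used only by the sketch below


def _compute_dataset_hash_sketch(dataset):
    """Deterministic digest of a {table_name: DataFrame} mapping (sketch)."""
    digest = hashlib.md5()
    for table_name in sorted(dataset):
        digest.update(table_name.encode("utf-8"))
        digest.update(pd.util.hash_pandas_object(
            dataset[table_name], index=True).values.tobytes())
    return digest.hexdigest()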
def __init__(self, problem_id, username, orm, dataset=None, target=None,
             entities_featurized=None):
    self.problem_id = problem_id
    self.username = username
    self.orm = orm

    # Use None as the default to avoid sharing one mutable dict across
    # instances.
    self.dataset = dataset if dataset is not None else {}
    self.target = target
    self.entities_featurized = entities_featurized

    # Cache a hash of the dataset so that later in-place modifications can
    # be detected by _verify_dataset_integrity.
    if self.dataset:
        self.__dataset_hash = compute_dataset_hash(self.dataset)
    else:
        self.__dataset_hash = None
def _load_dataset_split(self, split="train", dataset=None,
                        entities_featurized=None, target=None,
                        dataset_hash=None, compute_hash=True):
    """Load the given dataset split, filling in whichever pieces are missing."""
    # Use None as the default to avoid a shared mutable default dict.
    if dataset is None:
        dataset = {}

    # Determine which pieces were passed in and which must be loaded.
    is_present_dataset = bool(dataset)
    is_present_entities_featurized = not pd.DataFrame(
        entities_featurized).empty
    is_present_target = not pd.DataFrame(target).empty
    is_anything_missing = not all([
        is_present_dataset,
        is_present_entities_featurized,
        is_present_target,
    ])

    # Query the db for the import parameters needed to load files.
    if is_anything_missing:
        with self.orm.session_scope() as session:
            problem = session.query(Problem)\
                .filter(Problem.id == self.problem_id).one()
            problem_data_dir = getattr(problem, "data_dir_{}".format(split))
            problem_files = json.loads(problem.files)
            problem_table_names = json.loads(problem.table_names)
            problem_entities_featurized_table_name = \
                problem.entities_featurized_table_name
            problem_target_table_name = problem.target_table_name

    # Load entities and other tables, skipping the featurized-entities and
    # target tables, which are loaded separately below.
    if not is_present_dataset:
        for (table_name, filename) in zip(problem_table_names,
                                          problem_files):
            if table_name == problem_entities_featurized_table_name or \
                    table_name == problem_target_table_name:
                continue
            abs_filename = os.path.join(problem_data_dir, filename)
            dataset[table_name] = pd.read_csv(abs_filename,
                                              low_memory=False, header=0)

        # Compute/recompute hash of the freshly-loaded dataset.
        if compute_hash:
            dataset_hash = compute_dataset_hash(dataset)
        else:
            dataset_hash = None

    # The dataset may also have been passed in already loaded but without a
    # hash; compute it here in that case. (A freshly-loaded dataset already
    # got its hash just above.)
    if compute_hash:
        if not dataset_hash:
            dataset_hash = compute_dataset_hash(dataset)

    # Load featurized entities. An empty table name means there are simply
    # no pre-extracted features to add.
    if not is_present_entities_featurized:
        if problem_entities_featurized_table_name:
            cols = list(problem_table_names)
            ind_features = cols.index(
                problem_entities_featurized_table_name)
            abs_filename = os.path.join(problem_data_dir,
                                        problem_files[ind_features])
            entities_featurized = pd.read_csv(abs_filename,
                                              low_memory=False, header=0)

    # Load the target. It might not exist if we are making predictions on
    # unseen test data.
    if not is_present_target:
        cols = list(problem_table_names)
        ind_target = cols.index(problem_target_table_name)
        abs_filename = os.path.join(problem_data_dir,
                                    problem_files[ind_target])
        if os.path.exists(abs_filename):
            target = pd.read_csv(abs_filename, low_memory=False, header=0)
        else:
            target = None

    return dataset, entities_featurized, target, dataset_hash
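
# A hedged usage illustration (not from the codebase) of the integrity
# check, built on the _compute_dataset_hash_sketch helper above: mutating a
# table in place changes the hash, which is exactly the condition that
# _verify_dataset_integrity treats as a stale dataset and answers with a
# reload.
if __name__ == "__main__":
    toy = {"users": pd.DataFrame({"id": [1, 2], "age": [34, 28]})}
    hash_before = _compute_dataset_hash_sketch(toy)
    toy["users"].loc[0, "age"] = 99  # in-place modification
    assert _compute_dataset_hash_sketch(toy) != hash_before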