def run(self):
    log.info(f"Fakeme starts at {datetime.now()}")
    # get all fields and schemas for tables
    self.schemas, fields = MultiTableRunner(
        self.tables, rls=self.rls
    ).get_fields_and_schemas(dump_schema=self.dump_schema)

    walk_list = []
    for path in self.paths_with_scripts:
        walker = Walker(path_to_dir=path, extension="hql", recursive=True)
        walk_list.extend(walker.walk())

    field_extractor = FieldRulesExtractor(fields, walk_list)
    # generate "value_rules.json" with rules for fields
    field_extractor.generate_rules()

    self.priority_dict = self.create_tables_priority_graph()
    for key, value in self.priority_dict.items():
        for table in value:
            if (
                table not in created
                and table not in self.with_data
                and table not in self.priority_dict.get(key + 1, [])
            ):
                self.create_table(table)
    log.info(f"Fakeme finished data generation successfully\n{datetime.now()}")
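# Usage sketch: run() is the pipeline entry point (collect schemas, extract
# field rules, then create tables in priority order). Driving it looks roughly
# like the snippet below; the table name and schema file are hypothetical
# placeholders, not taken from this repo.
#
#   from fakeme import Fakeme
#
#   Fakeme(tables=[("users", "users_schema.json")]).run()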
def column_generator(self, column_cfg: Dict):
    """Create a column with values."""
    log.info("Generate column {}".format(column_cfg.name))
    # unique_values - count of unique values for this column in this table
    # unique - flag: must all values in this column be unique or not
    column = []
    unique_values = self.row_numbers
    matches_k = self.cfg.matches
    unique = None
    percent_of_nulls = self.cfg.percent_of_nulls
    if self.table_settings:
        (
            unique_values,
            unique,
            matches_k,
            percent_of_nulls,
        ) = self.__process_table_settings(
            column_cfg, unique_values, unique, matches_k, percent_of_nulls
        )  # todo: refactor this
    # get field rule
    generating_rule = self.get_column_generating_rule(column_cfg.name)
    if self.table_id in self.chains:
        df_column = self.get_column_from_chained(column_cfg.name, matches_k)
    else:
        df_column = None
    if not unique_values:
        unique_values = self.row_numbers
    if df_column:
        column = self.__process_df_column(
            df_column, column, column_cfg, unique_values, unique
        )
        if len(column) < unique_values:
            unique_values = unique_values - len(column)
        else:
            column = column[:unique_values]
            unique_values = 0
    # cap the generated value length with the column's own limit, if any
    if column_cfg.len and (
        math.isnan(generating_rule["len"])
        or generating_rule["len"] > column_cfg.len
    ):
        generating_rule["len"] = column_cfg.len
    while unique_values:
        value = values_generator(generating_rule, unique)
        column.append(value)
        unique_values -= 1
    # repeat the pool of generated values until the column reaches row_numbers
    total_rows = self.row_numbers - len(column)
    rel_size = total_rows / len(column)
    num_copy = int(rel_size)
    base_column = copy.deepcopy(column)
    for _ in range(num_copy):
        column += base_column
    float_adding = rel_size - num_copy
    column += base_column[:int(len(base_column) * float_adding)]
    column = self.__config_mode_processing(column, column_cfg, percent_of_nulls)
    return column
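# The tail of column_generator pads the pool of generated values up to the
# full row count by repeating it. A minimal standalone sketch of that
# arithmetic; pad_to_length is a hypothetical helper, not part of the library:
def pad_to_length(base, target_len):
    """Repeat `base` until the result holds exactly `target_len` items."""
    if not base:
        return []
    whole, remainder = divmod(target_len, len(base))
    return base * whole + base[:remainder]

# e.g. pad_to_length([1, 2, 3], 8) -> [1, 2, 3, 1, 2, 3, 1, 2]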
def table_prefix_in_column_name(self, column_name: Text) -> Union[Text, None]:
    """Check whether the column name contains a table-name prefix,
    for possible auto-aliasing."""
    table_names = list(self.schemas.keys())
    for table_name in table_names:
        if self._remove_plural_from_table_name(table_name) in column_name:
            log.info(f"Found alias with {table_name}")
            return table_name
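# Example: with schemas for ["users", "orders"], a column named "user_id"
# matches "users" after _remove_plural_from_table_name (which presumably
# strips the plural suffix) reduces it to "user", so "users" is returned
# as the alias candidate.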
def generate_rules(self, remove_existed=True):
    if not remove_existed and os.path.isfile(self.file_name):
        log.info("{} with rules found in {}".format(self.file_name, os.getcwd()))
    else:
        values_rules_dict = self.rules_extracts()
        with open(self.file_name, "w+") as outfile:
            json.dump(values_rules_dict, outfile, indent=2)
        log.info("{} with rules for fields was created".format(self.file_name))
    return True
def _prepare_path(file_path, remove_old):
    """Prepare the target folder and check the target file."""
    if os.path.isfile(file_path):
        log.info("Found old file {}".format(file_path))
        if remove_old:
            os.remove(file_path)
            log.info("File {} was removed".format(file_path))
        else:
            raise Exception(
                "Impossible to generate data into file {}. "
                "File already exists. Please delete the file or set "
                "'remove_old'=True".format(file_path)
            )
    else:
        if not os.path.isdir(os.path.abspath(os.path.dirname(file_path))):
            os.makedirs(os.path.dirname(file_path))
    return file_path
def get_depend_on_file(self):
    """Find dependencies on other tables (data files).

    TODO: add support for multiple depend-on files
    """
    dir_files = []
    if self.chains and self.table_id in self.chains:
        dir_files = self.__chain_tables(dir_files)
    else:
        for item in self.schema:
            if item["name"] in self.chained:
                chained_tables = [
                    table
                    for table in self.chained[item["name"]]
                    if table != self.table_id
                ]
                dir_files.extend(
                    file_name
                    for table in chained_tables
                    for file_name in os.listdir(self.prefix)
                    if file_name.startswith(table)
                )
            elif "all" in self.chains and item["name"] in self.chains["all"]:
                table_chain = self.chains["all"]
                # dict views are not subscriptable in Python 3,
                # so take the first key explicitly
                key = next(iter(table_chain))
                if table_chain[key]["table"] != self.table_id:
                    # store the bare file name so the final join with
                    # self.prefix below works for both branches
                    file_name = "{}.{}".format(
                        table_chain[key]["table"], self.file_format
                    )
                    if not os.path.isfile(os.path.join(self.prefix, file_name)):
                        raise ValueError(
                            "Dependency file {} does not exist".format(file_name)
                        )
                    dir_files.append(file_name)
    if dir_files:
        log.info("Depend on: {}".format(dir_files))
        return os.path.join(self.prefix, dir_files[0])
    else:
        return []
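# Standalone sketch of the prefix-based dependency lookup above: given a data
# directory and the chained tables, return the first data file that belongs to
# another table. first_dependency and its arguments are hypothetical names for
# illustration, not part of the library:
import os

def first_dependency(prefix, chained_tables, table_id):
    """Return the path of the first data file owned by another chained table."""
    candidates = [
        name
        for name in os.listdir(prefix)
        for table in chained_tables
        if table != table_id and name.startswith(table)
    ]
    return os.path.join(prefix, candidates[0]) if candidates else []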