def type_cast():
    """Writes SQL that casts each feature column to a compact type (int2 or float8)
    and stores the result in a *_typecast table in the results schema."""
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    sql_stmnt = 'set role el_salvador_mined_education_write;'
    for sql_file, table in zip(['features_imputation', 'features_onehot', 'features_aggregate', 'features_timeaggregates_small'],
                               ['features_imputed', 'features_one_hot', 'features_aggregate', 'features_timeaggregates']):
        features = feature_list_from_SQL(sql_file)
        sql_stmnt += f"create table results.{table}_typecast as (select year_range, student"
        for feature in features:
            if feature not in ['year_range', 'student']:
                # logger.debug(f'{feature}')
                cur.execute(f"select data_type from information_schema.columns where table_schema='results' and table_name = '{table}' and column_name='{feature}';")
                feature_type = cur.fetchall()[0][0]
                if feature_type in ['integer', 'boolean']:
                    logger.debug(f"{feature}: {feature_type}: int2")
                    sql_stmnt += f', {feature}::int::int2'
                elif feature_type in ['double precision', 'numeric']:
                    logger.debug(f"{feature}: {feature_type}: float8")
                    sql_stmnt += f', {feature}::float8'
                else:
                    logger.warn(f'{feature}: Type not caught')
        sql_stmnt += f" from results.{table});"
    with open(SQL_DIR + '/features_typecast.sql', 'w+') as fil:
        fil.write(sql_stmnt)
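# Illustrative sketch (not emitted verbatim by the pipeline): for results.features_imputed
# with a hypothetical boolean column `repeated_grade` and the numeric column `raw_age`,
# type_cast() above would append a statement along these lines to features_typecast.sql:
#
#   create table results.features_imputed_typecast as (
#       select year_range, student,
#              repeated_grade::int::int2,
#              raw_age::float8
#       from results.features_imputed);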
def create_single_row_per_student_year(feature_list=None):
    """
    Groups by year and student, and aggregates all the raw features.
    During the aggregation, new features are created (avg, count...) according to
    the data type of the column.

    The generated SQL takes ~10min to run.
    """
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    if feature_list is None:
        feature_list = feature_list_from_SQL('features_raw')
    logger.debug(feature_list)
    aggreg_sql = """set role el_salvador_mined_education_write;
    drop table if exists results.features_aggregate;
    create table results.features_aggregate as (
    select year_range, student, """
    aggregate_feature_list = ['year_range', 'student']
    constant_features = ['raw_age', 'student_female']
    for feature in feature_list:
        if feature not in ['year_range', 'student']:
            logger.debug(f'{feature}')
            cur.execute(f"select data_type from information_schema.columns where table_name = 'features_raw' and column_name='{feature}';")
            feature_type = cur.fetchall()[0][0]
            logger.debug(f'{feature}: {feature_type}')
            if feature in constant_features:
                # Constant features should have a single value per student and year; if multiple distinct values appear it is data noise, so the feature is set to null.
                aggreg_sql += f"case when array_length(array_agg(distinct {feature}), 1)=1 then (array_agg(distinct {feature}))[1] else null::{feature_type} end as {feature},"
                aggregate_feature_list += [f'{feature}']
            elif feature_type == 'boolean':
                aggreg_sql += f"bool_or({feature}) as {feature}_any, bool_and({feature}) as {feature}_all,"
                aggregate_feature_list += [f'{feature}_any', f'{feature}_all']
            elif feature_type in ['bigint', 'smallint', 'double precision', 'numeric', 'integer']:
                aggreg_sql += f"avg({feature}) as {feature}_avg, max({feature}) as {feature}_max, min({feature}) as {feature}_min,"
                aggregate_feature_list += [f'{feature}_avg', f'{feature}_max', f'{feature}_min']
            elif feature_type == 'text':
                # Text features are aggregated by concatenating strings, e.g. if a student was in Ahuachapán and Usulután in the same year, the departamento column is 'Ahuachapán,Usulután'
                aggreg_sql += f"array_to_string(array_agg(distinct {feature}), ',') as {feature}, array_length(array_agg(distinct {feature}), 1) as {feature}_count,"
                aggregate_feature_list += [f'{feature}', f'{feature}_count']
            else:
                logger.warn(f"Don't know how to convert {feature} with {feature_type}")
    aggreg_sql = f"-- Feature list: {','.join(aggregate_feature_list)}\n" + aggreg_sql[:-1] + " from results.features_raw group by year_range, student); create index aggregate_year_range_student_idx on results.features_aggregate(year_range, student);"
    # logger.error(aggreg_sql)
    with open(SQL_DIR + '/features_aggregate.sql', 'w+') as fil:
        fil.write(aggreg_sql)
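# Illustrative sketch of the per-type aggregations emitted above, for hypothetical columns
# (a boolean `enrolled_rural`, a numeric `grade_average`, and the text column `departamento`):
#
#   bool_or(enrolled_rural) as enrolled_rural_any, bool_and(enrolled_rural) as enrolled_rural_all,
#   avg(grade_average) as grade_average_avg, max(grade_average) as grade_average_max, min(grade_average) as grade_average_min,
#   array_to_string(array_agg(distinct departamento), ',') as departamento,
#   array_length(array_agg(distinct departamento), 1) as departamento_count
#   ...
#   from results.features_raw group by year_range, student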
def extract(directory, extensions, raw_folder, preprocessing_folder):
    """
    Finds all files with the given extensions inside a directory and converts them
    into caret-separated value files.

    In the conversion, the files are moved from their initial raw_folder to a
    preprocessing folder, creating the necessary folder structure.
    Conversion requires 'xls_csv.sh', 'sav_csv.sh' and 'csv_csv.sh'.
    Each .csv file is cleaned by removing newlines, trailing and leading spaces,
    and replacing consecutive spaces with a single space.
    Finally, each file is checked with 'csv_health.sh' to see if all rows have the
    same number of fields.

    Args:
        directory (str): the folder containing the files you want to convert to csv
        extensions (lst): iterable of extension strings
        raw_folder (str): name of the folder containing raw data
        preprocessing_folder (str): name of the preprocessing folder
    """
    for extension in extensions:
        files = list_files(directory, [extension])
        for input_file in files:
            output_file = input_file.replace(raw_folder, preprocessing_folder)
            output_file = output_file.replace(extension, 'csv')
            try:
                if not os.path.exists(output_file):
                    logger.debug('Output file: %s' % output_file)
                    dirname = os.path.dirname(output_file)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                        logger.debug('Making directory: %s' % dirname)
                    if extension in ['xls', 'xlsx']:
                        subprocess.check_output(['bash', SCRIPTS_DIR + '/xls_csv.sh', input_file, output_file])
                        logger.debug('Converted xls to csv')
                    elif extension in ['sav']:
                        subprocess.check_output(['bash', SCRIPTS_DIR + '/sav_csv.sh', input_file, output_file])
                        logger.debug('Converted sav to csv')
                    elif extension in ['csv']:
                        subprocess.check_output(['bash', SCRIPTS_DIR + '/csv_csv.sh', input_file, output_file])
                        logger.debug('Converted csv to csv')
                    else:
                        logger.warn('Cannot do anything with %s extension' % extension)
                    healthy = subprocess.check_output(['bash', SCRIPTS_DIR + '/csv_health.sh', output_file])
                    if not healthy:
                        logger.warn('File is unhealthy: %s' % output_file)
            except Exception as e:
                logger.warn('File %s failed because %s' % (output_file, e))
                if os.path.exists(output_file):
                    subprocess.run(['rm', output_file])
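# Example of the path rewriting performed above (hypothetical paths): with
# raw_folder='raw' and preprocessing_folder='preprocessing', a file
#   /data/raw/matricula/escuelas_2015.xlsx
# is converted (via xls_csv.sh) to
#   /data/preprocessing/matricula/escuelas_2015.csv
# and then checked with csv_health.sh; if the conversion raises, any partial output
# file is removed.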
def create_time_aggregates(feature_list=None):
    """
    As in the previous aggregation, boolean features are collapsed (any/all) and
    numeric features are summarised (avg/max/min), but here over 1-, 2- and 3-year
    windows per student.

    The time aggregation pulls from results.features_imputed, staging.labels and
    results.features_aggregate.
    """
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    if feature_list is None:
        imputed_features = set(feature_list_from_SQL('features_imputation'))
        nonimputed_features = set(feature_list_from_SQL('features_nonimputed'))
        # onehot_features = set(feature_list_from_SQL('features_onehot'))
        feature_list = imputed_features.union(nonimputed_features)  # .union(onehot_features)
    logger.debug(feature_list)
    aggreg_sql = """set role el_salvador_mined_education_write;
    drop table if exists results.features_timeaggregates;
    create table results.features_timeaggregates as (select """
    aggregate_feature_list = ['year_range', 'student']
    for feature in feature_list:
        if feature not in ['year_range', 'student']:
            logger.debug(f'{feature}')
            if feature in imputed_features:
                cur.execute(f"select data_type from information_schema.columns where table_name = 'features_imputed' and column_name='{feature}';")
                dummy_table = 'a'  # dummy_table is just a shorthand for qualifying columns, e.g. a.family_members
            elif feature in nonimputed_features:
                cur.execute(f"select data_type from information_schema.columns where table_name = 'features_aggregate' and column_name='{feature}';")
                dummy_table = 'c'
            # elif feature in onehot_features:
            #     cur.execute(f"select data_type from information_schema.columns where table_name = 'features_onehot' and column_name='{feature}';")
            #     dummy_table = 'd'
            else:
                logger.warn('Not caught by if/else')
            feature_type = cur.fetchall()[0][0]
            logger.debug(f'{feature}: {feature_type}')
            for window in ['1y', '2y', '3y']:
                if feature_type == 'boolean':
                    # for name in ['any', 'all']:
                    aggreg_sql += f"(bool_and({dummy_table}.{feature}) over w_{window})::int as {feature}_{window}_all,"
                    aggregate_feature_list += [f'{feature}_{window}_all']
                    aggreg_sql += f"(bool_or({dummy_table}.{feature}) over w_{window})::int as {feature}_{window}_any,"
                    aggregate_feature_list += [f'{feature}_{window}_any']
                elif feature_type in ['bigint', 'smallint', 'double precision', 'numeric', 'integer']:
                    for name in ['avg', 'max', 'min']:
                        aggreg_sql += f"{name}({dummy_table}.{feature}) over w_{window} as {feature}_{window}_{name},"
                        aggregate_feature_list += [f'{feature}_{window}_{name}']
                elif feature_type == 'text':
                    # Text features are aggregated by concatenating strings, e.g. if a student was in Ahuachapán and Usulután in consecutive years, the departamento_1y column is 'Ahuachapán,Usulután' and departamento_1y_count is 2
                    aggreg_sql += f"array_to_string(array_agg({dummy_table}.{feature}) over w_{window}, ',') as {feature}_{window},"
                    aggreg_sql += f"array_length(string_to_array(array_to_string(array_agg(distinct {dummy_table}.{feature}) over w_{window}, ','), ','), 1) as {feature}_{window}_count,"
                    aggregate_feature_list += [f'{feature}_{window}', f'{feature}_{window}_count']
                else:
                    logger.warn(f"Don't know how to convert {feature} with {feature_type}")
    previous_dropout_sql = "sum(b.label) over w_label_1y as dropout_1y, sum(b.label) over w_label_2y as dropout_2y, sum(b.label) over w_label_3y as dropout_3y"
    aggregate_feature_list += ['dropout_1y', 'dropout_2y', 'dropout_3y']
    aggreg_sql = f"-- Feature list: {','.join(aggregate_feature_list)}\n" + aggreg_sql + previous_dropout_sql + " from results.features_imputed a left join staging.labels as b on a.student=b.student and a.year_range=b.year_range left join results.features_aggregate c on a.student=c.student and a.year_range=c.year_range window w_1y as (partition by a.student order by a.year_range asc rows between 1 preceding and current row), w_2y as (partition by a.student order by a.year_range asc rows between 2 preceding and current row), w_3y as (partition by a.student order by a.year_range asc rows between 3 preceding and current row), w_label_1y as (partition by a.student order by a.year_range asc rows between 1 preceding and 1 preceding), w_label_2y as (partition by a.student order by a.year_range asc rows between 2 preceding and 1 preceding), w_label_3y as (partition by a.student order by a.year_range asc rows between 3 preceding and 1 preceding)); create index timeaggregates_year_range_student_idx on results.features_timeaggregates(year_range, student);"
    # left join results.features_onehot d on a.student=d.student and a.year_range=d.year_range
    logger.debug(f'# Features: {len(aggregate_feature_list)}')
    with open(SQL_DIR + '/features_timeaggregates.sql', 'w+') as fil:
        fil.write(aggreg_sql)
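# Sketch of the window semantics used above: every window is partitioned by student and
# ordered by year_range, so for a student's row in year t (one row per year_range),
#   w_1y       covers rows t-1..t   (feature windows include the current year),
#   w_label_1y covers row  t-1 only (label windows exclude the current year),
#   w_3y       covers rows t-3..t, and w_label_3y covers rows t-3..t-1.
# Hence dropout_2y, for example, counts dropout labels over the two rows preceding the
# current year_range for that student.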
def join_tables(folder_number, create_table=True):
    """
    Finds all tables of some particular folder inside the raw schema and joins them
    into a single table in the preproc schema.

    In the conversion, the columns are renamed using a .csv that needs to be manually
    created in garfield. The column renaming can occur in one of two ways, as specified
    by the config file:
        1. The .csv file provides three columns: old column names, new column names and types.
        2. Two .csv files are provided, one for column names and one for column types.
           Each row in the .csv's corresponds to a table in raw, and the first column
           should be the table name. The rest of the columns correspond to the columns
           in the raw table, e.g. if you want the third column of raw.super_table to be
           renamed as something_useful, the .csv should have a row in which the first
           column is raw.super_table and the fourth column is something_useful.
           Similarly for types.
    In either mode, if the new column name is not included, the column will not appear
    in the joined table. If the data type is not included, it will be treated as varchar
    or handled as a special case in change_column_types().

    Args:
        folder_number (int): the folder number, used for finding the .csv and the tables in the raw schema
        create_table (bool): whether to create the joined table and populate it
    """
    folder_number = int(folder_number)
    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        configs = yaml.safe_load(config_file)
    # Checks whether the column mapping .csv exists. If not, it prints the names of the original columns and returns None.
    # With these original column names, the user can create a .csv that maps the original column names to the desired column names for the database.
    if not os.path.isfile(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv"""):
        logger.warn(f"""To join tables you need to create: {folder_number}_column_mapping.csv""")
        for x in unique_original_column_names(folder_number):
            print(x)
        return
    conn, cur = create_pgconn()
    cur.execute("set role el_salvador_mined_education_write")
    # Some folders contain information about semantically different entities (e.g. folder 1 contains school information
    # but also some aggregate descriptive excels). However, each table has keywords identifying what information it
    # contains (e.g. for folder 1, the keyword is Base_de_Centros). This keyword can be used to join each group of
    # tables into separate tables in the preprocessing schema. This only works for tables that are joined in
    # column_mode_1, where the file_name is included in the csv, allowing the script to select the appropriate
    # columns for the tables being joined.
    if folder_number in configs['join_table_string']:
        string_patterns = configs['join_table_string'][folder_number]
    else:
        string_patterns = ['']
    for string_pattern in string_patterns:
        joined_name = f'{folder_number}_joined' + string_pattern
        if folder_number in configs['column_mode_1']:
            # The renaming uses the ordering of the new column names in 1_column_mapping.csv, combined with the
            # automatic naming of columns by csvkit (a,b,c...aa,bb,cc...aaa,bbb,...)
            with open(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv""", 'r') as fil:
                col_file = fil.read()
            col_map = dict()
            all_new_cols = []
            for pair in col_file.split('\n'):
                lst = pair.split(',')
                key = f'raw."{lst[0]}"'
                # The first column in the csv is meant to be the file name, so only select those rows for which we have specified the keyword (string_pattern)
                if string_pattern in key.lower():
                    col_map[key] = dict()
                    new = []
                    mapped = []
                    for idx, new_col in enumerate(lst[1:]):
                        if new_col != '':
                            mapped += ['%s%d' % (chr(96 + folder_number), idx)]
                            new += [new_col]
                    col_map[key]['mapped'] = mapped
                    col_map[key]['new'] = new
                    all_new_cols += new
            union = set(all_new_cols)
            create_table_stmnt = 'original_table varchar, year varchar,' + ' varchar, '.join(union) + ' varchar'
        else:
            logger.debug(f"""Opening {folder_number}_column_mapping.csv""")
            with open(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv""", 'r') as fil:
                col_map = fil.read()
            # Maps the old column names to the new column names and types, using the .csv file. It takes into account
            # that not all old columns will have names (i.e. we leave them behind in the data), and that not all named
            # columns have types (i.e. they stay as varchars).
            logger.debug('Creating mapping dictionary')
            col_names = dict()
            col_types = dict()
            union = []
            for pair in col_map.split('\n'):
                split_pair = pair.split(',')
                if split_pair[1] != '':
                    col_names[split_pair[0]] = split_pair[1]
                    if len(split_pair) > 2:
                        if split_pair[2] != '':
                            col_types[split_pair[1]] = split_pair[2]
                union += [split_pair[1]]
            # In case we want multiple old column names (in different files) to get remapped to the same new column
            union = set(union).difference(set(['']))
            create_table_stmnt = 'original_table varchar,' + ' varchar, '.join(union) + ' varchar'
        if create_table:
            logger.debug('Fetching list of table names in raw')
            cur.execute(f"""select distinct(table_name) from raw.column_mapping where lower(table_name) ~ 'raw."{folder_number}.*{string_pattern}.*';""")
            table_list = [x[0] for x in cur.fetchall()]
            logger.debug('Creating joined table with renamed columns. Dropping it if it exists')
            cur.execute(f"""drop table if exists preproc."{joined_name}";""")
            logger.debug('Creating table')
            cur.execute(f"""create table if not exists preproc."{joined_name}" ({create_table_stmnt});""")
            # Iterates over all tables in raw (with columns a0, a1, a2...), inserting into a joined preprocessing table (with columns nie, dpto_code_ce, year...)
            logger.debug('Iterating over tables and inserting into joined table, with renamed columns')
            for table in table_list:
                logger.debug(f'    {table}')
                if folder_number in configs['column_mode_1']:
                    cols = col_map[table]
                    if len(cols['new']) > 0:
                        year = re.findall('[0-9]{4}', table)[0]  # Extracts the year from the file_name
                        logger.debug(f'Inserting into {joined_name}, with year {year}')
                        cur.execute(f"""insert into preproc."{joined_name}" (original_table, year, {','.join(cols['new'])}) select {"'" + table + "' as original_table," + year + ',' + ','.join(cols['mapped'])} from {table};""")
                else:
                    cur.execute(f"""select mapped_col_name, original_col_name from raw.column_mapping where table_name='{table}' order by mapped_col_name;""")
                    col_pairs = cur.fetchall()
                    raw_cols = []
                    renamed_cols = []
                    for col_pair in col_pairs:
                        new_name = col_pair[1].replace('"', '')
                        if new_name in col_names:
                            if col_names[new_name] != '':
                                raw_cols += [col_pair[0]]
                                renamed_cols += [col_names[new_name]]
                    cmnd = ' varchar, '.join(renamed_cols) + ' varchar'
                    cur.execute(f"""insert into preproc."{joined_name}" ({'original_table,' + ','.join(renamed_cols)}) select {"'" + table[5:-1] + "' as original_table," + ','.join(raw_cols)} from {table};""")
            logger.debug(f'Finished inserting into {joined_name}')
    conn.commit()
    logger.debug('Committed connection')
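# Illustrative sketch of column_mode_1 (hypothetical file and column names): a row in
# 1_column_mapping.csv such as
#   1-Base_de_Centros_2015.xls,nie,dpto_code_ce,,student_female
# positionally maps the raw table's mapped column names (folder letter + position, i.e.
# a0, a1, a3 for folder 1) to nie, dpto_code_ce and student_female; the empty entry means
# the corresponding raw column (a2) is skipped. The generated insert then looks roughly like:
#   insert into preproc."1_joined..." (original_table, year, nie, dpto_code_ce, student_female)
#   select 'raw."1-Base_de_Centros_2015.xls"' as original_table, 2015, a0, a1, a3
#   from raw."1-Base_de_Centros_2015.xls";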