def _load_bgen_samples(self):
    if self.bgen_sample_file is None or not os.path.isfile(self.bgen_sample_file):
        logger.warning('BGEN sample file not set or does not exist: {}'.format(
            self.bgen_sample_file))
        return

    logger.info('Loading BGEN sample file: {}'.format(self.bgen_sample_file))

    create_table(
        BGEN_SAMPLES_TABLE,
        columns=[
            'index bigint NOT NULL',
            'eid bigint NOT NULL',
        ],
        constraints=[
            'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
        ],
        db_engine=self._get_db_engine())

    samples_data = pd.read_table(self.bgen_sample_file, sep=' ', header=0,
                                 usecols=['ID_1', 'ID_2'], skiprows=[1])
    samples_data.set_index(np.arange(1, samples_data.shape[0] + 1), inplace=True)
    samples_data.drop('ID_2', axis=1, inplace=True)
    samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

    samples_data.to_sql(BGEN_SAMPLES_TABLE, self._get_db_engine(), if_exists='append')
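# Illustration only (not part of the original module): a minimal sketch of the
# Oxford-format .sample file this loader assumes. The second line holds column
# types rather than data, which is why skiprows=[1] is passed above; only ID_1
# is kept, renamed to 'eid', and indexed from 1 in BGEN sample order.
#
#   ID_1 ID_2 missing
#   0 0 0
#   1000010 1000010 0
#   1000020 1000020 0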
def _load_events(self):
    if self.db_type == 'sqlite':
        logger.warning('Events loading is not supported in SQLite')
        return

    logger.info('Loading events table')

    # create table
    db_engine = self._get_db_engine()

    create_table(
        'events',
        columns=[
            'eid bigint NOT NULL',
            'field_id integer NOT NULL',
            'instance integer NOT NULL',
            'event text NOT NULL',
        ],
        constraints=[
            'pk_events PRIMARY KEY (eid, field_id, instance, event)'
        ],
        db_engine=db_engine)

    # insert data of categorical multiple fields
    categorical_variables = pd.read_sql(
        """
        select column_name, field_id, inst, table_name
        from fields
        where type = 'Categorical (multiple)'
        """, self._get_db_engine())

    for (field_id, field_instance), field_data in categorical_variables.groupby(by=['field_id', 'inst']):
        sql_st = """
            insert into events (eid, field_id, instance, event)
            (
                select distinct *
                from (
                    select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                    from {tables}
                ) t
                where t.event is not null
            )
        """.format(
            field_id=field_id,
            field_instance=field_instance,
            field_columns=', '.join([cn for cn in set(field_data['column_name'])]),
            tables=self._create_joins(list(set(field_data['table_name'])), join_type='inner join'),
        )

        with db_engine.connect() as con:
            con.execute(sql_st)
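# Illustration only: roughly what the SQL rendered above looks like for a
# hypothetical 'Categorical (multiple)' field 20002, instance 0, whose two
# array columns live in a single (assumed) phenotype table ukb_pheno_0_00;
# unnest(array[...]) turns the array columns into one row per non-null event:
#
#   insert into events (eid, field_id, instance, event)
#   (
#       select distinct * from (
#           select eid, 20002, 0, unnest(array[c20002_0_0, c20002_0_1]) as event
#           from ukb_pheno_0_00
#       ) t
#       where t.event is not null
#   )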
def load_codings(self, codings_dir):
    logger.info('Loading codings from {}'.format(codings_dir))

    db_engine = self._get_db_engine()

    create_table(
        'codings',
        columns=[
            'data_coding bigint NOT NULL',
            'coding text NOT NULL',
            'meaning text NOT NULL',
            'node_id bigint NULL',
            'parent_id bigint NULL',
            'selectable boolean NULL',
        ],
        constraints=[
            'pk_codings PRIMARY KEY (data_coding, coding, meaning)'
        ],
        db_engine=self._get_db_engine())

    for afile in glob(join(codings_dir, '*.tsv')):
        afile_base = basename(afile)
        logger.info('Processing coding file: {}'.format(afile_base))

        data = pd.read_table(afile, sep='\t+', na_filter=False, engine='python')

        data_coding = int(splitext(afile_base)[0].split('_')[1])
        data['data_coding'] = data_coding

        data.to_sql('codings', db_engine, if_exists='append', index=False)

    create_indexes(
        'codings',
        ['data_coding', 'coding', 'node_id', 'parent_id', 'selectable'],
        db_engine=db_engine)

    self._vacuum('codings')
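# Illustration only: the layout of codings_dir sketched here is an assumption
# inferred from the parsing above, where data_coding is taken from the second
# underscore-separated token of the file name (e.g. 'coding_6.tsv' -> 6) and
# each TSV is expected to carry at least 'coding' and 'meaning' columns:
#
#   codings_dir/
#       coding_6.tsv
#       coding_19.tsv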
def _load_all_eids(self):
    logger.info('Loading all eids into table {}'.format(ALL_EIDS_TABLE))

    create_table(
        ALL_EIDS_TABLE,
        columns=[
            'eid bigint NOT NULL',
        ],
        constraints=['pk_{} PRIMARY KEY (eid)'.format(ALL_EIDS_TABLE)],
        db_engine=self._get_db_engine())

    select_eid_sql = ' UNION DISTINCT '.join(
        'select eid from {}'.format(table_name) for table_name in self.table_list)

    insert_eids_sql = """
        insert into {all_eids_table} (eid)
        (
            {sql_eids}
        )
    """.format(all_eids_table=ALL_EIDS_TABLE, sql_eids=select_eid_sql)

    with self._get_db_engine().connect() as con:
        con.execute(insert_eids_sql)
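# Illustration only: for two hypothetical phenotype tables, and assuming
# ALL_EIDS_TABLE is 'all_eids', the statement built above expands to:
#
#   insert into all_eids (eid)
#   (
#       select eid from ukb_pheno_0_00 UNION DISTINCT select eid from ukb_pheno_0_01
#   )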
def _create_tables_schema(self, csv_file, csv_file_idx):
    """
    Reads the data types of each data field in csv_file and creates the necessary database tables.
    :return:
    """
    logger.info('Creating database tables')

    tmp = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, low_memory=False)
    old_columns = tmp.columns.tolist()
    del tmp
    new_columns = [self._rename_columns(x) for x in old_columns]

    # Remove columns that were previously loaded in other datasets
    if 'existing_col_names' not in self._loading_tmp:
        # dictionary with data-field as key and csv file as value
        columns_and_csv_files = {}
    else:
        columns_and_csv_files = self._loading_tmp['existing_col_names']

    old_columns_clean = []
    new_columns_clean = []

    for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
        if new_col_name in columns_and_csv_files:
            corresponding_csv_file = columns_and_csv_files[new_col_name]
            logger.warning(
                f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.')
            continue

        columns_and_csv_files[new_col_name] = csv_file

        old_columns_clean.append(old_col_name)
        new_columns_clean.append(new_col_name)

    self._loading_tmp['existing_col_names'] = columns_and_csv_files

    # keep only unique columns (not loaded in previous files)
    old_columns = old_columns_clean
    new_columns = new_columns_clean
    all_columns = tuple(zip(old_columns, new_columns))

    # FIXME: check if self.n_columns_per_table is greater than the real number of columns
    self._loading_tmp['chunked_column_names'] = tuple(
        enumerate(self._chunker(all_columns, self.n_columns_per_table)))
    self._loading_tmp['chunked_table_column_names'] = \
        {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
         for col_idx, col_names in self._loading_tmp['chunked_column_names']}

    # get columns dtypes (for PostgreSQL and standard ones)
    db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = \
        self._get_db_columns_dtypes(csv_file)
    db_dtypes = {
        self._rename_columns(k): v
        for k, v in db_types_old_column_names.items()
    }
    self._fields_dtypes.update(all_fields_dtypes)

    data_sample = pd.read_csv(csv_file, index_col=0, header=0, nrows=1, dtype=str)
    data_sample = data_sample.rename(columns=self._rename_columns)

    # create fields table
    if csv_file_idx == 0:
        create_table('fields',
                     columns=[
                         'column_name text NOT NULL',
                         'table_name text',
                         'field_id text NOT NULL',
                         'description text',
                         'coding bigint',
                         'inst bigint',
                         'arr bigint',
                         'type text NOT NULL',
                     ],
                     constraints=['pk_fields PRIMARY KEY (column_name)'],
                     db_engine=self._get_db_engine(),
                     drop_if_exists=True)

    current_stop = 0
    for column_names_idx, column_names in self._loading_tmp['chunked_column_names']:
        new_columns_names = [x[1] for x in column_names]

        fields_ids = []
        instances = []
        arrays = []
        fields_dtypes = []
        fields_descriptions = []
        fields_codings = []

        for col_name in new_columns_names:
            match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

            fields_ids.append(match.group('field_id'))
            instances.append(int(match.group('instance')))
            arrays.append(int(match.group('array')))

            fields_dtypes.append(all_fields_dtypes[col_name])
            fields_descriptions.append(all_fields_description[col_name])

            if col_name in all_fields_coding:
                fields_codings.append(all_fields_coding[col_name])
            else:
                fields_codings.append(np.nan)

        # Create main table structure
        table_name = self._get_table_name(column_names_idx, csv_file_idx)
        logger.info('Table {} ({} columns)'.format(table_name, len(new_columns_names)))

        data_sample.loc[[], new_columns_names].to_sql(table_name, self._get_db_engine(),
                                                      if_exists='replace', dtype=db_dtypes)

        with self._get_db_engine().connect() as conn:
            conn.execute("""
                ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
            """.format(table_name=table_name))

        with self._get_db_engine().connect() as conn:
            conn.execute('DROP INDEX ix_{table_name}_eid;'.format(table_name=table_name))

        # Create auxiliary table
        n_column_names = len(new_columns_names)
        current_start = current_stop
        current_stop = current_start + n_column_names

        aux_table = pd.DataFrame({
            'column_name': new_columns_names,
            'field_id': fields_ids,
            'inst': instances,
            'arr': arrays,
            'coding': fields_codings,
            'table_name': table_name,
            'type': fields_dtypes,
            'description': fields_descriptions
        })
        # aux_table = aux_table.set_index('column_name')
        aux_table.to_sql('fields', self._get_db_engine(), index=False, if_exists='append')
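# --- Illustration only, not part of the original module ---------------------
# A minimal stand-in for Pheno2SQL.RE_FIELD_INFO (the real pattern may differ)
# showing how a renamed column such as 'c34_0_0' decomposes into the field_id,
# instance and array values that _create_tables_schema writes into 'fields'.
_EXAMPLE_FIELD_INFO_RE = re.compile(
    r'^c(?P<field_id>\d+)_(?P<instance>\d+)_(?P<array>\d+)$')

def _example_parse_column(col_name):
    """e.g. 'c34_0_0' -> ('34', 0, 0); assumes the cFIELD_INSTANCE_ARRAY naming."""
    m = _EXAMPLE_FIELD_INFO_RE.match(col_name)
    return m.group('field_id'), int(m.group('instance')), int(m.group('array'))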