Example #1
    def _load_bgen_samples(self):
        if self.bgen_sample_file is None or not os.path.isfile(
                self.bgen_sample_file):
            logger.warning(
                'BGEN sample file not set or does not exist: {}'.format(
                    self.bgen_sample_file))
            return

        logger.info('Loading BGEN sample file: {}'.format(
            self.bgen_sample_file))

        create_table(
            BGEN_SAMPLES_TABLE,
            columns=[
                'index bigint NOT NULL',
                'eid bigint NOT NULL',
            ],
            constraints=[
                'pk_{} PRIMARY KEY (index, eid)'.format(BGEN_SAMPLES_TABLE)
            ],
            db_engine=self._get_db_engine())

        # read the .sample file; skiprows=[1] drops the column-type row
        # that follows the header in Oxford .sample files
        samples_data = pd.read_table(self.bgen_sample_file,
                                     sep=' ',
                                     header=0,
                                     usecols=['ID_1', 'ID_2'],
                                     skiprows=[1])
        # 1-based index giving each sample's position in the BGEN file
        samples_data.set_index(np.arange(1, samples_data.shape[0] + 1),
                               inplace=True)
        # keep only the participant id, renamed to eid
        samples_data.drop('ID_2', axis=1, inplace=True)
        samples_data.rename(columns={'ID_1': 'eid'}, inplace=True)

        samples_data.to_sql(BGEN_SAMPLES_TABLE,
                            self._get_db_engine(),
                            if_exists='append')
Example #2
    def _load_events(self):
        if self.db_type == 'sqlite':
            logger.warning('Events loading is not supported in SQLite')
            return

        logger.info('Loading events table')

        # create table
        db_engine = self._get_db_engine()

        create_table(
            'events',
            columns=[
                'eid bigint NOT NULL',
                'field_id integer NOT NULL',
                'instance integer NOT NULL',
                'event text NOT NULL',
            ],
            constraints=[
                'pk_events PRIMARY KEY (eid, field_id, instance, event)'
            ],
            db_engine=db_engine)

        # insert data of categorical multiple fields
        categorical_variables = pd.read_sql(
            """
            select column_name, field_id, inst, table_name
            from fields
            where type = 'Categorical (multiple)'
        """, self._get_db_engine())

        for (field_id,
             field_instance), field_data in categorical_variables.groupby(
                 by=['field_id', 'inst']):
            sql_st = """
                insert into events (eid, field_id, instance, event)
                (
                    select distinct *
                    from (
                        select eid, {field_id}, {field_instance}, unnest(array[{field_columns}]) as event
                        from {tables}
                    ) t
                    where t.event is not null
                )
            """.format(
                field_id=field_id,
                field_instance=field_instance,
                field_columns=', '.join(set(field_data['column_name'])),
                tables=self._create_joins(list(set(field_data['table_name'])),
                                          join_type='inner join'),
            )

            with db_engine.connect() as con:
                con.execute(sql_st)
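The _create_joins method is not shown here; from the way it is called, it appears to build the FROM/JOIN part of the query out of the chunk tables holding the field's columns. A rough, assumed sketch (the join key eid and the overall shape are guesses based on the surrounding code):

def _create_joins(self, tables, join_type='inner join'):
    # assumed sketch: the first table stands alone, every following
    # chunk table is joined to it on the participant id (eid)
    joins = [tables[0]]
    joins += ['{} {} using (eid)'.format(join_type, table_name)
              for table_name in tables[1:]]
    return ' '.join(joins)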
Example #3
    def load_codings(self, codings_dir):
        logger.info('Loading codings from {}'.format(codings_dir))
        db_engine = self._get_db_engine()

        create_table(
            'codings',
            columns=[
                'data_coding bigint NOT NULL',
                'coding text NOT NULL',
                'meaning text NOT NULL',
                'node_id bigint NULL',
                'parent_id bigint NULL',
                'selectable boolean NULL',
            ],
            constraints=[
                'pk_codings PRIMARY KEY (data_coding, coding, meaning)'
            ],
            db_engine=self._get_db_engine())

        for afile in glob(join(codings_dir, '*.tsv')):
            afile_base = basename(afile)

            logger.info('Processing coding file: {}'.format(afile_base))

            data = pd.read_table(afile,
                                 sep='\t+',
                                 na_filter=False,
                                 engine='python')

            data_coding = int(splitext(afile_base)[0].split('_')[1])
            data['data_coding'] = data_coding

            data.to_sql('codings', db_engine, if_exists='append', index=False)

        create_indexes(
            'codings',
            ['data_coding', 'coding', 'node_id', 'parent_id', 'selectable'],
            db_engine=db_engine)

        self._vacuum('codings')
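The data_coding id is taken from the file name, so the loader expects one tab-separated file per coding, named something like coding_<id>.tsv. A small illustration of that parsing (the path is made up):

from os.path import basename, splitext

afile = 'codings/coding_19.tsv'  # illustrative path only
data_coding = int(splitext(basename(afile))[0].split('_')[1])
print(data_coding)  # 19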
Example #4
    def _load_all_eids(self):
        logger.info('Loading all eids into table {}'.format(ALL_EIDS_TABLE))

        create_table(
            ALL_EIDS_TABLE,
            columns=[
                'eid bigint NOT NULL',
            ],
            constraints=['pk_{} PRIMARY KEY (eid)'.format(ALL_EIDS_TABLE)],
            db_engine=self._get_db_engine())

        select_eid_sql = ' UNION DISTINCT '.join(
            'select eid from {}'.format(table_name)
            for table_name in self.table_list)

        insert_eids_sql = """
            insert into {all_eids_table} (eid)
            (
                {sql_eids}
            )
        """.format(all_eids_table=ALL_EIDS_TABLE, sql_eids=select_eid_sql)

        with self._get_db_engine().connect() as con:
            con.execute(insert_eids_sql)
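The insert statement is assembled from one sub-select per phenotype table. For a hypothetical table_list with two chunk tables (the names below are invented), the generated union looks like this:

table_list = ['ukb_data_0_00', 'ukb_data_0_01']  # hypothetical table names

select_eid_sql = ' UNION DISTINCT '.join(
    'select eid from {}'.format(table_name)
    for table_name in table_list)

print(select_eid_sql)
# select eid from ukb_data_0_00 UNION DISTINCT select eid from ukb_data_0_01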
Example #5
    def _create_tables_schema(self, csv_file, csv_file_idx):
        """
        Reads the data types of each data field in csv_file and creates the necessary database tables.
        :return:
        """
        logger.info('Creating database tables')

        tmp = pd.read_csv(csv_file,
                          index_col=0,
                          header=0,
                          nrows=1,
                          low_memory=False)
        old_columns = tmp.columns.tolist()
        del tmp
        new_columns = [self._rename_columns(x) for x in old_columns]

        # Remove columns that were previously loaded in other datasets
        if 'existing_col_names' not in self._loading_tmp:
            # dictionary with data-field as key and csv file as value
            columns_and_csv_files = {}
        else:
            columns_and_csv_files = self._loading_tmp['existing_col_names']

        old_columns_clean = []
        new_columns_clean = []

        for old_col_name, new_col_name in tuple(zip(old_columns, new_columns)):
            if new_col_name in columns_and_csv_files:
                corresponding_csv_file = columns_and_csv_files[new_col_name]
                logger.warning(
                    f'Column {new_col_name} already loaded from {corresponding_csv_file}. Skipping.'
                )
                continue

            columns_and_csv_files[new_col_name] = csv_file

            old_columns_clean.append(old_col_name)
            new_columns_clean.append(new_col_name)

        self._loading_tmp['existing_col_names'] = columns_and_csv_files

        # keep only unique columns (not loaded in previous files)
        old_columns = old_columns_clean
        new_columns = new_columns_clean
        all_columns = tuple(zip(old_columns, new_columns))

        # FIXME: check if self.n_columns_per_table is greater than the real number of columns
        self._loading_tmp['chunked_column_names'] = tuple(
            enumerate(self._chunker(all_columns, self.n_columns_per_table)))
        self._loading_tmp['chunked_table_column_names'] = \
            {self._get_table_name(col_idx, csv_file_idx): [col[1] for col in col_names]
             for col_idx, col_names in self._loading_tmp['chunked_column_names']}

        # get columns dtypes (for PostgreSQL and standard ones)
        db_types_old_column_names, all_fields_dtypes, all_fields_description, all_fields_coding = self._get_db_columns_dtypes(
            csv_file)
        db_dtypes = {
            self._rename_columns(k): v
            for k, v in db_types_old_column_names.items()
        }
        self._fields_dtypes.update(all_fields_dtypes)

        data_sample = pd.read_csv(csv_file,
                                  index_col=0,
                                  header=0,
                                  nrows=1,
                                  dtype=str)
        data_sample = data_sample.rename(columns=self._rename_columns)

        # create fields table
        if csv_file_idx == 0:
            create_table('fields',
                         columns=[
                             'column_name text NOT NULL',
                             'table_name text',
                             'field_id text NOT NULL',
                             'description text',
                             'coding bigint',
                             'inst bigint',
                             'arr bigint',
                             'type text NOT NULL',
                         ],
                         constraints=['pk_fields PRIMARY KEY (column_name)'],
                         db_engine=self._get_db_engine(),
                         drop_if_exists=True)

        current_stop = 0
        for column_names_idx, column_names in self._loading_tmp[
                'chunked_column_names']:
            new_columns_names = [x[1] for x in column_names]

            fields_ids = []
            instances = []
            arrays = []
            fields_dtypes = []
            fields_descriptions = []
            fields_codings = []

            for col_name in new_columns_names:
                match = re.match(Pheno2SQL.RE_FIELD_INFO, col_name)

                fields_ids.append(match.group('field_id'))
                instances.append(int(match.group('instance')))
                arrays.append(int(match.group('array')))

                fields_dtypes.append(all_fields_dtypes[col_name])
                fields_descriptions.append(all_fields_description[col_name])

                if col_name in all_fields_coding:
                    fields_codings.append(all_fields_coding[col_name])
                else:
                    fields_codings.append(np.nan)

            # Create main table structure
            table_name = self._get_table_name(column_names_idx, csv_file_idx)
            logger.info('Table {} ({} columns)'.format(table_name,
                                                       len(new_columns_names)))
            data_sample.loc[[],
                            new_columns_names].to_sql(table_name,
                                                      self._get_db_engine(),
                                                      if_exists='replace',
                                                      dtype=db_dtypes)

            with self._get_db_engine().connect() as conn:
                conn.execute("""
                    ALTER TABLE {table_name} ADD CONSTRAINT pk_{table_name} PRIMARY KEY (eid);
                """.format(table_name=table_name))

            with self._get_db_engine().connect() as conn:
                conn.execute('DROP INDEX ix_{table_name}_eid;'.format(
                    table_name=table_name))

            # Create auxiliary table
            n_column_names = len(new_columns_names)
            current_start = current_stop
            current_stop = current_start + n_column_names

            aux_table = pd.DataFrame({
                'column_name': new_columns_names,
                'field_id': fields_ids,
                'inst': instances,
                'arr': arrays,
                'coding': fields_codings,
                'table_name': table_name,
                'type': fields_dtypes,
                'description': fields_descriptions
            })
            # aux_table = aux_table.set_index('column_name')
            aux_table.to_sql('fields',
                             self._get_db_engine(),
                             index=False,
                             if_exists='append')
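This method depends on a few helpers that are not part of the excerpt: _rename_columns, the class attribute RE_FIELD_INFO, and _chunker. The sketch below shows one way they could fit together, assuming UK Biobank CSV headers such as '34-0.0' are renamed to 'c34_0_0'; the pattern, the renaming rule and the chunker body are all assumptions made for illustration:

import re

# assumed column pattern: c<field_id>_<instance>_<array>
RE_FIELD_INFO = re.compile(r'^c(?P<field_id>\d+)_(?P<instance>\d+)_(?P<array>\d+)$')


def _rename_columns(column_name):
    # assumed renaming: '34-0.0' -> 'c34_0_0', 'eid' is kept as-is
    if column_name == 'eid':
        return column_name
    return 'c' + column_name.replace('-', '_').replace('.', '_')


def _chunker(seq, size):
    # assumed chunker: fixed-size groups of columns, one group per table
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


match = re.match(RE_FIELD_INFO, _rename_columns('34-0.0'))
print(match.group('field_id'), match.group('instance'), match.group('array'))
# prints: 34 0 0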