Example #1
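# Imports assumed for this snippet: drop_table/create_table are taken to come
# from neurobooth_terra and db_args from a local config module, as in the
# other examples in this collection. The test functions further below also
# rely on names from their original test module (pandas as pd, datetime,
# connect_str, Table, query, execute, list_tables, rename_subject_ids) that
# are not reproduced here.
import os
import shutil
from tempfile import mkdtemp

import psycopg2
import pytest

from neurobooth_terra import create_table, drop_table
from config import db_args


# mock_data yields its temporary directories, so it is presumably used as a
# pytest fixture by the dataflow tests.
@pytest.fixture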
def mock_data():
    """Create mock tables temporarily for testing purposes."""

    src_dirname = os.path.join(mkdtemp(), '')  # ensure path ends with /
    dest_dirname = mkdtemp()

    os.mkdir(os.path.join(src_dirname, 'test_subfolder'))

    # Create mock log_sensor_file and log_file tables used by the dataflow code.
    with psycopg2.connect(port='5432', host='localhost', **db_args) as conn:
        table_id = 'log_sensor_file'
        column_names = ['log_sensor_file_id', 'sensor_file_path', 'another_column']
        dtypes = ['VARCHAR (255)', 'text[]', 'VARCHAR (255)']
        drop_table(table_id, conn)
        create_table(table_id, conn, column_names, dtypes,
                     primary_key='log_sensor_file_id')

        table_id = 'log_file'
        column_names = ['operation_id', 'log_sensor_file_id', 'src_dirname',
                        'dest_dirname', 'fname', 'time_verified',
                        'rsync_operation', 'is_deleted', 'is_finished']
        dtypes = ['SERIAL', 'text', 'text',
                  'text', 'text', 'timestamp', 'text',
                  'boolean', 'boolean']
        drop_table(table_id, conn)
        create_table(table_id, conn, column_names, dtypes,
                     primary_key='operation_id',
                     foreign_key={'log_sensor_file_id': 'log_sensor_file'})

    yield src_dirname, dest_dirname

    # cleanup
    shutil.rmtree(src_dirname)
    shutil.rmtree(dest_dirname)


def test_rename_subject_ids():
    """Test renaming of subject IDs."""
    table_id = 'subject'
    column_names = [
        'subject_id', 'redcap_event_name', 'first_name_birth',
        'last_name_birth', 'date_of_birth_subject', 'old_subject_id'
    ]
    dtypes = [
        'VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)',
        'date', 'VARCHAR (255)'
    ]
    rows = [('1001', 'arm1', 'anoopum', 'gupta', '1985-11-28', None),
            ('1002', 'arm1', 'adonay', 'nunes', '1987-09-13', None)]
    index = {
        'subject_identifier':
        ['first_name_birth', 'last_name_birth', 'date_of_birth_subject']
    }

    with psycopg2.connect(connect_str) as conn:
        drop_table(table_id, conn)
        table_subject = create_table(table_id,
                                     conn=conn,
                                     column_names=column_names,
                                     dtypes=dtypes,
                                     index=index)
        table_subject.insert_rows(rows, cols=column_names)

        # Simulate changing subject_id in redcap and updating old_subject_id.
        redcap_df = table_subject.query().reset_index()
        row_idx = redcap_df['subject_id'] == '1001'
        redcap_df.loc[row_idx, 'subject_id'] = '901'
        redcap_df.loc[row_idx, 'old_subject_id'] = '1001'
        # Add subject in redcap
        redcap_df = pd.concat([redcap_df, pd.DataFrame({
            'subject_id': ['1003'],
            'first_name_birth': ['sheraz'],
            'last_name_birth': ['khan'],
            'date_of_birth_subject': ['1980-05-15'],
            'old_subject_id': [None]
        })], ignore_index=True)

        rename_subject_ids(table_subject, redcap_df)

        # test renaming
        table_df_updated = table_subject.query().reset_index()
        assert '1001' in table_df_updated['old_subject_id'].values
        assert '1001' not in table_df_updated['subject_id'].values
        assert '901' in table_df_updated['subject_id'].values
        # only rename, don't add rows
        assert '1003' not in table_df_updated['subject_id'].values


def test_compound_primary_key():
    """Test compound primary key."""
    conn = psycopg2.connect(connect_str)

    table_id = 'test'
    drop_table(table_id, conn)

    column_names = ['subject_id', 'first_name_birth', 'last_name_birth', 'Age']
    dtypes = ['VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'INTEGER']
    primary_key = ['first_name_birth', 'last_name_birth']
    table_subject = create_table(table_id,
                                 conn=conn,
                                 column_names=column_names,
                                 dtypes=dtypes,
                                 primary_key=primary_key)
    assert set(table_subject.primary_key) == set(primary_key)


def test_delete():
    """Test deleting rows."""

    conn = psycopg2.connect(connect_str)

    table_id = 'test'
    drop_table(table_id, conn)

    column_names = ['subject_id', 'first_name_birth', 'last_name_birth', 'Age']
    dtypes = ['VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'INTEGER']
    table_subject = create_table(table_id,
                                 conn=conn,
                                 column_names=column_names,
                                 dtypes=dtypes)
    table_subject.insert_rows([('x5dc', 'mainak', 'jas', 21),
                               ('y5d3', 'anoopum', 'gupta', 25),
                               ('abcd', 'mayank', 'jas', 25)],
                              cols=column_names)
    table_subject.delete_row("first_name_birth LIKE 'ma%'")
    df = table_subject.query()
    assert len(df['first_name_birth']) == 1
Example #5
import psycopg2

from neurobooth_terra import Table, create_table
from neurobooth_terra.fixes import OptionalSSHTunnelForwarder
from neurobooth_terra.dataflow import write_files

from config import ssh_args, db_args

# Flags controlling whether to (re)create the log_file table and whether to
# run the write_files step.
do_create_table = False
write_table = True
dest_dir = '/autofs/nas/neurobooth/data_test/'

with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:

        if do_create_table:
            table_id = 'log_file'
            column_names = ['operation_id', 'log_sensor_file_id', 'src_dirname',
                            'dest_dirname', 'fname', 'time_verified',
                            'rsync_operation', 'is_deleted']
            dtypes = ['SERIAL', 'text', 'text',
                      'text', 'text', 'timestamp', 'text',
                      'boolean']
            create_table(table_id, conn, column_names, dtypes,
                         primary_key='operation_id',
                         foreign_key={'log_sensor_file_id': 'log_sensor_file'})

        if write_table:
            sensor_file_table = Table('log_sensor_file', conn)
            db_table = Table('log_file', conn)
            write_files(sensor_file_table, db_table, dest_dir)


def test_upsert():
    """Test upsert operation."""

    conn = psycopg2.connect(connect_str)

    table_id = 'test'
    drop_table(table_id, conn)

    column_names = [
        'subject_id', 'first_name_birth', 'last_name_birth', 'Age',
        'attributes'
    ]
    dtypes = [
        'VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'INTEGER', 'JSONB'
    ]
    table_subject = create_table(table_id,
                                 conn=conn,
                                 column_names=column_names,
                                 dtypes=dtypes)
    table_subject.insert_rows([('x5dc', 'mainak', 'jas', 21, {
        'a': 1
    }), ('y5d3', 'anoopum', 'gupta', 25, {
        'b': 2
    }), ('abcd', 'mayank', 'jas', 25, {
        'a': 1
    })],
                              cols=column_names)
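    # With on_conflict='nothing', rows whose primary key already exists are
    # skipped rather than overwritten.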
    table_subject.insert_rows([('x5dc', 'mainak_new', 'jas_new', 21, {
        'a': 1
    }), ('zzzz', 'deepak', 'singh', 32, {
        'd': 1
    })],
                              cols=column_names,
                              on_conflict='nothing')
    df = table_subject.query()
    assert 'x5dc' in df.index
    assert 'zzzz' in df.index
    assert df.loc['x5dc']['first_name_birth'] == 'mainak'  # not mainak_new

    df = table_subject.query(where="subject_id = 'x5dc'")
    assert 'x5dc' in df.index
    assert 'zzzz' not in df.index

    # smoke test include_columns
    df = table_subject.query(include_columns='first_name_birth',
                             where="subject_id = 'x5dc'")
    df = table_subject.query(include_columns=['first_name_birth'],
                             where="subject_id = 'x5dc'")

    df = query(conn,
               "SELECT first_name_birth FROM test where subject_id = 'x5dc' ",
               column_names='first_name_birth')

    # With on_conflict='update', the conflicting row is overwritten with the
    # incoming values.
    table_subject.insert_rows([('x5dc', 'mainak', 'jazz', 32, {
        'a': 1
    })],
                              cols=column_names,
                              on_conflict='update')
    df = table_subject.query(where="subject_id = 'x5dc'")
    assert df.loc['x5dc']['last_name_birth'] == 'jazz'

    # create index for conflicts
    cmd = 'DROP INDEX IF EXISTS subject_identifier'
    execute(table_subject.conn, table_subject.cursor, cmd)
    cmd = (f'CREATE UNIQUE INDEX subject_identifier ON '
           f'{table_id} (first_name_birth, last_name_birth);')
    execute(table_subject.conn, table_subject.cursor, cmd)

    table_subject.insert_rows(
        [('x5de', 'mainak', 'jazz', 32, {
            'a': 1
        })],
        cols=column_names,
        on_conflict='update',
        conflict_cols=['first_name_birth', 'last_name_birth'],
        update_cols='subject_id')
    df = table_subject.query()
    assert 'x5de' in df.index
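    # The where clause restricts the update to conflicting rows whose stored
    # Age equals the incoming Age, so 'blah' is applied while 'yyyy' is not.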
    table_subject.insert_rows(
        [('blah', 'mainak', 'jazz', 32, {
            'a': 1
        }), ('yyyy', 'mainak', 'jazz', 31, {
            'a': 1
        })],
        cols=column_names,
        on_conflict='update',
        conflict_cols=['first_name_birth', 'last_name_birth'],
        update_cols='subject_id',
        where='test."Age" = excluded."Age"')
    df = table_subject.query()
    assert 'yyyy' not in df.index
    assert 'blah' in df.index


def test_psql_connection():
    """Test that we can connect to the database."""

    conn = psycopg2.connect(connect_str)

    table_id = 'test'
    drop_table(table_id, conn)
    assert 'test' not in list_tables(conn)

    column_names = ['subject_id', 'first_name_birth', 'last_name_birth', 'Age']
    dtypes = ['VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'INTEGER']
    table_subject = create_table(table_id,
                                 conn=conn,
                                 column_names=column_names,
                                 dtypes=dtypes)
    with pytest.raises(Exception, match='already exists'):
        table_subject = create_table(table_id,
                                     conn=conn,
                                     column_names=column_names,
                                     dtypes=dtypes)
    conn.rollback()  # XXX: needed for the error testing above
    table_subject.insert_rows([('x5dc', 'mainak', 'jas', 21),
                               ('y5d3', 'anoopum', 'gupta', 25)],
                              cols=column_names)
    with pytest.raises(ValueError, match='vals must be a list of tuple'):
        table_subject.insert_rows('blah', ['subject_id'])
    with pytest.raises(ValueError, match='entries in vals must be tuples'):
        table_subject.insert_rows(['blah'], ['subject_id'])
    with pytest.raises(ValueError, match='tuple length must match'):
        table_subject.insert_rows([('x5dc', 'mainak', 'jas')], ['subject_id'])

    assert sorted(table_subject.data_types) == sorted(dtypes)
    table_subject.close()

    table_test = Table('test', conn)
    assert table_test.primary_key == ['subject_id']

    # test updating row
    table_test.update_row('y5d3', ('blah', 'anupum', 'gupta', 32),
                          cols=column_names)
    df = table_test.query()
    assert 'blah' in df.index

    with pytest.raises(ValueError, match='vals must be a tuple'):
        table_test.update_row('blah', 'mainak', ['first_name_birth'])

    # test updating row partially
    table_test.update_row('blah', ('mainak', ), ['first_name_birth'])
    df = table_test.query()
    assert df.loc['blah']['first_name_birth'] == 'mainak'

    with pytest.raises(ValueError, match='column blah is not present'):
        table_test.update_row('blah', ('mainak', ), ['blah'])

    with pytest.raises(ValueError, match='length of vals'):
        table_test.update_row('blah', ('mainak', ),
                              ['first_name_birth', 'last_name_birth'])

    # test dropping a column
    table_test.drop_column(col='subject_id')
    assert 'subject_id' not in table_test.column_names

    # test adding an auto-incrementing default value to the column
    table_test.add_column(col='subject_id', dtype='VARCHAR')
    table_test.alter_column(col='subject_id', default=dict(prefix='SUBJ'))
    pk_val = table_test.insert_rows(
        [('mainak', 'jas', 21)],
        cols=['first_name_birth', 'last_name_birth', 'Age'])
    assert pk_val == 'SUBJ1'
    df = table_test.query()
    assert 'SUBJ1' in df.index

    # test insertion of date
    table_id = 'test_consent'
    drop_table(table_id, conn)

    column_names = ['subject_id', 'consent_date']
    dtypes = ['VARCHAR (255)', 'date']
    table_consent = create_table(table_id, conn, column_names, dtypes)
    date = datetime.datetime.today().strftime('%Y-%m-%d')
    table_consent.insert_rows([('x5dc', date)], cols=column_names)

    conn.close()
Example #8
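# Imports assumed for this snippet; the module path for fetch_survey is a
# guess. rows_metadata, cols_metadata, survey_ids, table_infos and project
# (the REDCap project handle) are defined earlier in the original script and
# are not reproduced here.
from warnings import warn

import psycopg2

from neurobooth_terra import Table, create_table, drop_table
from neurobooth_terra.fixes import OptionalSSHTunnelForwarder
from neurobooth_terra.redcap import fetch_survey

from config import ssh_args, db_args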
with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:

        table_metadata = Table('rc_data_dictionary', conn)
        table_metadata.insert_rows(rows_metadata, cols_metadata,
                                   on_conflict='update')

        # for table_id, table_info in table_infos.items():
        for table_id, _ in survey_ids.items():
            table_info = table_infos[table_id]
            print(f'Overwriting table {table_id}')
            drop_table('rc_' + table_id, conn)
            table = create_table('rc_' + table_id, conn, table_info['columns'],
                                 table_info['dtypes'],
                                 primary_key=['subject_id', 'redcap_event_name'])
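            # Fetch the survey responses from REDCap; record_id is renamed to
            # subject_id to match the table schema.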
            df = fetch_survey(project, survey_name=table_id,
                              survey_id=survey_ids[table_id])
            df = df.rename(columns={'record_id': 'subject_id'})

            # XXX: not consistent.
            complete_col = [col for col in df.columns if
                            col.endswith('complete')]
            if len(complete_col) == 0:
                warn(f'Skipping {table_id} because of missing complete col')
                continue
            df = df[df[complete_col[0]] == 2]

            report_cols = set([col.split('___')[0] for col in df.columns])
            extra_cols = report_cols - (set(table_info['columns']) |