Example #1
def ingest_all():

    ib_brainregion = InsertBuffer(reference_ingest.BrainRegion)

    for key in tqdm(keys, position=0):
        fields = key['fields']
        graph_order = atlas[atlas['id'] == key['pk']]['graph_order'].iloc[0]

        if np.isnan(graph_order):
            graph_order = None
        else:
            graph_order = int(graph_order)

        ib_brainregion.insert1(
            dict(brain_region_pk=key['pk'],
                 acronym=fields['acronym'],
                 brain_region_name=fields['name'],
                 parent=fields['parent'],
                 brain_region_level=fields['level'],
                 graph_order=graph_order))
        if ib_brainregion.flush(skip_duplicates=True, chunksz=1000):
            print('Inserted 1000 brain region tuples.')

    if ib_brainregion.flush(skip_duplicates=True):
        print('Inserted all remaining brain region tuples')
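All of the examples on this page follow the same buffered-insert pattern: wrap a DataJoint table in an InsertBuffer, call insert1() once per record, flush periodically with a chunksz, and flush one last time for whatever remains. Below is a minimal sketch of that pattern; the import path, my_schema.MyTable and rows are illustrative assumptions, not taken from the examples themselves.

from ibl_pipeline.ingest import InsertBuffer  # assumed import path

buffer = InsertBuffer(my_schema.MyTable)      # hypothetical DataJoint table

for row in rows:                              # rows: iterable of dicts matching the table attributes
    buffer.insert1(row)
    # judging by the examples on this page, flush() returns a truthy value
    # once `chunksz` buffered rows have been written to the database
    if buffer.flush(skip_duplicates=True, chunksz=1000):
        print('Inserted 1000 tuples.')

# final flush for whatever is still sitting in the buffer
if buffer.flush(skip_duplicates=True):
    print('Inserted all remaining tuples.')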
Example #2
def insert_to_alyxraw(keys, alyxraw_module=alyxraw, alyx_type='all'):

    # use insert buffer to speed up the insertion process
    if alyx_type in ('all', 'main'):

        ib_main = InsertBuffer(alyxraw_module.AlyxRaw)
        # insert into AlyxRaw table
        for key in tqdm(keys, position=0):
            try:
                pk = uuid.UUID(key['pk'])
            except Exception:
                print('Error for key: {}'.format(key))
                continue

            ib_main.insert1(dict(uuid=pk, model=key['model']))
            if ib_main.flush(skip_duplicates=True, chunksz=10000):
                logger.debug('Inserted 10000 raw tuples.')

        if ib_main.flush(skip_duplicates=True):
            logger.debug('Inserted remaining raw tuples')

    if alyx_type in ('all', 'part'):
        ib_part = InsertBuffer(alyxraw_module.AlyxRaw.Field)
        # insert into the part table AlyxRaw.Field
        for ikey, key in tqdm(enumerate(keys), position=0):
            try:
                try:
                    pk = uuid.UUID(key['pk'])
                except ValueError:
                    print('Error for key: {}'.format(key))
                    continue

                key_field = dict(uuid=uuid.UUID(key['pk']))
                for field_name, field_value in key['fields'].items():
                    key_field = dict(key_field, fname=field_name)

                    if field_name == 'json' and field_value is not None:

                        key_field['value_idx'] = 0
                        key_field['fvalue'] = json.dumps(field_value)
                        if len(key_field['fvalue']) < 10000:
                            ib_part.insert1(key_field)
                        else:
                            continue
                    elif field_name == 'narrative' and field_value is not None:
                        # filter out emoji
                        emoji_pattern = re.compile(
                            "["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+",
                            flags=re.UNICODE)

                        key_field['value_idx'] = 0
                        key_field['fvalue'] = emoji_pattern.sub(
                            r'', field_value)
                        ib_part.insert1(key_field)

                    elif field_value is None or field_value == '' or field_value == [] or \
                            (isinstance(field_value, float) and math.isnan(field_value)):
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = 'None'
                        ib_part.insert1(key_field)

                    elif type(field_value) is list and \
                            (type(field_value[0]) is dict or type(field_value[0]) is str):
                        for value_idx, value in enumerate(field_value):
                            key_field['value_idx'] = value_idx
                            key_field['fvalue'] = str(value)
                            ib_part.insert1(key_field)
                    else:
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = str(field_value)
                        ib_part.insert1(key_field)

                    if ib_part.flush(skip_duplicates=True, chunksz=10000):
                        logger.debug('Inserted 10000 raw field tuples')

            except Exception:
                print('Problematic entry: {}'.format(ikey))
                raise

        if ib_part.flush(skip_duplicates=True):
            logger.debug('Inserted all remaining raw field tuples')
Example #3
def main(excluded_tables=(), modified_pks=None):

    kwargs = dict(display_progress=True, suppress_errors=True)

    for t in SHADOW_TABLES:
        if t.__name__ in excluded_tables:
            continue
        print(f'Ingesting shadow table {t.__name__}...')

        if t.__name__ == 'Session' and modified_pks:
            modified_session_keys = [{
                'session_uuid': pk
            } for pk in modified_pks]
            sessions = acquisition.Session & modified_session_keys
            if sessions:
                modified_session_entries = []
                for key in sessions.fetch('KEY'):
                    try:
                        entry = acquisition.Session.create_entry(key)
                        modified_session_entries.append(entry)
                    except Exception:
                        print("Error creating entry for key: {}".format(key))
                if modified_session_entries:
                    t.insert(modified_session_entries,
                             allow_direct_insert=True,
                             replace=True)

        t.populate(**kwargs)

    if 'DataSet' not in excluded_tables:

        print('Ingesting dataset entries...')
        key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
            dataset_uuid="uuid") - data.DataSet

        data_set = InsertBuffer(data.DataSet)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_ds = key.copy()
            key['uuid'] = key['dataset_uuid']

            session = grf(key, 'session')
            if not len(acquisition.Session
                       & dict(session_uuid=uuid.UUID(session))):
                print('Session {} is not in the table acquisition.Session'.
                      format(session))
                print('dataset_uuid: {}'.format(str(key['uuid'])))
                continue

            key_ds['subject_uuid'], key_ds['session_start_time'] = \
                (acquisition.Session &
                    dict(session_uuid=uuid.UUID(session))).fetch1(
                    'subject_uuid', 'session_start_time')

            key_ds['dataset_name'] = grf(key, 'name')

            dt = grf(key, 'dataset_type')
            key_ds['dataset_type_name'] = \
                (data.DataSetType & dict(dataset_type_uuid=uuid.UUID(dt))).fetch1(
                    'dataset_type_name')

            user = grf(key, 'created_by')

            if user != 'None':
                try:
                    key_ds['dataset_created_by'] = \
                        (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1(
                            'user_name')
                except Exception:
                    print(user)
            else:
                key_ds['dataset_created_by'] = None

            data_format = grf(key, 'data_format')
            key_ds['format_name'] = \
                (data.DataFormat & dict(format_uuid=uuid.UUID(data_format))).fetch1(
                    'format_name')

            key_ds['created_datetime'] = grf(key, 'created_datetime')

            software = grf(key, 'generating_software')
            if software != 'None':
                key_ds['generating_software'] = software
            else:
                key_ds['generating_software'] = None

            directory = grf(key, 'provenance_directory')
            if directory != 'None':
                key_ds['provenance_directory'] = directory
            else:
                key_ds['provenance_directory'] = None

            md5 = grf(key, 'md5')
            if md5 != 'None':
                key_ds['md5'] = md5
            else:
                key_ds['md5'] = None

            file_size = grf(key, 'file_size')
            if file_size != 'None':
                key_ds['file_size'] = file_size
            else:
                key_ds['file_size'] = None

            data_set.insert1(key_ds)

            if data_set.flush(skip_duplicates=True,
                              allow_direct_insert=True,
                              chunksz=100):
                print('Inserted 100 dataset tuples')

        if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining dataset tuples')

    if 'FileRecord' not in excluded_tables:
        print('Ingesting file record entries...')
        records = alyxraw.AlyxRaw & 'model="data.filerecord"'
        repos = (data.DataRepository
                 & 'repo_name LIKE "flatiron%"').fetch('repo_uuid')
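        # keep only file-record fields that point at a flatiron data repository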
        records_flatiron = alyxraw.AlyxRaw.Field & records & \
            'fname = "data_repository"' & [{'fvalue': str(repo)} for repo in repos]
        record_exists = alyxraw.AlyxRaw.Field & records & \
            'fname = "exists"' & 'fvalue="True"'
        key_source = (alyxraw.AlyxRaw & record_exists & records_flatiron).proj(
            record_uuid='uuid') - data.FileRecord

        file_record = InsertBuffer(data.FileRecord)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_fr = key.copy()
            key['uuid'] = key['record_uuid']
            key_fr['exists'] = True

            dataset = grf(key, 'dataset')
            if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))):
                print('Dataset {} is not in the table data.DataSet'.format(dataset))
                print('Record_uuid: {}'.format(str(key['uuid'])))
                continue

            key_fr['subject_uuid'], key_fr['session_start_time'], \
                key_fr['dataset_name'] = \
                (data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))).fetch1(
                    'subject_uuid', 'session_start_time', 'dataset_name')

            repo = grf(key, 'data_repository')
            key_fr['repo_name'] = \
                (data.DataRepository & dict(repo_uuid=uuid.UUID(repo))).fetch1(
                    'repo_name')

            key_fr['relative_path'] = grf(key, 'relative_path')

            file_record.insert1(key_fr)

            if file_record.flush(skip_duplicates=True,
                                 allow_direct_insert=True,
                                 chunksz=1000):
                print('Inserted 1000 raw field tuples')

        if file_record.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining file record tuples')
Example #4
    key_s = dict()
    key_s['subject_uuid'], key_s['session_start_time'] = \
        (acquisition.Session & key).fetch1(
            'subject_uuid', 'session_start_time')

    users = grf(key, 'users', multiple_entries=True)

    for user in users:
        key_su = key_s.copy()
        key_su['user_name'] = \
            (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1(
                'user_name')

        session_user.insert1(key_su)
        if session_user.flush(skip_duplicates=True, chunksz=1000):
            print('Inserted 1000 session user tuples')

if session_user.flush(skip_duplicates=True):
    print('Inserted all remaining session user tuples')

# acquisition.SessionProcedure
print('Ingesting acquisition.SessionProcedure...')
sessions = alyxraw.AlyxRaw & 'model="actions.session"'
sessions_with_procedures = alyxraw.AlyxRaw.Field & sessions & \
    'fname="procedures"' & 'fvalue!="None"'
keys = (alyxraw.AlyxRaw & sessions_with_procedures).proj(session_uuid='uuid')

session_procedure = InsertBuffer(acquisition.SessionProcedure)
Example #5
with open(filename, 'r') as fid:
    keys = json.load(fid)

# remove invalid uuid from unused tables
keys = [
    key for key in keys if key['model'] not in
    ['auth.group', 'sessions.session', 'authtoken.token']
]

# use insert buffer to speed up the insertion process
ib_main = InsertBuffer(alyxraw.AlyxRaw)
ib_part = InsertBuffer(alyxraw.AlyxRaw.Field)

# insert into AlyxRaw table
for key in keys:
    ib_main.insert1(dict(uuid=uuid.UUID(key['pk']), model=key['model']))
    if ib_main.flush(skip_duplicates=True, chunksz=10000):
        logger.debug('Inserted 10000 raw tuples.')

if ib_main.flush(skip_duplicates=True):
    logger.debug('Inserted remaining raw tuples')

# insert into the part table AlyxRaw.Field
for ikey, key in enumerate(keys):
    try:
        key_field = dict(uuid=uuid.UUID(key['pk']))
        for field_name, field_value in key['fields'].items():
            key_field = dict(key_field, fname=field_name)

            if field_name == 'json' and field_value is not None:
Example #6
    else:
        key_ds['provenance_directory'] = None

    md5 = grf(key, 'md5')
    if md5 != 'None':
        key_ds['md5'] = md5
    else:
        key_ds['md5'] = None

    file_size = grf(key, 'file_size')
    if file_size != 'None':
        key_ds['file_size'] = file_size
    else:
        key_ds['file_size'] = None

    data_set.insert1(key_ds)

    if data_set.flush(
            skip_duplicates=True,
            allow_direct_insert=True, chunksz=100):
        print('Inserted 100 dataset tuples')

if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
    print('Inserted all remaining dataset tuples')


# ingest file record entries
records = alyxraw.AlyxRaw & 'model="data.filerecord"'
repos = (data.DataRepository & 'repo_name LIKE "flatiron%"').fetch(
    'repo_uuid')
records_flatiron = alyxraw.AlyxRaw.Field & records & \
Example #7
    key['uuid'] = key['record_uuid']
    key_fr['exists'] = True

    dataset = grf(key, 'dataset')
    if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))):
        print('Dataset {} is not in the table data.DataSet'.format(dataset))
        print('Record_uuid: {}'.format(str(key['uuid'])))
        continue

    key_fr['subject_uuid'], key_fr['session_start_time'], \
        key_fr['dataset_name'] = \
        (data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))).fetch1(
            'subject_uuid', 'session_start_time', 'dataset_name')

    repo = grf(key, 'data_repository')
    key_fr['repo_name'] = \
        (data.DataRepository & dict(repo_uuid=uuid.UUID(repo))).fetch1(
            'repo_name')

    key_fr['relative_path'] = grf(key, 'relative_path')

    file_record.insert1(key_fr)

    if file_record.flush(skip_duplicates=True,
                         allow_direct_insert=True,
                         chunksz=1000):
        print('Inserted 1000 raw field tuples')

if file_record.flush(skip_duplicates=True, allow_direct_insert=True):
    print('Inserted all remaining file record tuples')