def ingest_all():
    ib_brainregion = InsertBuffer(reference_ingest.BrainRegion)

    for key in tqdm(keys, position=0):
        fields = key['fields']
        graph_order = atlas[atlas['id'] == key['pk']]['graph_order']
        if np.isnan(graph_order.to_list()[0]):
            graph_order = None
        else:
            graph_order = int(graph_order.iloc[0])
        ib_brainregion.insert1(
            dict(brain_region_pk=key['pk'],
                 acronym=fields['acronym'],
                 brain_region_name=fields['name'],
                 parent=fields['parent'],
                 brain_region_level=fields['level'],
                 graph_order=graph_order))
        if ib_brainregion.flush(skip_duplicates=True, chunksz=1000):
            print('Inserted 1000 brain region tuples.')

    if ib_brainregion.flush(skip_duplicates=True):
        print('Inserted all remaining brain region tuples')
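# Note on the buffering pattern used throughout these ingestion routines, as
# inferred from the call sites here (not from the InsertBuffer implementation
# itself): insert1() only queues a row, flush(..., chunksz=N) writes and
# returns a truthy value once N rows have accumulated, and a final flush()
# without chunksz writes whatever is left after the loop.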
def insert_to_alyxraw(keys, alyxraw_module=alyxraw, alyx_type='all'):

    # use insert buffer to speed up the insertion process
    if alyx_type in ('all', 'main'):
        ib_main = InsertBuffer(alyxraw_module.AlyxRaw)

        # insert into AlyxRaw table
        for key in tqdm(keys, position=0):
            try:
                pk = uuid.UUID(key['pk'])
            except Exception:
                print('Error for key: {}'.format(key))
                continue

            ib_main.insert1(dict(uuid=pk, model=key['model']))
            if ib_main.flush(skip_duplicates=True, chunksz=10000):
                logger.debug('Inserted 10000 raw tuples.')

        if ib_main.flush(skip_duplicates=True):
            logger.debug('Inserted remaining raw tuples')

    if alyx_type in ('all', 'part'):
        ib_part = InsertBuffer(alyxraw_module.AlyxRaw.Field)

        # insert into the part table AlyxRaw.Field
        for ikey, key in tqdm(enumerate(keys), position=0):
            try:
                try:
                    pk = uuid.UUID(key['pk'])
                except ValueError:
                    print('Error for key: {}'.format(key))
                    continue

                key_field = dict(uuid=pk)
                for field_name, field_value in key['fields'].items():
                    key_field = dict(key_field, fname=field_name)

                    if field_name == 'json' and field_value is not None:
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = json.dumps(field_value)
                        # skip json blobs that are too long to store
                        if len(key_field['fvalue']) < 10000:
                            ib_part.insert1(key_field)
                    elif field_name == 'narrative' and field_value is not None:
                        # filter out emoji
                        emoji_pattern = re.compile(
                            "["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = emoji_pattern.sub(r'', field_value)
                        ib_part.insert1(key_field)
                    elif field_value is None or field_value == '' or field_value == [] or \
                            (isinstance(field_value, float) and math.isnan(field_value)):
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = 'None'
                        ib_part.insert1(key_field)
                    elif isinstance(field_value, list) and \
                            isinstance(field_value[0], (dict, str)):
                        for value_idx, value in enumerate(field_value):
                            key_field['value_idx'] = value_idx
                            key_field['fvalue'] = str(value)
                            ib_part.insert1(key_field)
                    else:
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = str(field_value)
                        ib_part.insert1(key_field)

                if ib_part.flush(skip_duplicates=True, chunksz=10000):
                    logger.debug('Inserted 10000 raw field tuples')
            except Exception:
                print('Problematic entry: {}'.format(ikey))
                raise

        if ib_part.flush(skip_duplicates=True):
            logger.debug('Inserted all remaining raw field tuples')
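# For reference, each element of `keys` consumed by insert_to_alyxraw() is a
# record in Django dump format: a dict with a 'pk' (UUID string), a 'model'
# (e.g. 'data.dataset') and a 'fields' dict mapping field names to values.
# The record below is purely illustrative, not real Alyx data.
_example_alyx_record = {
    'pk': '5b91a2bb-3b83-4e6a-9a66-0e29f0c52fa1',   # hypothetical UUID
    'model': 'data.dataset',
    'fields': {
        'name': 'spikes.times.npy',
        'created_by': None,
        'json': None,
    },
}
# insert_to_alyxraw([_example_alyx_record], alyx_type='all')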
def main(excluded_tables=None, modified_pks=None):
    excluded_tables = excluded_tables or []   # avoid a mutable default argument
    kwargs = dict(display_progress=True, suppress_errors=True)

    for t in SHADOW_TABLES:
        if t.__name__ in excluded_tables:
            continue
        print(f'Ingesting shadow table {t.__name__}...')

        if t.__name__ == 'Session' and modified_pks:
            modified_session_keys = [
                {'session_uuid': pk} for pk in modified_pks]
            sessions = acquisition.Session & modified_session_keys
            if sessions:
                modified_session_entries = []
                for key in sessions.fetch('KEY'):
                    try:
                        entry = acquisition.Session.create_entry(key)
                        modified_session_entries.append(entry)
                    except Exception:
                        print('Error creating entry for key: {}'.format(key))
                if modified_session_entries:
                    t.insert(modified_session_entries,
                             allow_direct_insert=True, replace=True)

        t.populate(**kwargs)

    if 'DataSet' not in excluded_tables:
        print('Ingesting dataset entries...')
        key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
            dataset_uuid='uuid') - data.DataSet

        data_set = InsertBuffer(data.DataSet)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_ds = key.copy()
            key['uuid'] = key['dataset_uuid']

            session = grf(key, 'session')
            if not len(acquisition.Session &
                       dict(session_uuid=uuid.UUID(session))):
                print('Session {} is not in the table acquisition.Session'.format(
                    session))
                print('dataset_uuid: {}'.format(str(key['uuid'])))
                continue

            key_ds['subject_uuid'], key_ds['session_start_time'] = \
                (acquisition.Session &
                 dict(session_uuid=uuid.UUID(session))).fetch1(
                    'subject_uuid', 'session_start_time')

            key_ds['dataset_name'] = grf(key, 'name')

            dt = grf(key, 'dataset_type')
            key_ds['dataset_type_name'] = \
                (data.DataSetType & dict(dataset_type_uuid=uuid.UUID(dt))).fetch1(
                    'dataset_type_name')

            user = grf(key, 'created_by')
            if user != 'None':
                try:
                    key_ds['dataset_created_by'] = \
                        (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1(
                            'user_name')
                except Exception:
                    print('Could not look up user {} in reference.LabMember'.format(user))
            else:
                key_ds['dataset_created_by'] = None

            data_format = grf(key, 'data_format')
            key_ds['format_name'] = \
                (data.DataFormat & dict(format_uuid=uuid.UUID(data_format))).fetch1(
                    'format_name')

            key_ds['created_datetime'] = grf(key, 'created_datetime')

            software = grf(key, 'generating_software')
            key_ds['generating_software'] = software if software != 'None' else None

            directory = grf(key, 'provenance_directory')
            key_ds['provenance_directory'] = directory if directory != 'None' else None

            md5 = grf(key, 'md5')
            key_ds['md5'] = md5 if md5 != 'None' else None

            file_size = grf(key, 'file_size')
            key_ds['file_size'] = file_size if file_size != 'None' else None

            data_set.insert1(key_ds)

            if data_set.flush(skip_duplicates=True,
                              allow_direct_insert=True, chunksz=100):
                print('Inserted 100 dataset tuples')

        if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining dataset tuples')

    if 'FileRecord' not in excluded_tables:
        print('Ingesting file record entries...')
        records = alyxraw.AlyxRaw & 'model="data.filerecord"'
        repos = (data.DataRepository &
                 'repo_name LIKE "flatiron%"').fetch('repo_uuid')
        records_flatiron = alyxraw.AlyxRaw.Field & records & \
            'fname = "data_repository"' & \
            [{'fvalue': str(repo)} for repo in repos]
        record_exists = alyxraw.AlyxRaw.Field & records & \
            'fname = "exists"' & 'fvalue="True"'
        key_source = (alyxraw.AlyxRaw & record_exists & records_flatiron).proj(
            record_uuid='uuid') - data.FileRecord

        file_record = InsertBuffer(data.FileRecord)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_fr = key.copy()
            key['uuid'] = key['record_uuid']
            key_fr['exists'] = True

            dataset = grf(key, 'dataset')
            if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))):
                print('Dataset {} is not in the table data.DataSet'.format(dataset))
                print('Record_uuid: {}'.format(str(key['uuid'])))
                continue

            key_fr['subject_uuid'], key_fr['session_start_time'], \
                key_fr['dataset_name'] = \
                (data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))).fetch1(
                    'subject_uuid', 'session_start_time', 'dataset_name')

            repo = grf(key, 'data_repository')
            key_fr['repo_name'] = \
                (data.DataRepository & dict(repo_uuid=uuid.UUID(repo))).fetch1(
                    'repo_name')

            key_fr['relative_path'] = grf(key, 'relative_path')

            file_record.insert1(key_fr)

            if file_record.flush(skip_duplicates=True,
                                 allow_direct_insert=True, chunksz=1000):
                print('Inserted 1000 file record tuples')

        if file_record.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining file record tuples')
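# Illustrative invocation of main() above; the excluded table name and UUID
# below are placeholders, not values taken from the pipeline:
#
#     main(excluded_tables=['DataSet'],
#          modified_pks=['5b91a2bb-3b83-4e6a-9a66-0e29f0c52fa1'])
#
# This would re-run all shadow-table ingestion except DataSet and refresh the
# Session entries whose Alyx records were modified.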
    key_s = dict()
    key_s['subject_uuid'], key_s['session_start_time'] = \
        (acquisition.Session & key).fetch1(
            'subject_uuid', 'session_start_time')

    users = grf(key, 'users', multiple_entries=True)
    for user in users:
        key_su = key_s.copy()
        key_su['user_name'] = \
            (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1(
                'user_name')
        session_user.insert1(key_su)

    if session_user.flush(skip_duplicates=True, chunksz=1000):
        print('Inserted 1000 session user tuples')

if session_user.flush(skip_duplicates=True):
    print('Inserted all remaining session user tuples')

# acquisition.SessionProcedure
print('Ingesting acquisition.SessionProcedure...')
sessions = alyxraw.AlyxRaw & 'model="actions.session"'
sessions_with_procedures = alyxraw.AlyxRaw.Field & sessions & \
    'fname="procedures"' & 'fvalue!="None"'
keys = (alyxraw.AlyxRaw & sessions_with_procedures).proj(session_uuid='uuid')

session_procedure = InsertBuffer(acquisition.SessionProcedure)

for key in tqdm(keys, position=0):
keys = json.load(fid)

# remove invalid uuid from unused tables
keys = [
    key for key in keys
    if key['model'] not in ['auth.group', 'sessions.session', 'authtoken.token']
]

# use insert buffer to speed up the insertion process
ib_main = InsertBuffer(alyxraw.AlyxRaw)
ib_part = InsertBuffer(alyxraw.AlyxRaw.Field)

# insert into AlyxRaw table
for key in keys:
    ib_main.insert1(dict(uuid=uuid.UUID(key['pk']), model=key['model']))
    if ib_main.flush(skip_duplicates=True, chunksz=10000):
        logger.debug('Inserted 10000 raw tuples.')

if ib_main.flush(skip_duplicates=True):
    logger.debug('Inserted remaining raw tuples')

# insert into the part table AlyxRaw.Field
for ikey, key in enumerate(keys):
    try:
        key_field = dict(uuid=uuid.UUID(key['pk']))
        for field_name, field_value in key['fields'].items():
            key_field = dict(key_field, fname=field_name)

            if field_name == 'json' and field_value is not None:
                key_field['value_idx'] = 0
    md5 = grf(key, 'md5')
    key_ds['md5'] = md5 if md5 != 'None' else None

    file_size = grf(key, 'file_size')
    key_ds['file_size'] = file_size if file_size != 'None' else None

    data_set.insert1(key_ds)

    if data_set.flush(skip_duplicates=True,
                      allow_direct_insert=True, chunksz=100):
        print('Inserted 100 dataset tuples')

if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
    print('Inserted all remaining dataset tuples')

# ingest file record entries
records = alyxraw.AlyxRaw & 'model="data.filerecord"'
repos = (data.DataRepository & 'repo_name LIKE "flatiron%"').fetch(
    'repo_uuid')
records_flatiron = alyxraw.AlyxRaw.Field & records & \
    'fname = "data_repository"' & [{'fvalue': str(repo)} for repo in repos]
record_exists = alyxraw.AlyxRaw.Field & records & \
    'fname = "exists"' & 'fvalue="True"'