def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Creates gen3.treatment, returns set of treatment_ids."""
    # Diagnosis submitter_ids already emitted; used to detect orphan treatments.
    diagnoses = {row['submitter_id']
                 for row in reader('{}/diagnosis.json'.format(output_dir))}
    treatment_emitter = emitter('treatment', output_dir=output_dir)
    treatment_ids = set()
    missing_diagnoses = []
    for p, treatment_type, callback in item_paths:
        # NOTE(review): `callback` is unpacked but never applied here — confirm intent.
        source = os.path.splitext(os.path.basename(p))[0]
        for record in reader(p):
            participantid = record.get('ParticipantID',
                                       record.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, record.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-{}-{}'.format(
                diagnosis_submitter_id, treatment_type, get_uniq(record))
            if diagnosis_submitter_id not in diagnoses:
                # Parent diagnosis was never emitted; record for follow-up and skip.
                missing_diagnoses.append(
                    missing_parent(parent_id=diagnosis_submitter_id,
                                   parent_type='diagnosis',
                                   child_id=treatment_submitter_id,
                                   child_type='treatment'))
                print('skipping missing diagnosis', treatment_submitter_id)
                continue
            if treatment_submitter_id in treatment_ids:
                # Duplicate treatment; emit once only.
                print('skipping ', treatment_submitter_id, p, record.keys())
                continue
            treatment_ids.add(treatment_submitter_id)
            treatment = default_treatment(treatment_submitter_id,
                                          diagnosis_submitter_id,
                                          treatment_type, project_id)
            treatment = obscure_dates(treatment, output_dir=output_dir,
                                      participantid=participantid)
            treatment_emitter.write(treatment)
    save_missing_parents(missing_diagnoses)
    return treatment_ids
def transform(output_dir, compresslevel=0):
    """Read bmeg json and writes gen3 json."""
    # BUG FIX: honor the caller-supplied compresslevel; it was hard-coded to 0,
    # silently ignoring the parameter.
    ssm_emitter = JSONEmitter(
        os.path.join(output_dir, 'submitted_somatic_mutation.json'),
        compresslevel=compresslevel)
    # file gid -> read_group submitter_id, via DerivedFrom edges
    read_groups = {}
    # edge shape: [ "_id", "data", "from", "gid", "label", "to" ]
    for line in reader('source/ccle/DerivedFrom.Edge.json.gz'):
        read_groups[line['from']] = 'read_group-{}'.format(line['to'])
    for line in reader('source/ccle/File.Vertex.json.gz'):
        ssm_submitter_id = line['gid']
        read_group_submitter_id = read_groups[ssm_submitter_id]
        ssm = {
            'type': 'submitted_somatic_mutation',
            '*read_groups': {'submitter_id': read_group_submitter_id}
        }
        ssm['*submitter_id'] = ssm_submitter_id
        ssm['md5sum'] = line['data']['md5']
        ssm['file_size'] = line['data']['size']
        ssm['file_name'] = line['data']['path']
        ssm['experimental_strategy'] = 'etl'
        ssm['data_type'] = 'maf like'
        ssm['data_format'] = 'tsv'
        ssm['data_category'] = 'omics'
        ssm_emitter.write(ssm)
    ssm_emitter.close()
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Creates gen3.treatment, returns set of treatment_ids."""
    # MRN -> OPTR (case submitter_id) mapping from the cases TSV.
    case_lookup = {
        line['MRN']: line['OPTR']
        for line in reader('source/bcc/bcc-cases.tsv')
    }
    biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path in item_paths:
        biomarkers = [line for line in reader(item_path)]

        def add_case(b):
            """Attach the case link, normalize column names, strip PHI columns."""
            case_submitter_id = case_lookup[b['MRN']]
            submitter_id = '{}-{}-bcc_biomarker'.format(
                case_submitter_id, b['ID_Event'])
            # drop PHI / unused columns
            for p in [
                    "MRN",
                    "Participant ID",
                    "_not_available_notes",
                    "_not_available_reason_id",
                    "cBiomarker Label dont use",
            ]:
                del b[p]
            # normalize remaining labkey column names to snake_case
            for p in [
                    "CA19 Values After Specimen Collection",
                    "Order Proc ID",
                    "assay version id",
                    "biomarker level",
                    "unit of measure id",
            ]:
                b[p.replace(' ', '_').lower()] = b.pop(p)
            b['cbiomarker_label'] = b.pop("cBiomarker Label use this")
            biomarker = {
                'type': 'bcc_biomarker',
                'cases': {'submitter_id': case_submitter_id},
                'submitter_id': submitter_id,
                'project_id': project_id
            }
            biomarker.update(b)
            return biomarker

        biomarkers_with_case = [
            add_case(b) for b in biomarkers if b['MRN'] in case_lookup
        ]
        print('there are', len(biomarkers_with_case),
              'biomarkers with cases, out of ', len(biomarkers), 'biomarkers')
        # IDIOM FIX: plain loop for side effects, not a list comprehension.
        for b in biomarkers_with_case:
            biomarker_emitter.write(obscure_dates(b))
    biomarker_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Read bcc labkey json and writes gen3 json."""
    genetrails_emitter = emitter('genetrails_variant', output_dir=output_dir)
    # symbol -> gene submitter_id lookup produced by the gene transform
    with open('output/reference/gene_lookup.tsv') as f:
        gene_lookup = dict(line.split() for line in f)
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            genetrails_variant = {
                'type': 'genetrails_variant',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {
                    'submitter_id': '{}-aliquot'.format(line['sample_code'])
                },
                'submitter_id': line['lsid'],
            }
            # link to the reference gene node when the symbol is known
            if 'gene_symbol' in line and line['gene_symbol'].lower() in gene_lookup:
                line['gene'] = {
                    'submitter_id': gene_lookup[line['gene_symbol'].lower()],
                    'project_id': 'smmart-reference',
                }
            genetrails_variant.update(line)
            genetrails_emitter.write(
                obscure_dates(genetrails_variant, output_dir=output_dir))
    genetrails_emitter.close()
def transform_surgery(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    bcc_treatment_emitter = emitter('bcc_surgery', output_dir=output_dir)
    # PERF: set instead of list — membership test was O(n) per record.
    bcc_treatment_submitter_ids = set()
    # IDIOM: renamed the unpacked `type` (shadowed the builtin).
    for p, _type, callback in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            participantid = line.get('ParticipantID', line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-Surgery-{}'.format(
                diagnosis_submitter_id, get_uniq(line))
            bcc_treatment_submitter_id = '{}-bcc_surgery'.format(treatment_submitter_id)
            if treatment_submitter_id not in treatment_ids:
                # parent treatment was never emitted; skip
                continue
            if bcc_treatment_submitter_id in bcc_treatment_submitter_ids:
                # duplicate surgery record; skip
                continue
            bcc_treatment_submitter_ids.add(bcc_treatment_submitter_id)
            bcc_treatment = {
                'type': 'bcc_surgery',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_submitter_id},
                'submitter_id': bcc_treatment_submitter_id
            }
            # vResectionDate rows carry their own 'type' column that would
            # otherwise clobber the gen3 node type.
            if 'type' in line and p == 'source/bcc/vResectionDate.json':
                del line['type']
            bcc_treatment.update(line)
            bcc_treatment = obscure_dates(bcc_treatment, output_dir=output_dir)
            bcc_treatment_emitter.write(bcc_treatment)
    bcc_treatment_emitter.close()
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Creates gen3.treatment, returns set of treatment_ids."""
    submitted_file_emitter = emitter('submitted_file', output_dir=output_dir)
    # labkey-internal URL columns that must not reach gen3
    labkey_url_keys = (
        "_labkeyurl_data_owner",
        "_labkeyurl_doctype_id",
        "_labkeyurl_document",
        "_labkeyurl_participantid",
    )
    for item_path in item_paths:
        for record in reader(item_path):
            submitted_file = {
                'type': 'bcc_submitted_file',
                'cases': {'submitter_id': record['participantid']},
                'submitter_id': '{}-{}'.format(record['participantid'],
                                               record['document']),
                'project_id': project_id,
            }
            submitted_file.update(record)
            for key in labkey_url_keys:
                del submitted_file[key]
            submitted_file_emitter.write(submitted_file)
    submitted_file_emitter.close()
def transform(item_paths, output_dir, project_id, type, callback=None, compresslevel=0):
    """Read bcc labkey json and writes postgres TSV with embedded gen3 json."""
    path = os.path.join(output_dir, '{}.tsv'.format(type))
    seen = set()
    with open(path, 'w') as output_file:
        for p in item_paths:
            for line in reader(p):
                # node id is deterministic from the lower-cased gid
                node_id = uuid.uuid5(uuid.NAMESPACE_DNS, line['gid'].lower())
                if node_id in seen:
                    continue
                seen.add(node_id)
                line['data']['project_id'] = project_id
                line['data']['submitter_id'] = line['gid'].lower()
                line['node_id'] = node_id
                if callback:
                    line = callback(line)
                # copy node_gene(node_id, acl, _sysan, _props) from stdin csv delimiter E'\x01' quote E'\x02' ;"
                props = json.dumps(line['data'], separators=(',', ':'))
                output_file.write('{}\x01{}\x01{}\x01{}\n'.format(
                    node_id, '{}', '{}', props))
def transform(output_dir, compresslevel=0):
    """Read bmeg json and writes gen3 json."""
    # BUG FIX: honor the caller-supplied compresslevel (was hard-coded to 0).
    read_groups_emitter = JSONEmitter(
        os.path.join(output_dir, 'read_group.json'),
        compresslevel=compresslevel)
    read_groups = {}
    # edge shape: [ "_id", "data", "from", "gid", "label", "to" ]
    # e.g. {"from": "Callset:ccle:ACH-001270:None", "to": "Aliquot:ACH-001270", ...}
    for line in reader('source/ccle/maf.CallsetFor.Edge.json.gz'):
        read_group_submitter_id = 'read_group-{}'.format(line['from'])
        if read_group_submitter_id in read_groups:
            continue  # one read_group per callset
        read_group = {
            'type': 'read_group',
            '*aliquots': {'submitter_id': line['to']}
        }
        read_group['*submitter_id'] = read_group_submitter_id
        read_groups[read_group_submitter_id] = read_group
    # IDIOM: iterate values directly instead of indexing by key.
    for read_group in read_groups.values():
        read_groups_emitter.write(read_group)
    read_groups_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    aliquots_emitter = emitter('aliquot', output_dir=output_dir)
    # Derive one aliquot per previously-emitted sample record.
    for sample in reader('{}/sample.json'.format(output_dir)):
        assert 'submitter_id' in sample, sample
        aliquot = default_aliquot(sample['submitter_id'],
                                  project_id=DEFAULT_PROJECT_ID)
        aliquots_emitter.write(aliquot)
    aliquots_emitter.close()
def sample(item_paths, limit=100):
    """Reads limit number of records from each file in paths."""
    for path in item_paths:
        for count, record in enumerate(reader(path)):
            if count >= limit:
                break  # move on to the next file
            yield record
def lookups():
    """Build {category: {id value: display name}} tables from LOOKUP_PATHS."""
    look_ups = {}
    for path in LOOKUP_PATHS:
        category = path.replace('source/bcc/', '').replace('.json', '')
        table = {}
        look_ups[category] = table
        print(path, category)
        for row in reader(path):
            name = row.get('display_name', row.get('alt_display_name', None))
            # the first non-private *_id column holds the lookup key
            key = [row[k] for k in row
                   if not k.startswith('_') and k.endswith('_id')][0]
            table[key] = name
    return look_ups
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Reads bcc labkey json and writes participantid, dob json."""
    dob_emitter = emitter('bcc_participant_dob', output_dir=output_dir)
    for path in item_paths:
        for record in reader(path):
            # emit only the two fields needed for the DOB cache
            dob_emitter.write({
                'participantid': record['ParticipantID'],
                'DateOfBirth': record['DateOfBirth'],
            })
    dob_emitter.close()
def _DOBs(output_dir):
    """Return the cached {participantid: datetime date-of-birth} mapping,
    loading it from gdan-tmp_participant_dob.json on first call."""
    global DOBs
    if DOBs:
        return DOBs
    # load date of birth cache
    DOBs = {}
    gdan_tmp_participant_path = '{}/gdan-tmp_participant_dob.json'.format(output_dir)
    # BUG FIX: the path variable was referenced as `gdan-tmp_participant_path`
    # (a subtraction expression over undefined names), raising NameError
    # whenever this branch was reached.
    if os.path.isfile(gdan_tmp_participant_path):
        for line in reader(gdan_tmp_participant_path):
            DOBs[line['participantid']] = datetime.strptime(
                line['DateOfBirth'], DATE_FORMAT)
    return DOBs
def upload(path, program, project, submission_client, batch_size, delete_first):
    """Read gen3 json and write to gen3."""
    # NOTE(review): this pool is created but not visibly used in this function —
    # confirm before removing.
    pool = mp.Pool(mp.cpu_count())

    def collect_result(response):
        """Log entity and transactional errors from a submission response."""
        is_error = False
        for entity in response['entities']:
            for error in entity.get('errors', []):
                logger.error('{} {} {}'.format(error['type'], entity['type'], entity))
                is_error = True
        for error in response['transactional_errors']:
            logger.error('transactional_error {}'.format(error))
            logger.error(json.dumps(response))
            is_error = True
        if is_error:
            logger.debug(response)

    for p in glob(path):
        deleted = False
        print(p)
        for lines in grouper(batch_size, reader(p)):
            nodes = [l for l in lines]
            if nodes[0]['type'] == 'project':
                for node in nodes:
                    print('creating program')
                    response = submission_client.create_program(
                        {'name': program,
                         'dbgap_accession_number': program,
                         'type': 'program'})
                    # BUG FIX: these asserts referenced an undefined name `r`
                    # (leftover from commented-out parsing code); use `response`.
                    assert response, 'could not parse response {}'.format(response)
                    assert 'id' in response, 'could not create {} program'.format(response)
                    assert program in response['name'], 'could not create {} program'.format(response)
                    response = submission_client.create_project(program, node)
                    assert response, 'could not parse response'
                    assert 'code' in response, f'Unexpected response {response}'
                    assert response['code'] == 200, 'could not create {} {}'.format(nodes[0]['type'], response)
                    assert 'successful' in response['message'], 'could not create {} {}'.format(nodes[0]['type'], response)
                    print('Created project {}'.format(node['code']), file=sys.stderr)
                continue
            if nodes[0]['type'] == 'experiment':
                project = nodes[0]['projects'][0]['code']
            # delete existing nodes of this type once per file, if requested
            if not deleted and delete_first:
                delete_all(submission_client, program, project, types=[nodes[0]['type']])
                deleted = True
            collect_result(create_node(submission_client, program, project, nodes))
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    cases_emitter = emitter('case', output_dir=output_dir)
    bcc_cases_emitter = emitter('bcc_participant', output_dir=output_dir)
    cases = {}
    bcc_cases = {}
    submitter_ids = []
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for line in reader(path):
            submitter_id = line.get('participantid', line.get('ParticipantID', None))
            submitter_ids.append(submitter_id)
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            cases[submitter_id] = {
                'type': 'case',
                'experiments': {'submitter_id': experiment_code},
                'primary_site': line.get('site', None),
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            bcc_case = {
                'type': 'bcc_participant',
                'case': {'submitter_id': submitter_id},
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            if bcc_submitter_id in bcc_cases:
                # merge dupes: continue accumulating onto the existing record
                bcc_case = bcc_cases[bcc_submitter_id]
            bcc_case.update(line)
            bcc_cases[bcc_submitter_id] = bcc_case
    for k in cases:
        cases_emitter.write(
            obscure_dates(cases[k], participantid=k, output_dir=output_dir))
    # strip PHI columns before emitting bcc_participant records
    phi_keys = ['FirstName', 'MRN', 'LastName', 'DateOfBirth',
                '_labkeyurl_Gender_ID', '_labkeyurl_ParticipantID', 'Gender_ID']
    for k in bcc_cases:
        bcc_case = bcc_cases[k]
        for key in phi_keys:
            del bcc_case[key]
        bcc_cases_emitter.write(obscure_dates(bcc_case, output_dir=output_dir))
    cases_emitter.close()
    bcc_cases_emitter.close()
def diagnosis_lookup_values(paths):
    """Build {category: {diagnosis id: display name}} lookups from bcc json files."""
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_', '').replace(
            '.json', '').replace('diagnoses', 'diagnosis')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name', line.get('diagnosis'))
            val = line.get('rowid', line.get('diagnosis_id'))
            # IDIOM FIX: compare to None with `is`, not `==`.
            if val is None or name is None:
                print(line)
            look_ups[c][val] = name
    return look_ups
def treatment_lookup_values(paths):
    """Build {category: {*_id value: display name}} lookups from bcc json files."""
    look_ups = {}
    for path in paths:
        category = path.replace('source/bcc/', '').replace('.json', '')
        table = {}
        look_ups[category] = table
        for row in reader(path):
            name = row.get('display_name', row.get('alt_display_name', None))
            # every non-private *_id column maps to the same display name
            id_values = [row[key] for key in row
                         if not key.startswith('_') and key.endswith('_id')]
            for value in id_values:
                table[value] = name
    return look_ups
def sample_lookup_values(paths):
    """Build {category: {sample_type_id: display name}} lookups from bcc json files."""
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_', '').replace(
            '.json', '').replace('sample_type', 'sample_type_id')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name')
            val = line.get('sample_type_id')
            # IDIOM FIX: compare to None with `is`, not `==`.
            if val is None or name is None:
                print(line)
            look_ups[c][val] = name
    return look_ups
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Creates gen3.lesion, returns set of lesion_ids."""
    # Case submitter_ids already emitted; used to detect orphan observations.
    cases = {row['submitter_id']
             for row in reader('{}/case.json'.format(output_dir))}
    observation_emitter = emitter('observation', output_dir=output_dir)
    observation_ids = set()
    missing_cases = []
    for path, observation_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            participantid = record.get('ParticipantID',
                                       record.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(
                path, record.keys())
            case_submitter_id = participantid
            observation = default_observation(case_submitter_id, project_id,
                                              record['date'], observation_type,
                                              record)
            observation_submitter_id = observation['submitter_id']
            if case_submitter_id not in cases:
                # parent case missing; record it and skip
                missing_cases.append(
                    missing_parent(parent_id=case_submitter_id,
                                   parent_type='case',
                                   child_id=observation_submitter_id,
                                   child_type='observation'))
                continue
            if observation_submitter_id in observation_ids:
                continue  # duplicate observation
            observation_ids.add(observation_submitter_id)
            observation = obscure_dates(
                observation, output_dir=output_dir,
                participantid=observation['cases']['submitter_id'])
            observation_emitter.write(observation)
    save_missing_parents(missing_cases)
    return observation_ids
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read medable csv and writes gen3 json."""
    somatic_variants_emitter = emitter('somatic_variants2', output_dir=output_dir)
    for line in reader(item_paths[0]):
        # BUG FIX: build submitter_id from the aliquot *string* before it is
        # replaced by the link dict; previously the dict's repr leaked into
        # the submitter_id (e.g. "{'submitter_id': 'X'}-allele-transcript").
        aliquot_id = line['aliquot']
        line['aliquot'] = {'submitter_id': aliquot_id}
        line['submitter_id'] = '{}-{}-{}'.format(aliquot_id,
                                                 line['allele_id'],
                                                 line['ensembl_transcript'])
        line['type'] = 'somatic_variant'
        # these columns are folded into submitter_id and not emitted
        del line['ensembl_transcript']
        del line['allele_id']
        somatic_variants_emitter.write(line)
    somatic_variants_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    demographics_emitter = emitter('demographic', output_dir=output_dir)
    bcc_demographics_emitter = emitter('bcc_demographic', output_dir=output_dir)
    demographics = {}
    bcc_demographics = {}
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for line in reader(path):
            case_submitter_id = line['participantid']
            submitter_id = '{}-demographic'.format(case_submitter_id)
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            demographics[submitter_id] = {
                'type': 'demographic',
                'cases': {'submitter_id': case_submitter_id},
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            bcc_demographic = {
                'type': 'bcc_demographic',
                'demographic': {'submitter_id': submitter_id},
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            if bcc_submitter_id in bcc_demographics:
                # merge dupes: continue accumulating onto the existing record
                bcc_demographic = bcc_demographics[bcc_submitter_id]
            bcc_demographic.update(line)
            bcc_demographics[bcc_submitter_id] = bcc_demographic
    for k in demographics:
        record = obscure_dates(
            demographics[k], output_dir=output_dir,
            participantid=demographics[k]['cases']['submitter_id'])
        demographics_emitter.write(record)
    demographics_emitter.close()
    for k in bcc_demographics:
        bcc_demographics_emitter.write(
            obscure_dates(bcc_demographics[k], output_dir=output_dir))
    bcc_demographics_emitter.close()
def observation_lookup_values(paths):
    """Build {category: {id: display name}} lookups from bcc json files."""
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_', '').replace('.json', '')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name')
            # IDIOM FIX: compare to None with `is`, not `==`.
            val = line.get(f'{c}_id')
            if val is None:
                val = line.get(c)  # fall back to the bare category column
            if name is None:
                name = line.get('type_name', None)
            if val is None or name is None:
                print(c)
            look_ups[c][val] = name
    return look_ups
def transform_old(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    # NOTE(review): genes_emitter is created but nothing is written or closed
    # here — confirm whether this legacy transform is still needed.
    genes_emitter = emitter('gene', output_dir=output_dir)
    genes = {}
    for path in item_paths:
        for record in reader(path):
            participant = record['participantid']
            gene = {
                'type': 'gene',
                'experiments': {'submitter_id': experiment_code},
                'submitter_id': participant,
            }
            if participant in genes:
                # merge duplicate rows for the same participant
                gene = genes[participant]
            gene.update(record)
            genes[participant] = gene
def transform(item_paths, output_dir, project_id, compresslevel=0):
    """Read bcc labkey json and writes postgres TSV with embedded gen3 json."""
    path = os.path.join(output_dir, 'gene.tsv')
    lookup_path = os.path.join(output_dir, 'gene_lookup.tsv')
    with open(lookup_path, 'w') as lookup_file, open(path, 'w') as output_file:
        for p in item_paths:
            for line in reader(p):
                gene_id = line['data']['gene_id'].lower()
                symbol = line['data']['symbol'].lower()
                # node id is deterministic from the lower-cased gene_id
                node_id = uuid.uuid5(uuid.NAMESPACE_DNS, gene_id)
                line['data']['project_id'] = project_id
                line['data']['submitter_id'] = gene_id
                # copy node_gene(node_id, acl, _sysan, _props) from stdin with delimiter E'\t' ;
                props = json.dumps(line['data'], separators=(',', ':'))
                output_file.write('{}\x01{}\x01{}\x01{}\n'.format(
                    node_id, '{}', '{}', props))
                # symbol -> gene_id lookup consumed by downstream transforms
                lookup_file.write('{}\t{}\n'.format(symbol, gene_id))
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    samples_emitter = emitter('sample', output_dir=output_dir)
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for raw in reader(path):
            # NOTE(review): records here are treated as raw id strings, unlike
            # the dict records other transforms read — confirm reader output.
            sample_id = raw.rstrip('\n')
            samples_emitter.write({
                'type': 'sample',
                'cases': {'submitter_id': sample_id},
                'submitter_id': f"sample-{sample_id}",
                'project_id': DEFAULT_PROJECT_ID,
            })
    samples_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Read bcc labkey json and writes gen3 json."""
    bcc_aliquot_emitter = emitter('bcc_aliquot', output_dir=output_dir)
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            bcc_aliquot = {
                'type': 'bcc_aliquot',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {
                    'submitter_id': '{}-aliquot'.format(record['sample_code'])
                },
                'submitter_id': record['lsid'],
            }
            bcc_aliquot.update(record)
            bcc_aliquot_emitter.write(
                obscure_dates(bcc_aliquot, output_dir=output_dir))
    bcc_aliquot_emitter.close()
def upload(path, program, project, submission_client, delete_first, output_dir):
    """Transforms submission record to node and edge files"""
    # For each input file, streams records into one node TSV plus one TSV per
    # edge table, then prints the psql COPY commands the operator should run.
    for p in glob(path):
        # tables is lazily initialized from the first record's type
        tables = None
        for line in reader(p):
            if 'project_id' not in line:
                line['project_id'] = '{}-{}'.format(program, project)
            assert 'project_id' in line, 'must have project_id'
            assert 'submitter_id' in line, 'must have submitter_id'
            if not tables:
                # first record: discover node/edge table names and open output TSVs
                tables = get_tables(submission_client, line)
                tables['handle'] = open(
                    '{}/{}/{}.tsv'.format(output_dir, project,
                                          tables['node_table']), 'w')
                for l in tables['links']:
                    l['handle'] = open(
                        '{}/{}/{}.tsv'.format(output_dir, project,
                                              l['edge_table']), 'w')
                if delete_first:
                    # emit a delete command for the operator to run before COPY
                    # NOTE(review): reconstructed as once-per-file — confirm placement
                    print(
                        "$psql -c \"delete from {} where _props->>'project_id' = '{}-{}' ;\""
                        .format(tables['node_table'], program, project))
            # write one edge row per link, then the node row itself
            for l in tables['links']:
                line = write_edge(l, line, submission_client,
                                  '{}-{}'.format(program, project))
            write_node(tables['handle'], line)
        tables['handle'].close()
        node_path = '{}/{}/{}.tsv'.format(output_dir, project,
                                          tables['node_table'])
        # operator instructions: bulk-load the node TSV
        print(
            "cat $DATA/{} | $psql -c \"copy {}(node_id, acl, _sysan, _props) from stdin csv delimiter E'\\x01' quote E'\\x02' ;\""
            .format(node_path, tables['node_table']))
        for l in tables['links']:
            l['handle'].close()
            edge_path = '{}/{}/{}.tsv'.format(output_dir, project,
                                              l['edge_table'])
            # operator instructions: bulk-load each edge TSV
            print(
                "cat $DATA/{} | $psql -c \"copy {}(src_id, dst_id, acl, _sysan, _props) from stdin csv delimiter E'\\x01' quote E'\\x02' ;\""
                .format(edge_path, l['edge_table']))
def transform(item_paths, output_dir, project_id, type, filter=None):
    """Read bcc labkey json and writes postgres TSV with embedded gen3 json."""
    path = os.path.join(output_dir, '{}.tsv'.format(type))
    dedupes = set()
    with open(path, 'w') as output_file:
        for p in item_paths:
            for edge in reader(p):
                if filter and not filter(edge):
                    continue  # caller-supplied predicate rejected this edge
                src_id = uuid.uuid5(uuid.NAMESPACE_DNS, edge['from'].lower())
                dst_id = uuid.uuid5(uuid.NAMESPACE_DNS, edge['to'].lower())
                dedupe_key = '{}-{}'.format(src_id, dst_id)
                if dedupe_key in dedupes:
                    continue  # already wrote this src->dst pair
                edge['data']['from'] = edge['from']
                edge['data']['to'] = edge['to']
                # copy $type (src_id, dst_id, acl, _sysan, _props) from stdin csv delimiter E'\x01' quote E'\x02' ;"
                props = json.dumps(edge['data'], separators=(',', ':'))
                output_file.write('{}\x01{}\x01{}\x01{}\x01{}\n'.format(
                    src_id, dst_id, '{}', '{}', props))
                dedupes.add(dedupe_key)
def transform(item_paths, output_dir, experiment_code, project_id, compresslevel=0, callback=None):
    """Read bcc labkey json and writes gen3 json."""
    alleles_emitter = emitter('allele', output_dir=output_dir)
    alleles = {}
    for path in item_paths:
        for record in reader(path):
            if callback:
                record = callback(record)
            lsid = record['lsid']
            allele = {
                'type': 'allele',
                'aliquots': {
                    'submitter_id': '{}-aliquot'.format(record['sample_code'])
                },
                'projects': {'code': 'reference'},
                'submitter_id': lsid,
            }
            if lsid in alleles:
                # merge duplicate lsids, keeping previously accumulated fields
                allele = alleles[lsid]
            allele['project_id'] = project_id
            allele.update(record)
            alleles[lsid] = allele
    for key in alleles:
        alleles_emitter.write(
            obscure_dates(alleles[key], output_dir=output_dir))
    alleles_emitter.close()
def transform_biomarker(item_paths, output_dir, project_id, observation_ids, compresslevel=0):
    """Read bcc labkey json and writes gen3 json."""
    bcc_biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for path, observation_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            participantid = record.get('ParticipantID',
                                       record.get('participantid', None))
            observation = default_observation(participantid, project_id,
                                              record['date'], observation_type,
                                              record)
            observation_submitter_id = observation['submitter_id']
            biomarker_submitter_id = '{}-bcc_biomarker'.format(
                observation_submitter_id)
            if observation_submitter_id not in observation_ids:
                # parent observation was never emitted; skip
                print(
                    'transform_biomarker {} not in observation_ids, skipping.'.
                    format(biomarker_submitter_id))
                continue
            bcc_biomarker = {
                'type': 'bcc_biomarker',
                'project_id': project_id,
                'observation': {'submitter_id': observation_submitter_id},
                'submitter_id': biomarker_submitter_id,
            }
            record['source'] = source
            if callback:
                record = callback(record)
            bcc_biomarker.update(record)
            bcc_biomarker_emitter.write(
                obscure_dates(bcc_biomarker, output_dir=output_dir))
    bcc_biomarker_emitter.close()