def create_collection_user_file(parser, uid, orig_filename, file_type):
    # create a MongoDB collection in userdb to store genome_nodes from file
    file_id = 'Gfile_' + str(uuid.uuid4())
    collection = userdb.create_collection(file_id)
    # parse the file in chunks
    finished = False
    while not finished:
        finished = parser.parse_chunk()
        # use the orig filename as source
        parser.metadata['source'] = orig_filename
        genome_nodes, _, _ = parser.get_mongo_nodes()
        update_insert_many(collection, genome_nodes, update=False)
    # save file metadata in UserInfo
    file_num_docs = collection.estimated_document_count()
    file_info = {
        'fileName': orig_filename,
        'fileType': file_type,
        'fileID': file_id,
        'numDocs': file_num_docs,
    }
    update_doc = {
        '$set': {
            '_id': uid,
        },
        '$push': {
            'files': file_info
        }
    }
    UserInfo.update_one({'_id': uid}, update_doc, upsert=True)
    return collection
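# Hedged usage sketch (not part of the original code): how create_collection_user_file might
# be driven for a user-uploaded BED file. The BEDParser_ENCODE choice, the user id, and the
# file names below are illustrative assumptions, not values taken from the real upload flow.
#
#   parser = BEDParser_ENCODE('my_peaks.bed')
#   collection = create_collection_user_file(
#       parser,
#       uid='user_1234',               # hypothetical user id
#       orig_filename='my_peaks.bed',
#       file_type='bed',
#   )
#   print(collection.name)             # the generated 'Gfile_<uuid>' collection name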
def upload():
    # XML for patient info
    os.chdir('BCRXML')
    xml_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.xml'):
                xml_files.append(os.path.join(root, f))
    xml_files.sort()
    all_patient_infonodes = []
    # this is used in MAF parser
    patient_barcode_tumor_site = dict()
    # these are used in CNV parser
    patient_uuid_tumor_site = dict()
    patient_uuid_barcode = dict()
    print(f"Parsing {len(xml_files)} patient xml files")
    for f in xml_files:
        parser = TCGA_XMLParser(f, verbose=True)
        parser.parse()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        # record the tumor site for each patient barcode
        info = info_nodes[0]['info']
        patient_barcode = info['patient_barcode']
        patient_barcode_tumor_site[patient_barcode] = info['biosample']
        patient_uuid = info['patient_uuid']
        patient_uuid_tumor_site[patient_uuid] = info['biosample']
        patient_uuid_barcode[patient_uuid] = patient_barcode
        # collect individual info_nodes for each patient
        all_patient_infonodes += info_nodes
    # upload all patient info_nodes at once
    update_insert_many(InfoNodes, all_patient_infonodes)
def parse_upload_HGNC():
    filename = os.path.basename(HGNC_URL)
    parser = TSVParser_HGNC(filename, verbose=True)
    parser.parse()
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    # patch the gene GenomeNodes
    update_skip_insert(GenomeNodes, genome_nodes)
    # upload the dataSource info node
    update_insert_many(InfoNodes, info_nodes)
def parse_upload():
    parser = VCFParser_ExAC(FILENAME, verbose=True)
    parser.metadata['sourceurl'] = ExAC_URL
    i_chunk = 0
    while True:
        finished = parser.parse_chunk(100000)
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
        if finished:
            break
    update_insert_many(InfoNodes, info_nodes)
def parse_upload():
    parser = VCFParser_dbSNP(FILENAME, verbose=True)
    parser.metadata['sourceurl'] = DBSNP_URL
    i_chunk = 0
    while True:
        finished = parser.parse_chunk()
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
        if finished:
            break
    # we only insert the infonode for dbSNP dataSource once
    update_insert_many(InfoNodes, info_nodes)
def parse_upload_ExAC_chunk():
    filename = os.path.basename(ExAC_URL)
    parser = VCFParser_ExAC(filename, verbose=True)
    parser.metadata['sourceurl'] = ExAC_URL
    i_chunk = 0
    finished = False
    while not finished:
        finished = parser.parse_chunk(100000)
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
    # we only insert the infonode for ExAC dataSource once
    update_insert_many(InfoNodes, info_nodes)
def insert_encode_dataSource():
    from sirius.helpers.constants import DATA_SOURCE_ENCODE
    ds = DATA_SOURCE_ENCODE
    # prevent duplicates
    if not InfoNodes.find_one({'_id': 'I' + ds}):
        update_insert_many(InfoNodes, [{
            '_id': 'I' + ds,
            'type': 'dataSource',
            'name': ds,
            'source': ds,
            'info': {
                'searchURL': SEARCHURL
            }
        }])
def parse_upload_GWAS():
    filename = 'gwas.tsv'
    parser = TSVParser_GWAS(filename, verbose=True)
    parser.parse()
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    # upload the dataSource info node
    update_insert_many(InfoNodes, info_nodes)
    update_insert_many(Edges, edges)
    # patch the GenomeNodes with the data source
    gids = list(parser.parsed_snp_ids)
    uresult = GenomeNodes.update_many(
        {'_id': {'$in': gids}},
        {'$addToSet': {'source': DATA_SOURCE_GWAS}}
    )
    print(f"Prepared {len(gids)} and updated {uresult.matched_count} GenomeNodes with source {DATA_SOURCE_GWAS}")
def parse_upload_KEGG():
    filename = os.path.basename(KEGG_URL)
    # the big tar.gz file contains many individual data files
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    foldername = 'kegg_pathways'
    # aggregate all pathways
    kegg_xmls = sorted([
        os.path.join(foldername, f) for f in os.listdir(foldername)
        if f.startswith('path') and f.endswith('.xml')
    ])
    gene_in_paths = collections.defaultdict(list)
    all_pathway_infonodes = []
    for fname in kegg_xmls:
        parser = KEGG_XMLParser(fname)
        parser.parse()
        _, info_nodes, _ = parser.get_mongo_nodes()
        pathway = info_nodes[0]
        # aggregate all pathways for each gene
        for gene in pathway['info']['genes']:
            gene_in_paths[gene].append(pathway['name'])
        all_pathway_infonodes.append(pathway)
    update_insert_many(InfoNodes, all_pathway_infonodes)
    # prepare genome_nodes for patching
    existing_gene_name_id = dict()
    for gnode in GenomeNodes.find({'type': {'$in': ENSEMBL_GENE_SUBTYPES}},
                                  projection=['_id', 'name']):
        existing_gene_name_id[gnode['name']] = gnode['_id']
    print(f"Pulling existing genes finished, total {len(existing_gene_name_id)} genes")
    genome_nodes = []
    for gene_name, path_names in gene_in_paths.items():
        if gene_name in existing_gene_name_id:
            genome_nodes.append({
                '_id': existing_gene_name_id[gene_name],
                'source': DATA_SOURCE_KEGG,
                'info': {
                    'kegg_pathways': path_names,
                }
            })
    update_skip_insert(GenomeNodes, genome_nodes)
def parse_upload_data(parser, metadata={}):
    parser.parse()
    parser.metadata.update(metadata)
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    update_insert_many(GenomeNodes, genome_nodes)
    update_insert_many(InfoNodes, info_nodes)
    update_insert_many(Edges, edges)
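# Hedged usage sketch (illustrative only): parse_upload_data is a generic helper, so any of the
# parser classes in these scripts could be passed in. Pairing it with GFFParser_ENSEMBL and
# GRCH38_URL (both referenced in the GFF ingestion function further down) is an assumption for
# demonstration, not a documented call site, and the input file name is a placeholder.
#
#   parser = GFFParser_ENSEMBL('Homo_sapiens.GRCh38.gff3.gz', verbose=True)   # assumed filename
#   parse_upload_data(parser, metadata={'sourceurl': GRCH38_URL})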
def parse_upload_ROADMAP_EPIGENOMICS():
    filename = os.path.basename(ROADMAP_EPIGENOMICS_URL)
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    os.chdir('roadmap_sort')
    bedgz_files = sorted([f for f in os.listdir('.') if f.endswith('.bed.gz')])
    print(f"Parsing {len(bedgz_files)} .bed.gz files")
    for i, fname in enumerate(bedgz_files):
        print(f"{i:3d} {fname[:20]:20s} ", end='', flush=True)
        parser = BEDParser_ROADMAP_EPIGENOMICS(fname)
        parser.parse()
        genome_nodes, info_nodes, _ = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        update_insert_many(InfoNodes, info_nodes)
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_ROADMAP_EPIGENOMICS,
        'type': 'dataSource',
        'name': DATA_SOURCE_ROADMAP_EPIGENOMICS,
        'source': DATA_SOURCE_ROADMAP_EPIGENOMICS,
        'info': {
            'filenames': bedgz_files,
        }
    }])
    # finish
    os.chdir('..')
def parse_upload_ImmuneAtlas():
    bedgz_files = sorted([f for f in os.listdir('.') if f.endswith('.bed.gz')])
    print(f"Parsing {len(bedgz_files)} .bed.gz files")
    distinct_biosamples = set()
    for i, fname in enumerate(bedgz_files):
        print(f"{i:3d} {fname[:20]:20s} ", end='', flush=True)
        parser = BEDParser_ImmuneAtlas(fname)
        parser.parse()
        genome_nodes, _, _ = parser.get_mongo_nodes()
        # aggregate all biosamples
        distinct_biosamples.add(genome_nodes[0]['info']['biosample'])
        update_insert_many(GenomeNodes, genome_nodes)
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_ImmuneAtlas,
        'type': 'dataSource',
        'name': DATA_SOURCE_ImmuneAtlas,
        'source': DATA_SOURCE_ImmuneAtlas,
        'info': {
            'biosample': sorted(distinct_biosamples),
        }
    }])
def parse_upload_GTEx_files():
    filename = os.path.basename(GTEx_URL)
    # the big tar.gz file contains many individual data files
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    foldername = filename.split('.', 1)[0]
    # aggregate all biosamples
    distinct_biosamples = set()
    for f in os.listdir(foldername):
        if f.endswith('egenes.txt.gz'):
            fname = os.path.join(foldername, f)
            print(f"Parsing and uploading from {fname}")
            parser = EQTLParser_GTEx(fname, verbose=True)
            parser.parse()
            # the first word in the filename is parsed as the biosample
            biosample = f.split('.', 1)[0]
            # reformat to be consistent with the ENCODE dataset
            biosample = ' '.join(biosample.lower().split('_'))
            distinct_biosamples.add(biosample)
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes({'biosample': biosample})
            # we only insert the edges here for each file
            update_insert_many(Edges, edges)
            # patch the SNPs and genes with the data source
            gids = list(parser.parsed_snp_ids) + list(parser.parsed_gene_ids)
            uresult = GenomeNodes.update_many(
                {'_id': {'$in': gids}},
                {'$addToSet': {'source': DATA_SOURCE_GTEX}}
            )
            print(f"Prepared {len(gids)} and updated {uresult.matched_count} GenomeNodes with source {DATA_SOURCE_GTEX}")
    # change the filename to the big tar.gz file
    info_nodes[0]['info']['filename'] = filename
    info_nodes[0]['info']['biosample'] = list(distinct_biosamples)
    # insert one infonode for the GTEx dataSource
    update_insert_many(InfoNodes, info_nodes)
def parse_upload_bed(metadata, liftover=True):
    filename = metadata['filename']
    parser = BEDParser_ENCODE(filename)
    parser.parse()
    parser.metadata.update(metadata)
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes(liftover=liftover)
    print(f'parsing {filename} results in {len(genome_nodes)} GenomeNodes, {len(info_nodes)} InfoNodes, {len(edges)} Edges')
    print("Uploading to MongoDB")
    update_insert_many(GenomeNodes, genome_nodes, update=False)
    update_insert_many(InfoNodes, info_nodes, update=False)
    update_insert_many(Edges, edges, update=False)
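# Hedged usage sketch: parse_upload_bed expects a metadata dict whose 'filename' key names the
# .bed file to parse; the remaining keys below mirror the ENCODE metadata fields set in main()
# further down and are placeholders here, not real ENCODE accessions or biosamples.
#
#   parse_upload_bed({
#       'filename': 'ENCFF001XYZ.bed',        # hypothetical ENCODE file
#       'biosample': 'GM12878',               # assumed biosample label
#       'accession': 'ENCFF001XYZ',           # placeholder accession
#       'description': 'TF ChIP-seq peaks',
#       'targets': ['CTCF'],
#   }, liftover=False)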
def parse_upload_gff_chunk():
    filename = os.path.basename(GRCH38_URL)
    parser = GFFParser_ENSEMBL(filename, verbose=True)
    parser.metadata['sourceurl'] = GRCH38_URL
    i_chunk = 0
    finished = False
    while not finished:
        finished = parser.parse_chunk()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        update_insert_many(InfoNodes, info_nodes[1:])
        print(f"Data of chunk {i_chunk} uploaded")
        i_chunk += 1
    # we only upload info_nodes[0] once here because all chunks share the same first info node for the dataSource
    update_insert_many(InfoNodes, info_nodes[0:1])
    print("InfoNodes uploaded")
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filename")
    parser.add_argument('datatype', choices=ParserClass.keys(), help='What data are we parsing?')
    parser.add_argument("--url", help='sourceurl of data')
    parser.add_argument("--save", action='store_true', help='Save parsed file to disk')
    parser.add_argument("--upload", action='store_true', help='Upload to MongoDB')
    parser.add_argument("--skip_insert", action='store_true', help='Only update existing docs in MongoDB')
    args = parser.parse_args()
    parser = ParserClass[args.datatype](args.filename, verbose=True)
    parser.parse()
    if args.url:
        parser.metadata['sourceurl'] = args.url
    # set some metadata for demonstration; they should be downloaded from the ENCODE website
    if args.datatype == 'encode':
        parser.metadata['biosample'] = '#biosample#'
        parser.metadata['accession'] = '#accession#'
        parser.metadata['description'] = '#description#'
        parser.metadata['targets'] = ['#Target#']
    if args.save:
        parser.save_json()
        parser.save_mongo_nodes()
    if args.upload:
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        from sirius.mongo import GenomeNodes, InfoNodes, Edges
        if not args.skip_insert:
            print("Uploading to MongoDB")
            update_insert_many(GenomeNodes, genome_nodes)
            update_insert_many(InfoNodes, info_nodes)
            update_insert_many(Edges, edges)
        else:
            print("Updating existing docs in MongoDB")
            update_skip_insert(GenomeNodes, genome_nodes)
            update_skip_insert(InfoNodes, info_nodes)
            update_skip_insert(Edges, edges)
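# Hedged CLI sketch: assuming this script is invoked directly (its actual module name is not
# shown here, so 'this_script.py' is a placeholder), the argparse setup above would support
# command lines along these lines. The 'encode' datatype is one of the ParserClass keys
# referenced in main(); the input file name and URL are placeholders.
#
#   python this_script.py ENCFF001XYZ.bed encode --url https://www.encodeproject.org --upload
#   python this_script.py ENCFF001XYZ.bed encode --upload --skip_insert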
def parse_upload_TCGA_files():
    filename = os.path.basename(TCGA_URL)
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    # three subfolders have been prepared and we will parse them one by one
    # XML for patient info
    os.chdir('BCRXML')
    xml_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.xml'):
                xml_files.append(os.path.join(root, f))
    xml_files.sort()
    all_patient_infonodes = []
    # this is used in the MAF parser
    patient_barcode_tumor_site = dict()
    # these are used in the CNV parser
    patient_uuid_tumor_site = dict()
    patient_uuid_barcode = dict()
    print(f"Parsing {len(xml_files)} patient xml files")
    for f in xml_files:
        parser = TCGA_XMLParser(f, verbose=True)
        parser.parse()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        # record the tumor site for each patient barcode
        info = info_nodes[0]['info']
        patient_barcode = info['patient_barcode']
        patient_barcode_tumor_site[patient_barcode] = info['biosample']
        patient_uuid = info['patient_uuid']
        patient_uuid_tumor_site[patient_uuid] = info['biosample']
        patient_uuid_barcode[patient_uuid] = patient_barcode
        # collect individual info_nodes for each patient
        all_patient_infonodes += info_nodes
    # upload all patient info_nodes at once
    update_insert_many(InfoNodes, all_patient_infonodes)
    os.chdir('..')
    # MAF for mutations in tumors
    os.chdir('MAF')
    maf_files = []
    variant_tags = set()
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.maf.gz'):
                maf_files.append(os.path.join(root, f))
    maf_files.sort()
    print(f"Parsing {len(maf_files)} maf files")
    for i, f in enumerate(maf_files):
        parser = TCGA_MAFParser(f)
        # Parse in chunks since MAF files may be too large to fit in 16 GB of memory
        i_chunk = 0
        finished = False
        while not finished:
            finished = parser.parse_chunk()
            print(f"{i:3d}-{i_chunk:2d} ", end='', flush=True)
            # provide the patient_barcode_tumor_site so the gnode will have 'info.biosample'
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes(patient_barcode_tumor_site)
            # aggregate variant tags
            for gnode in genome_nodes:
                variant_tags.update(gnode['info']['variant_tags'])
            update_insert_many(GenomeNodes, genome_nodes)
            i_chunk += 1
    os.chdir('..')
    # CNV
    os.chdir('CNV')
    cnv_file_caseIDs = dict()
    for d in json.load(open('metadata.json')):
        # Each file only has one case
        cnv_file_caseIDs[d['file_name']] = d['cases'][0]['case_id']
    cnv_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.seg.v2.txt'):
                cnv_files.append(os.path.join(root, f))
    cnv_files.sort()
    print(f"Parsing {len(cnv_files)} cnv files")
    # we parse 1000 files at a time, then upload them all at once
    i_batch, batch_size = 0, 1000
    while True:
        start, end = i_batch * batch_size, (i_batch + 1) * batch_size
        parsing_files = cnv_files[start:end]
        if len(parsing_files) == 0:
            break
        end = start + len(parsing_files)
        print(f"Parsing CNV files {start+1:6d} ~ {end:6d}")
        batch_genome_nodes = []
        for f in parsing_files:
            parser = TCGA_CNVParser(f)
            filebasename = os.path.basename(f)
            patient_uuid = cnv_file_caseIDs[filebasename]
            biosample = patient_uuid_tumor_site.get(patient_uuid, None)
            patient_barcode = patient_uuid_barcode.get(patient_uuid, None)
            # some patient data are not available because they are in the "controlled access" category
            if biosample is None or patient_barcode is None:
                continue
            parser.parse()
            extra_info = {
                'patient_barcode': patient_barcode,
                'biosample': biosample
            }
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes(extra_info)
            batch_genome_nodes += genome_nodes
        update_insert_many(GenomeNodes, batch_genome_nodes)
        i_batch += 1
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_TCGA,
        'type': 'dataSource',
        'name': DATA_SOURCE_TCGA,
        'source': DATA_SOURCE_TCGA,
        'info': {
            'variant_tags': list(variant_tags)
        }
    }])
    # finish
    os.chdir('..')