def create_collection_user_file(parser, uid, orig_filename, file_type):
    # create a MongoDB collection in userdb to store genome_nodes from file
    file_id = 'Gfile_' + str(uuid.uuid4())
    collection = userdb.create_collection(file_id)
    # parse the file in chunks
    finished = False
    while not finished:
        finished = parser.parse_chunk()
        # use the orig filename as source
        parser.metadata['source'] = orig_filename
        genome_nodes, _, _ = parser.get_mongo_nodes()
        update_insert_many(collection, genome_nodes, update=False)
    # save file metadata in UserInfo
    file_num_docs = collection.estimated_document_count()
    file_info = {
        'fileName': orig_filename,
        'fileType': file_type,
        'fileID': file_id,
        'numDocs': file_num_docs,
    }
    update_doc = {
        '$set': {
            '_id': uid,
        },
        '$push': {
            'files': file_info
        }
    }
    UserInfo.update_one({'_id': uid}, update_doc, upsert=True)
    return collection
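# NOTE: update_insert_many is called throughout these snippets but is not defined in
# them. Below is a minimal sketch of what it might look like, assuming pymongo bulk
# upserts keyed on '_id' (hypothetical illustration, not the actual sirius.mongo helper):
def update_insert_many(dbCollection, nodes, update=True):
    from pymongo import InsertOne, UpdateOne
    if not nodes:
        return
    if update:
        # upsert each node, merging its top-level fields into any existing document
        ops = [UpdateOne({'_id': n['_id']},
                         {'$set': {k: v for k, v in n.items() if k != '_id'}},
                         upsert=True)
               for n in nodes]
    else:
        # insert-only mode, as used with update=False above
        ops = [InsertOne(n) for n in nodes]
    dbCollection.bulk_write(ops, ordered=False)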
def upload():
    # XML for patient info
    os.chdir('BCRXML')
    xml_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.xml'):
                xml_files.append(os.path.join(root, f))
    xml_files.sort()
    all_patient_infonodes = []
    # this is used in MAF parser
    patient_barcode_tumor_site = dict()
    # these are used in CNV parser
    patient_uuid_tumor_site = dict()
    patient_uuid_barcode = dict()
    print(f"Parsing {len(xml_files)} patient xml files")
    for f in xml_files:
        parser = TCGA_XMLParser(f, verbose=True)
        parser.parse()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        # record the tumor site for each patient barcode
        info = info_nodes[0]['info']
        patient_barcode = info['patient_barcode']
        patient_barcode_tumor_site[patient_barcode] = info['biosample']
        patient_uuid = info['patient_uuid']
        patient_uuid_tumor_site[patient_uuid] = info['biosample']
        patient_uuid_barcode[patient_uuid] = patient_barcode
        # collect the individual info_nodes for each patient
        all_patient_infonodes += info_nodes
    # upload all patient info_nodes at once
    update_insert_many(InfoNodes, all_patient_infonodes)
def parse_upload_HGNC():
    filename = os.path.basename(HGNC_URL)
    parser = TSVParser_HGNC(filename, verbose=True)
    parser.parse()
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    # patch the gene GenomeNodes
    update_skip_insert(GenomeNodes, genome_nodes)
    # upload the dataSource info node
    update_insert_many(InfoNodes, info_nodes)
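# NOTE: update_skip_insert is likewise undefined in these snippets. Judging from its
# call sites ("patch the gene GenomeNodes", "--skip_insert ... Only update existing
# docs"), a plausible sketch is a bulk $set without upsert, so documents that do not
# already exist are left untouched (hypothetical; the real helper may merge nested
# 'info' fields instead of replacing them):
def update_skip_insert(dbCollection, nodes):
    from pymongo import UpdateOne
    if not nodes:
        return
    ops = [UpdateOne({'_id': n['_id']},
                     {'$set': {k: v for k, v in n.items() if k != '_id'}},
                     upsert=False)
           for n in nodes]
    dbCollection.bulk_write(ops, ordered=False)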
def parse_upload():
    parser = VCFParser_ExAC(FILENAME, verbose=True)
    parser.metadata['sourceurl'] = ExAC_URL
    i_chunk = 0
    while True:
        finished = parser.parse_chunk(100000)
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
        if finished:
            break
    update_insert_many(InfoNodes, info_nodes)
def parse_upload():
    parser = VCFParser_dbSNP(FILENAME, verbose=True)
    parser.metadata['sourceurl'] = DBSNP_URL
    i_chunk = 0
    while True:
        finished = parser.parse_chunk()
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
        if finished:
            break
    # we only insert the infonode for dbSNP dataSource once
    update_insert_many(InfoNodes, info_nodes)
def parse_upload_ExAC_chunk():
    filename = os.path.basename(ExAC_URL)
    parser = VCFParser_ExAC(filename, verbose=True)
    parser.metadata['sourceurl'] = ExAC_URL
    i_chunk = 0
    finished = False
    while not finished:
        finished = parser.parse_chunk(100000)
        print(f'Parsing and uploading chunk {i_chunk}')
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        i_chunk += 1
    # we only insert the infonode for ExAC dataSource once
    update_insert_many(InfoNodes, info_nodes)
def insert_encode_dataSource():
    from sirius.helpers.constants import DATA_SOURCE_ENCODE
    ds = DATA_SOURCE_ENCODE
    # prevent duplicate
    if not InfoNodes.find_one({'_id': 'I' + ds}):
        update_insert_many(InfoNodes, [{
            '_id': 'I' + ds,
            'type': 'dataSource',
            'name': ds,
            'source': ds,
            'info': {
                'searchURL': SEARCHURL
            }
        }])
def parse_upload_GWAS():
    filename = 'gwas.tsv'
    parser = TSVParser_GWAS(filename, verbose=True)
    parser.parse()
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    # upload the dataSource info node
    update_insert_many(InfoNodes, info_nodes)
    update_insert_many(Edges, edges)
    # patch the GenomeNodes with the data source
    gids = list(parser.parsed_snp_ids)
    uresult = GenomeNodes.update_many({'_id': {
        '$in': gids
    }}, {'$addToSet': {
        'source': DATA_SOURCE_GWAS
    }})
    print(
        f"Prepared {len(gids)} and updated {uresult.matched_count} GenomeNodes with source {DATA_SOURCE_GWAS}"
    )
def parse_upload_KEGG():
    filename = os.path.basename(KEGG_URL)
    # the big tar.gz file contains many individual data files
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    foldername = 'kegg_pathways'
    # aggregate all pathways
    kegg_xmls = sorted([
        os.path.join(foldername, f) for f in os.listdir(foldername)
        if f.startswith('path') and f.endswith('.xml')
    ])
    gene_in_paths = collections.defaultdict(list)
    all_pathway_infonodes = []
    for fname in kegg_xmls:
        parser = KEGG_XMLParser(fname)
        parser.parse()
        _, info_nodes, _ = parser.get_mongo_nodes()
        pathway = info_nodes[0]
        # aggregate all pathways for each gene
        for gene in pathway['info']['genes']:
            gene_in_paths[gene].append(pathway['name'])
        all_pathway_infonodes.append(pathway)
    update_insert_many(InfoNodes, all_pathway_infonodes)
    # prepare genome_nodes for patching
    existing_gene_name_id = dict()
    for gnode in GenomeNodes.find({'type': {
            '$in': ENSEMBL_GENE_SUBTYPES
    }},
                                  projection=['_id', 'name']):
        existing_gene_name_id[gnode['name']] = gnode['_id']
    print(
        f"Pulling existing genes finished, total {len(existing_gene_name_id)} genes"
    )
    genome_nodes = []
    for gene_name, path_names in gene_in_paths.items():
        if gene_name in existing_gene_name_id:
            genome_nodes.append({
                '_id': existing_gene_name_id[gene_name],
                'source': DATA_SOURCE_KEGG,
                'info': {
                    'kegg_pathways': path_names,
                }
            })
    update_skip_insert(GenomeNodes, genome_nodes)
def parse_upload_data(parser, metadata=None):
    parser.parse()
    # avoid a mutable default argument; only apply metadata when provided
    if metadata:
        parser.metadata.update(metadata)
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
    update_insert_many(GenomeNodes, genome_nodes)
    update_insert_many(InfoNodes, info_nodes)
    update_insert_many(Edges, edges)
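# Hypothetical usage of the generic helper above, reusing names from these snippets
# (for very large files, the chunked parse_upload_gff_chunk below is preferable):
# parser = GFFParser_ENSEMBL(os.path.basename(GRCH38_URL), verbose=True)
# parse_upload_data(parser, metadata={'sourceurl': GRCH38_URL})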
def parse_upload_ROADMAP_EPIGENOMICS():
    filename = os.path.basename(ROADMAP_EPIGENOMICS_URL)
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    os.chdir('roadmap_sort')
    bedgz_files = sorted([f for f in os.listdir('.') if f.endswith('.bed.gz')])
    print(f"Parsing {len(bedgz_files)} .bed.gz files")
    for i, fname in enumerate(bedgz_files):
        print(f"{i:3d} {fname[:20]:20s} ", end='', flush=True)
        parser = BEDParser_ROADMAP_EPIGENOMICS(fname)
        parser.parse()
        genome_nodes, info_nodes, _ = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        update_insert_many(InfoNodes, info_nodes)
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_ROADMAP_EPIGENOMICS,
        "type": "dataSource",
        'name': DATA_SOURCE_ROADMAP_EPIGENOMICS,
        "source": DATA_SOURCE_ROADMAP_EPIGENOMICS,
        'info': {
            'filenames': bedgz_files,
        }
    }])
    # finish
    os.chdir('..')
def parse_upload_ImmuneAtlas():
    bedgz_files = sorted([f for f in os.listdir('.') if f.endswith('.bed.gz')])
    print(f"Parsing {len(bedgz_files)} .bed.gz files")
    distinct_biosamples = set()
    for i, fname in enumerate(bedgz_files):
        print(f"{i:3d} {fname[:20]:20s} ", end='', flush=True)
        parser = BEDParser_ImmuneAtlas(fname)
        parser.parse()
        genome_nodes, _, _ = parser.get_mongo_nodes()
        # aggregate all biosamples
        distinct_biosamples.add(genome_nodes[0]['info']['biosample'])
        update_insert_many(GenomeNodes, genome_nodes)
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_ImmuneAtlas,
        "type": "dataSource",
        'name': DATA_SOURCE_ImmuneAtlas,
        "source": DATA_SOURCE_ImmuneAtlas,
        'info': {
            'biosample': sorted(distinct_biosamples),
        }
    }])
def parse_upload_GTEx_files():
    filename = os.path.basename(GTEx_URL)
    # the big tar.gz file contains many individual data files
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    foldername = filename.split('.', 1)[0]
    # aggregate all biosamples
    distinct_biosamples = set()
    for f in os.listdir(foldername):
        if f.endswith('egenes.txt.gz'):
            fname = os.path.join(foldername, f)
            print(f"Parsing and uploading from {fname}")
            parser = EQTLParser_GTEx(fname, verbose=True)
            parser.parse()
            # the first word in filename is parsed as the biosample
            biosample = f.split('.', 1)[0]
            # reformat to be consistent with ENCODE dataset
            biosample = ' '.join(biosample.lower().split('_'))
            distinct_biosamples.add(biosample)
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes(
                {'biosample': biosample})
            # we only insert the edges here for each file
            update_insert_many(Edges, edges)
            # patch the SNPs with the data source
            gids = list(parser.parsed_snp_ids) + list(parser.parsed_gene_ids)
            uresult = GenomeNodes.update_many({'_id': {
                '$in': gids
            }}, {'$addToSet': {
                'source': DATA_SOURCE_GTEX
            }})
            print(
                f"Prepared {len(gids)} and updated {uresult.matched_count} GenomeNodes with source {DATA_SOURCE_GTEX}"
            )
    # change the filename to the big tar.gz file
    info_nodes[0]['info']['filename'] = filename
    info_nodes[0]['info']['biosample'] = list(distinct_biosamples)
    # insert one infonode for the GTEx dataSource
    update_insert_many(InfoNodes, info_nodes)
def parse_upload_bed(metadata, liftover=True):
    filename = metadata['filename']
    parser = BEDParser_ENCODE(filename)
    parser.parse()
    parser.metadata.update(metadata)
    genome_nodes, info_nodes, edges = parser.get_mongo_nodes(liftover=liftover)
    print(
        f'parsing {filename} results in {len(genome_nodes)} GenomeNodes, {len(info_nodes)} InfoNodes, {len(edges)} Edges'
    )
    print("Uploading to MongoDB")
    update_insert_many(GenomeNodes, genome_nodes, update=False)
    update_insert_many(InfoNodes, info_nodes, update=False)
    update_insert_many(Edges, edges, update=False)
def parse_upload_gff_chunk():
    filename = os.path.basename(GRCH38_URL)
    parser = GFFParser_ENSEMBL(filename, verbose=True)
    parser.metadata['sourceurl'] = GRCH38_URL
    i_chunk = 0
    finished = False
    while not finished:
        finished = parser.parse_chunk()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        update_insert_many(GenomeNodes, genome_nodes)
        update_insert_many(InfoNodes, info_nodes[1:])
        print(f"Data of chunk {i_chunk} uploaded")
        i_chunk += 1
    # we only upload info_nodes[0] once here because all the chunks have the same first info node for the dataSource.
    update_insert_many(InfoNodes, info_nodes[0:1])
    print("InfoNodes uploaded")
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filename")
    parser.add_argument('datatype', choices=ParserClass.keys(), help='What data are we parsing?')
    parser.add_argument("--url", help='sourceurl of data')
    parser.add_argument("--save", action='store_true', help='Save parsed file to disk')
    parser.add_argument("--upload", action='store_true', help='Upload to MongoDB')
    parser.add_argument("--skip_insert", action='store_true', help='Only update existing docs in MongoDB')
    args = parser.parse_args()

    parser = ParserClass[args.datatype](args.filename, verbose=True)

    parser.parse()

    if args.url:
        parser.metadata['sourceurl'] = args.url

    # set some placeholder metadata for demonstration; in practice these should be downloaded from the ENCODE website
    if args.datatype == 'encode':
        parser.metadata['biosample'] = '#biosample#'
        parser.metadata['accession'] = '#accession#'
        parser.metadata['description'] = '#description#'
        parser.metadata['targets'] = ['#Target#']

    if args.save:
        parser.save_json()
        parser.save_mongo_nodes()

    if args.upload:
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        from sirius.mongo import GenomeNodes, InfoNodes, Edges
        if not args.skip_insert:
            print("Uploading to MongoDB")
            update_insert_many(GenomeNodes, genome_nodes)
            update_insert_many(InfoNodes, info_nodes)
            update_insert_many(Edges, edges)
        else:
            print("Updating existing docs in MongoDB")
            update_skip_insert(GenomeNodes, genome_nodes)
            update_skip_insert(InfoNodes, info_nodes)
            update_skip_insert(Edges, edges)
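# Example invocation sketch for main() (script name and input file are placeholders;
# valid datatype choices come from ParserClass.keys(), of which only 'encode' appears above):
#     python parse_upload_script.py my_file.bed encode --url https://example.org/my_file.bed --upload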
def parse_upload_TCGA_files():
    filename = os.path.basename(TCGA_URL)
    print(f"Decompressing {filename}")
    subprocess.check_call(f"tar zxf {filename} --skip-old-files", shell=True)
    # three subfolders have been prepared and we will parse them one by one
    # XML for patient info
    os.chdir('BCRXML')
    xml_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.xml'):
                xml_files.append(os.path.join(root, f))
    xml_files.sort()
    all_patient_infonodes = []
    # this is used in MAF parser
    patient_barcode_tumor_site = dict()
    # these are used in CNV parser
    patient_uuid_tumor_site = dict()
    patient_uuid_barcode = dict()
    print(f"Parsing {len(xml_files)} patient xml files")
    for f in xml_files:
        parser = TCGA_XMLParser(f, verbose=True)
        parser.parse()
        genome_nodes, info_nodes, edges = parser.get_mongo_nodes()
        # record the tumor site for each patient barcode
        info = info_nodes[0]['info']
        patient_barcode = info['patient_barcode']
        patient_barcode_tumor_site[patient_barcode] = info['biosample']
        patient_uuid = info['patient_uuid']
        patient_uuid_tumor_site[patient_uuid] = info['biosample']
        patient_uuid_barcode[patient_uuid] = patient_barcode
        # collect the individual info_nodes for each patient
        all_patient_infonodes += info_nodes
    # upload all patient info_nodes at once
    update_insert_many(InfoNodes, all_patient_infonodes)
    os.chdir('..')
    # MAF for mutations in tumors
    os.chdir('MAF')
    maf_files = []
    variant_tags = set()
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.maf.gz'):
                maf_files.append(os.path.join(root, f))
    maf_files.sort()
    print(f"Parsing {len(maf_files)} maf files")
    for i, f in enumerate(maf_files):
        parser = TCGA_MAFParser(f)
        # Parse in chunks since MAF files may be too large to fit in 16 GB of memory
        i_chunk = 0
        finished = False
        while not finished:
            finished = parser.parse_chunk()
            print(f"{i:3d}-{i_chunk:2d} ", end='', flush=True)
            # provide the patient_barcode_tumor_site so the gnode will have 'info.biosample'
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes(
                patient_barcode_tumor_site)
            # aggregate variant tags
            for gnode in genome_nodes:
                variant_tags.update(gnode['info']['variant_tags'])
            update_insert_many(GenomeNodes, genome_nodes)
            i_chunk += 1
    os.chdir('..')
    # CNV
    os.chdir('CNV')
    cnv_file_caseIDs = dict()
    for d in json.load(open('metadata.json')):
        # Each file only has one case
        cnv_file_caseIDs[d['file_name']] = d['cases'][0]['case_id']
    cnv_files = []
    for root, d, files in os.walk('.'):
        for f in files:
            if f.endswith('.seg.v2.txt'):
                cnv_files.append(os.path.join(root, f))
    cnv_files.sort()
    print(f"Parsing {len(cnv_files)} cnv files")
    # we parse 1000 files at a time, then upload each batch at once
    i_batch, batch_size = 0, 1000
    while True:
        start, end = i_batch * batch_size, (i_batch + 1) * batch_size
        parsing_files = cnv_files[start:end]
        if not parsing_files:
            break
        end = start + len(parsing_files)
        print(f"Parsing CNV files {start+1:6d} ~ {end:6d}")
        batch_genome_nodes = []
        for f in parsing_files:
            parser = TCGA_CNVParser(f)
            filebasename = os.path.basename(f)
            patient_uuid = cnv_file_caseIDs[filebasename]
            biosample = patient_uuid_tumor_site.get(patient_uuid, None)
            patient_barcode = patient_uuid_barcode.get(patient_uuid, None)
            # some patient data are not available because they are in the "controlled access" category
            if biosample is None or patient_barcode is None:
                continue
            parser.parse()
            extra_info = {
                'patient_barcode': patient_barcode,
                'biosample': biosample
            }
            genome_nodes, info_nodes, edges = parser.get_mongo_nodes(
                extra_info)
            batch_genome_nodes += genome_nodes
        update_insert_many(GenomeNodes, batch_genome_nodes)
        i_batch += 1
    # Add one info node for dataSource
    update_insert_many(InfoNodes, [{
        '_id': 'I' + DATA_SOURCE_TCGA,
        "type": "dataSource",
        'name': DATA_SOURCE_TCGA,
        "source": DATA_SOURCE_TCGA,
        'info': {
            'variant_tags': list(variant_tags)
        }
    }])
    # finish
    os.chdir('..')