Пример #1
0
def test_parse_ensembl_exons_iterable():
    """Parse a single ensembl exon line from a plain iterable and check every field."""
    ## GIVEN a header line and one exon line in tab separated ensembl format
    header_line = (
        "Chromosome/scaffold name\tGene stable ID\tTranscript stable ID\tExon stable ID\tExon"
        " region start (bp)\tExon region end (bp)\t5' UTR start\t5' UTR end\t3' UTR start\t3'"
        " UTR end\tStrand\tExon rank in transcript"
    )
    exon_line = (
        "1\tENSG00000176022\tENST00000379198\tENSE00001439793\t1167629\t1170421\t1167629\t1"
        "167658\t1168649\t1170421\t1\t1"
    )
    exons_handle = [header_line, exon_line]

    ## WHEN parsing the lines and fetching the first parsed exon
    parsed_exon = next(parse_ensembl_exons(exons_handle))

    ## THEN assert that every field of the exon is correctly parsed
    expected_fields = {
        "chrom": "1",
        "ens_exon_id": "ENSE00001439793",
        "transcript": "ENST00000379198",
        "gene": "ENSG00000176022",
        "exon_chrom_start": 1167629,
        "exon_chrom_end": 1170421,
        "5_utr_start": 1167629,
        "5_utr_end": 1167658,
        "3_utr_start": 1168649,
        "3_utr_end": 1170421,
        "rank": 1,
        "strand": 1,
        # start is max(5_utr_end, exon_chrom_start) since strand is 1
        "start": 1167658,
        # end is min(3_utr_start, exon_chrom_end) since strand is 1
        "end": 1168649,
    }
    for field, value in expected_fields.items():
        assert parsed_exon[field] == value
Пример #2
0
def test_parse_ensembl_exons_iterable():
    """Check that one ensembl exon line given as an iterable is parsed field by field."""
    ## GIVEN the ensembl header followed by a single exon line
    header = (
        "Chromosome/scaffold name\tGene stable ID\tTranscript stable ID\tExon stable ID\tExon"
        " region start (bp)\tExon region end (bp)\t5' UTR start\t5' UTR end\t3' UTR start\t3'"
        " UTR end\tStrand\tExon rank in transcript"
    )
    exon_row = (
        "1\tENSG00000176022\tENST00000379198\tENSE00001439793\t1167629\t1170421\t1167629\t1"
        "167658\t1168649\t1170421\t1\t1"
    )

    ## WHEN parsing those lines and taking the first exon
    exon_iter = parse_ensembl_exons([header, exon_row])
    first_exon = next(exon_iter)

    ## THEN assert that the exon is correctly parsed
    expected_pairs = [
        ('chrom', '1'),
        ('ens_exon_id', 'ENSE00001439793'),
        ('transcript', 'ENST00000379198'),
        ('gene', 'ENSG00000176022'),
        ('exon_chrom_start', 1167629),
        ('exon_chrom_end', 1170421),
        ('5_utr_start', 1167629),
        ('5_utr_end', 1167658),
        ('3_utr_start', 1168649),
        ('3_utr_end', 1170421),
        ('rank', 1),
        ('strand', 1),
        # start is max(5_utr_end, exon_chrom_start) since strand is 1
        ('start', 1167658),
        # end is min(3_utr_start, exon_chrom_end) since strand is 1
        ('end', 1168649),
    ]
    for key, expected in expected_pairs:
        assert first_exon[key] == expected
Пример #3
0
def load_exons(adapter, exon_lines, build='37', nr_exons=None):
    """Build and load all the exons of a build

    Transcript information is from ensembl.

    First check that the gene that the transcript belongs to exist in the database.
    If so check that the exon belongs to one of the identifier transcripts of that gene.
    Exons that pass both checks are built and inserted in bulks.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines
        build(str)
        nr_exons(int): Expected number of exons. Only used as the length of the
            progress bar and in the summary log; falls back to 100000 when
            not given.

    """
    nr_exons = nr_exons or 100000
    # Fetch all genes with ensemblid as keys
    ensembl_genes = adapter.ensembl_genes(build=build, id_transcripts=True)

    LOG.debug("Parsing ensembl exons from iterable")
    exons = parse_ensembl_exons(exon_lines)

    start_insertion = datetime.now()
    loaded_exons = 0
    exon_bulk = []
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']

            # Skip exons whose gene is not in the database
            gene_obj = ensembl_genes.get(ensg_id)
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # Skip exons that do not belong to an identifier transcript of the gene
            if enst_id not in gene_obj.get('id_transcripts', set()):
                continue

            exon['hgnc_id'] = hgnc_id

            exon_bulk.append(build_exon(exon, build))
            # Flush once the buffer has grown past 10000 exons
            if len(exon_bulk) > 10000:
                adapter.load_exon_bulk(exon_bulk)
                exon_bulk = []
            loaded_exons += 1

    # Insert whatever is left after the last full bulk
    if exon_bulk:
        adapter.load_exon_bulk(exon_bulk)

    # NOTE: nr_exons is the expected count passed in (or the default), not a measured one
    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() -
                                              start_insertion))
Пример #4
0
def test_parse_ensembl_exons(exons_handle):
    """Parse the exons handle and check the first exon has its basic fields set"""
    ## GIVEN an iterable with exon information from ensembl

    ## WHEN parsing the exons and taking the first one
    first_exon = next(parse_ensembl_exons(exons_handle))

    ## THEN assert the basic fields of the parsed exon are populated
    for field in ("chrom", "gene", "transcript"):
        assert first_exon[field]
Пример #5
0
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Transcript information is from ensembl.
    Check that the gene of the exon exists among the known genes and that the
    transcript of the exon is one of the id transcripts registered for that gene.
    Exons that fail either check are skipped.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines, or a DataFrame
            with the result of an ensembl exon request
        build(str)
        ensembl_genes(dict): Existing genes with ensembl gene id as keys.
            Fetched from the adapter when not given.

    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    hgnc_id_transcripts = adapter.id_transcripts_by_gene(build=build)

    if isinstance(exon_lines, DataFrame):
        exons = parse_ensembl_exon_request(exon_lines)
        nr_exons = exon_lines.shape[0]
    else:
        exons = parse_ensembl_exons(exon_lines)
        # Number of lines is unknown for a plain iterable; use a fixed
        # value for the progress bar length
        nr_exons = 1000000

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # A gene without any registered id transcripts is skipped
            # instead of raising KeyError
            if enst_id not in hgnc_id_transcripts.get(hgnc_id, ()):
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            adapter.load_exon(exon_obj)
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() -
                                              start_insertion))
Пример #6
0
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Transcript information is from ensembl.
    Check that the gene of the exon exists among the known genes and that the
    transcript of the exon is one of the id transcripts registered for that gene.
    Exons that fail either check are skipped.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines, or a DataFrame
            with the result of an ensembl exon request
        build(str)
        ensembl_genes(dict): Existing genes with ensembl gene id as keys.
            Fetched from the adapter when not given.

    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    hgnc_id_transcripts = adapter.id_transcripts_by_gene(build=build)

    if isinstance(exon_lines, DataFrame):
        exons = parse_ensembl_exon_request(exon_lines)
        nr_exons = exon_lines.shape[0]
    else:
        exons = parse_ensembl_exons(exon_lines)
        # Number of lines is unknown for a plain iterable; use a fixed
        # value for the progress bar length
        nr_exons = 1000000

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # A gene without any registered id transcripts is skipped
            # instead of raising KeyError
            if enst_id not in hgnc_id_transcripts.get(hgnc_id, ()):
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            adapter.load_exon(exon_obj)
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
    
Пример #7
0
def test_parse_ensembl_exons_missing_five_utr_end():
    """Parse an ensembl exon line where the 5' UTR start and end columns are empty"""
    ## GIVEN a header line and an exon line with empty 5' UTR columns
    header_line = (
        "Chromosome/scaffold name\tGene stable ID\tTranscript stable ID\tExon stable ID\tExon"
        " region start (bp)\tExon region end (bp)\t5' UTR start\t5' UTR end\t3' UTR start\t3'"
        " UTR end\tStrand\tExon rank in transcript"
    )
    exon_line = (
        "1\tENSG00000176022\tENST00000379198\tENSE00001439793\t1167629\t1170421\t\t"
        "\t1168649\t1170421\t1\t1"
    )

    ## WHEN parsing the lines and fetching the first parsed exon
    parsed_exon = next(parse_ensembl_exons([header_line, exon_line]))

    ## THEN assert that the identifying fields are correctly parsed
    identifiers = {
        "chrom": "1",
        "ens_exon_id": "ENSE00001439793",
        "transcript": "ENST00000379198",
        "gene": "ENSG00000176022",
    }
    for field, value in identifiers.items():
        assert parsed_exon[field] == value

    ## THEN assert that the empty 5' UTR columns are parsed as None
    for utr_field in ("5_utr_start", "5_utr_end"):
        assert parsed_exon[utr_field] is None
Пример #8
0
def exons_38(request, exons_38_handle):
    """Return the ensembl exons parsed from exons_38_handle"""
    print("")
    parsed_exons = parse_ensembl_exons(exons_38_handle)
    return parsed_exons