def test_build_exon_missing_key(parsed_exon, key):
    """Check that build_exon() raises KeyError when ``key`` is missing.

    ``parsed_exon`` and ``key`` are supplied by pytest; where they are
    defined is not visible from this test.
    """
    ## GIVEN a dictionary with exon information where key has been removed
    del parsed_exon[key]

    ## WHEN building an exon object from the incomplete dictionary
    ## THEN a KeyError is raised
    with pytest.raises(KeyError):
        build_exon(parsed_exon)
def test_build_exon_inappropriate_type(parsed_exon, key):
    """Check that build_exon() raises TypeError when ``key`` maps to None.

    ``parsed_exon`` and ``key`` are supplied by pytest; where they are
    defined is not visible from this test.
    """
    ## GIVEN a dictionary with exon information where key is set to None
    parsed_exon.update({key: None})

    ## WHEN building an exon object from that dictionary
    ## THEN a TypeError is raised
    with pytest.raises(TypeError):
        build_exon(parsed_exon)
示例#3
0
def test_build_exon_no_hgnc(parsed_exon):
    """Check that build_exon() raises KeyError when ``hgnc_id`` is missing."""
    ## GIVEN a dictionary with exon information where hgnc_id is removed
    parsed_exon.pop('hgnc_id')

    ## WHEN building a exon object
    ## THEN assert that an exception is raised since there is no hgnc_id
    with pytest.raises(KeyError):
        # The return value is not bound: the call is expected to raise
        build_exon(parsed_exon)
def test_build_exon(parsed_exon):
    """Check that build_exon() returns a dict for the parsed exon."""
    ## GIVEN a dictionary with exon information (supplied as parsed_exon)

    ## WHEN building a exon object
    built = build_exon(parsed_exon)

    ## THEN assert that a dictionary is returned
    assert isinstance(built, dict)
示例#5
0
def load_exons(adapter, exon_lines, build='37', nr_exons=None):
    """Build and load all the exons of a build

    Exon information is parsed from ensembl exon lines.

    An exon is only loaded if its gene is among the genes returned by the
    adapter and its transcript is one of the identifier transcripts of that
    gene. Exons are handed to the adapter in batches.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines
        build(str): passed on to the adapter and to build_exon
        nr_exons(int): number of exons used as progress bar length and in
            the summary log; 100000 is used when the value is falsy

    """
    nr_exons = nr_exons or 100000
    # Fetch all genes with ensemblid as keys
    ensembl_genes = adapter.ensembl_genes(build=build, id_transcripts=True)

    LOG.debug("Parsing ensembl exons from iterable")
    exons = parse_ensembl_exons(exon_lines)

    start_insertion = datetime.now()
    loaded_exons = 0
    exon_bulk = []
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)
            # Skip exons whose gene is not known to the adapter
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # Skip exons that are not part of an identifier transcript
            if enst_id not in gene_obj.get('id_transcripts', set()):
                continue

            # NOTE(review): the value is not used below; the lookup is kept
            # so an exon without 'exon_id' still raises KeyError here.
            exon_id = exon['exon_id']

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            exon_bulk.append(exon_obj)
            # Flush the batch once it holds more than 10000 exons
            if len(exon_bulk) > 10000:
                adapter.load_exon_bulk(exon_bulk)
                exon_bulk = []
            loaded_exons += 1

    # Load whatever is left in the last, partially filled batch
    if exon_bulk:
        adapter.load_exon_bulk(exon_bulk)

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() -
                                              start_insertion))
示例#6
0
文件: hgnc.py 项目: hassanfa/scout
 def load_exons(self, exons, genes=None, build='37'):
     """Create exon objects and insert them into the database

     Args:
         exons(iterable(dict))
         genes: passed as second argument to build_exon; fetched with
             self.ensembl_genes(build) when falsy
         build(str): only used when the genes have to be fetched
     """
     genes = genes or self.ensembl_genes(build)
     for exon in exons:
         exon_obj = build_exon(exon, genes)
         # Nothing is inserted when build_exon returns a falsy value
         if not exon_obj:
             continue

         # The insert result is not used, so it is not bound to a name
         self.exon_collection.insert_one(exon_obj)
示例#7
0
 def load_exons(self, exons, genes=None, build='37'):
     """Create exon objects and insert them into the database

     Each exon is turned into an exon object with build_exon and inserted
     one at a time into self.exon_collection.

     Args:
         exons(iterable(dict))
         genes: handed to build_exon together with each exon; when falsy
             it is fetched with self.ensembl_genes(build)
         build(str): build to fetch genes for when genes is not given
     """
     if not genes:
         genes = self.ensembl_genes(build)

     for exon in exons:
         exon_obj = build_exon(exon, genes)
         # Only truthy exon objects are inserted; the insert result is unused
         if exon_obj:
             self.exon_collection.insert_one(exon_obj)
示例#8
0
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Transcript information is from ensembl.
    Check that the transcript that the exon belongs to exists in the database

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable|DataFrame): ensembl exon lines; a DataFrame is
            parsed with parse_ensembl_exon_request, anything else with
            parse_ensembl_exons
        build(str)
        ensembl_genes(dict): Existing genes with ensembl id as keys; fetched
            from the adapter when falsy

    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    hgnc_id_transcripts = adapter.id_transcripts_by_gene(build=build)

    if isinstance(exon_lines, DataFrame):
        exons = parse_ensembl_exon_request(exon_lines)
        # One exon per row of the data frame
        nr_exons = exon_lines.shape[0]
    else:
        exons = parse_ensembl_exons(exon_lines)
        # Fixed value used as progress bar length and in the summary log
        nr_exons = 1000000

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            # Skip exons whose gene is not among the known genes
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # NOTE(review): if hgnc_id_transcripts is a plain dict this
            # lookup raises KeyError for a gene without an entry - confirm
            # what id_transcripts_by_gene returns.
            if enst_id not in hgnc_id_transcripts[hgnc_id]:
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            adapter.load_exon(exon_obj)
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() -
                                              start_insertion))
示例#9
0
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Transcript information is from ensembl.
    An exon is loaded only when its gene is among ensembl_genes and its
    transcript is listed for that gene by adapter.id_transcripts_by_gene.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable|DataFrame): ensembl exon lines, either as a
            DataFrame or as another iterable of lines
        build(str)
        ensembl_genes(dict): Existing genes with ensembl id as keys; when
            falsy they are fetched with adapter.ensembl_genes(build)

    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    hgnc_id_transcripts = adapter.id_transcripts_by_gene(build=build)

    if isinstance(exon_lines, DataFrame):
        exons = parse_ensembl_exon_request(exon_lines)
        # The row count of the data frame is used as number of exons
        nr_exons = exon_lines.shape[0]
    else:
        exons = parse_ensembl_exons(exon_lines)
        # Fixed value; it feeds the progress bar length and the summary log
        nr_exons = 1000000

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            # Exons of unknown genes are ignored
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # NOTE(review): direct indexing; presumably every hgnc_id has an
            # entry in hgnc_id_transcripts - verify, otherwise KeyError.
            if enst_id not in hgnc_id_transcripts[hgnc_id]:
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            adapter.load_exon(exon_obj)
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
    
示例#10
0
def test_build_exon_no_hgnc():
    """Check that build_exon() raises KeyError without an ``hgnc_id``."""
    ## GIVEN a dictionary with exon information that has no hgnc_id
    exon_info = {
        "exon_id": '1',
        "chrom": '1',
        "start": 10,
        "end": 100,
        "transcript": '12',
        "rank": 2,  # Order of exon in transcript
    }

    ## WHEN building a exon object
    ## THEN assert that an exception is raised since there is no hgnc_id
    with pytest.raises(KeyError):
        # The return value is not bound: the call is expected to raise
        build_exon(exon_info)
示例#11
0
def test_build_exon_no_hgnc():
    """Check that exon information lacking ``hgnc_id`` makes build_exon() raise KeyError."""
    ## GIVEN a dictionary with exon information, deliberately without hgnc_id
    exon_info = {
        "exon_id": '1',
        "chrom": '1',
        "start": 10,
        "end": 100,
        "transcript": '12',
        "rank": 2,  # Order of exon in transcript
    }

    ## WHEN building a exon object
    ## THEN assert that an exception is raised since there is no hgnc_id
    with pytest.raises(KeyError):
        # No name is bound to the result; build_exon should not return here
        build_exon(exon_info)
示例#12
0
def test_build_exon():
    """Check that build_exon() returns a dict for complete exon information."""
    ## GIVEN a dictionary with exon information
    raw_exon = dict(
        exon_id='1',
        chrom='1',
        start=10,
        end=100,
        transcript='12',
        hgnc_id=11,
        rank=2,  # Order of exon in transcript
    )

    ## WHEN building a exon object
    built = build_exon(raw_exon)

    ## THEN assert that a dictionary is returned
    assert isinstance(built, dict)
示例#13
0
def test_build_exon():
    """Check that a dict comes back from build_exon() for a full exon record."""
    ## GIVEN a dictionary with exon information
    exon_fields = [
        ("exon_id", '1'),
        ("chrom", '1'),
        ("start", 10),
        ("end", 100),
        ("transcript", '12'),
        ("hgnc_id", 11),
        ("rank", 2),  # Order of exon in transcript
    ]
    exon_record = dict(exon_fields)

    ## WHEN building a exon object
    outcome = build_exon(exon_record)

    ## THEN assert that a dictionary is returned
    assert isinstance(outcome, dict)