def test_parse_ensembl_exons_iterable():
    """Parse one ensembl exon line and check every field of the parsed exon"""
    ## GIVEN an iterable with an ensembl header line followed by one exon line
    header_line = "\t".join(
        [
            "Chromosome/scaffold name",
            "Gene stable ID",
            "Transcript stable ID",
            "Exon stable ID",
            "Exon region start (bp)",
            "Exon region end (bp)",
            "5' UTR start",
            "5' UTR end",
            "3' UTR start",
            "3' UTR end",
            "Strand",
            "Exon rank in transcript",
        ]
    )
    exon_line = "\t".join(
        [
            "1",
            "ENSG00000176022",
            "ENST00000379198",
            "ENSE00001439793",
            "1167629",
            "1170421",
            "1167629",
            "1167658",
            "1168649",
            "1170421",
            "1",
            "1",
        ]
    )
    exons_handle = [header_line, exon_line]

    ## WHEN parsing the exons in that iterable and fetching the first one
    parsed_exon = next(parse_ensembl_exons(exons_handle))

    ## THEN assert that the exon is correctly parsed
    expected_fields = (
        ("chrom", "1"),
        ("ens_exon_id", "ENSE00001439793"),
        ("transcript", "ENST00000379198"),
        ("gene", "ENSG00000176022"),
        ("exon_chrom_start", 1167629),
        ("exon_chrom_end", 1170421),
        ("5_utr_start", 1167629),
        ("5_utr_end", 1167658),
        ("3_utr_start", 1168649),
        ("3_utr_end", 1170421),
        ("rank", 1),
        ("strand", 1),
    )
    for field, expected_value in expected_fields:
        assert parsed_exon[field] == expected_value

    ## THEN assert start is max(5_utr_end, exon_chrom_start) since strand is 1
    assert parsed_exon["start"] == 1167658
    ## THEN assert end is min(3_utr_start, exon_chrom_end) since strand is 1
    assert parsed_exon["end"] == 1168649
def test_parse_ensembl_exons_iterable():
    """Check that an ensembl exon line is parsed into the expected exon fields"""
    ## GIVEN an iterable holding the ensembl header and a single exon line
    header = (
        "Chromosome/scaffold name\tGene stable ID\tTranscript stable ID\tExon stable ID\t"
        "Exon region start (bp)\tExon region end (bp)\t5' UTR start\t5' UTR end\t"
        "3' UTR start\t3' UTR end\tStrand\tExon rank in transcript"
    )
    exon_line = (
        "1\tENSG00000176022\tENST00000379198\tENSE00001439793\t"
        "1167629\t1170421\t1167629\t1167658\t1168649\t1170421\t1\t1"
    )
    exons_handle = [header, exon_line]

    ## WHEN parsing the exons and taking the first parsed exon
    exons = parse_ensembl_exons(exons_handle)
    parsed_exon = next(exons)

    ## THEN assert that the exon is correctly parsed
    expected = {
        'chrom': '1',
        'ens_exon_id': 'ENSE00001439793',
        'transcript': 'ENST00000379198',
        'gene': 'ENSG00000176022',
        'exon_chrom_start': 1167629,
        'exon_chrom_end': 1170421,
        '5_utr_start': 1167629,
        '5_utr_end': 1167658,
        '3_utr_start': 1168649,
        '3_utr_end': 1170421,
        'rank': 1,
        'strand': 1,
    }
    for key in expected:
        assert parsed_exon[key] == expected[key]

    ## THEN assert start is max(5_utr_end, exon_chrom_start) since strand is 1
    assert parsed_exon['start'] == 1167658
    ## THEN assert end is min(3_utr_start, exon_chrom_end) since strand is 1
    assert parsed_exon['end'] == 1168649
def load_exons(adapter, exon_lines, build='37', nr_exons=None):
    """Build and load all the exons of a build

    Transcript information is from ensembl.

    An exon is only loaded when its gene is found among the genes fetched from
    the adapter and its transcript is listed in that gene's 'id_transcripts'.
    Accepted exons are built with build_exon and handed to the adapter in bulks.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines
        build(str)
        nr_exons(int): length given to the progress bar, 100000 is used when
                       the value is falsy
    """
    nr_exons = nr_exons or 100000
    # Fetch all genes with ensemblid as keys
    ensembl_genes = adapter.ensembl_genes(build=build, id_transcripts=True)

    LOG.debug("Parsing ensembl exons from iterable")
    exons = parse_ensembl_exons(exon_lines)

    start_insertion = datetime.now()
    loaded_exons = 0
    exon_bulk = []
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']

            # Skip exons that belong to genes that are not in the database
            gene_obj = ensembl_genes.get(ensg_id)
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # Skip exons that belong to transcripts other than the gene's id transcripts
            if enst_id not in gene_obj.get('id_transcripts', set()):
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            exon_bulk.append(exon_obj)

            # Flush the bulk once it has grown past 10000 exons
            if len(exon_bulk) > 10000:
                adapter.load_exon_bulk(exon_bulk)
                exon_bulk = []

            loaded_exons += 1

    # Load whatever is left after the last flush
    if exon_bulk:
        adapter.load_exon_bulk(exon_bulk)

    # NOTE: nr_exons is the progress bar length, not a count of parsed exons
    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
def test_parse_ensembl_exons(exons_handle):
    """Parse the exons handle and check that the first exon has its basic fields set"""
    ## GIVEN a iterable with exon information from ensembl

    ## WHEN parsing the exons and fetching the first parsed exon
    first_exon = next(parse_ensembl_exons(exons_handle))

    ## THEN assert that chromosome, gene and transcript all have truthy values
    for field in ("chrom", "gene", "transcript"):
        assert first_exon[field]
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Transcript information is from ensembl.

    An exon is loaded only when its gene is found in ensembl_genes and its
    transcript is among the id transcripts that the adapter returns for that
    gene's hgnc id.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines, or a DataFrame
                              that is parsed with parse_ensembl_exon_request
        build(str)
        ensembl_genes(dict): Existing genes with ensembl gene id as keys,
                             fetched from the adapter when falsy
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    id_transcripts_by_hgnc = adapter.id_transcripts_by_gene(build=build)

    if not isinstance(exon_lines, DataFrame):
        parsed_exons = parse_ensembl_exons(exon_lines)
        # Fixed length for the progress bar when reading from plain lines
        nr_exons = 1000000
    else:
        parsed_exons = parse_ensembl_exon_request(exon_lines)
        # One row per exon in the data frame
        nr_exons = exon_lines.shape[0]

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(parsed_exons, label="Loading exons", length=nr_exons) as progress:
        for exon_info in progress:
            gene_id = exon_info['gene']
            transcript_id = exon_info['transcript']

            # Only exons of known genes are considered
            gene = ensembl_genes.get(gene_id)
            if not gene:
                continue

            hgnc_id = gene['hgnc_id']

            # Only exons of the gene's id transcripts are loaded
            if transcript_id not in id_transcripts_by_hgnc[hgnc_id]:
                continue

            exon_info['hgnc_id'] = hgnc_id
            adapter.load_exon(build_exon(exon_info, build))
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
def test_parse_ensembl_exons_missing_five_utr_end():
    """Parse an ensembl exon line where both 5' UTR columns are empty"""
    ## GIVEN an iterable with a header line and an exon line without 5' UTR values
    header_line = "\t".join(
        [
            "Chromosome/scaffold name",
            "Gene stable ID",
            "Transcript stable ID",
            "Exon stable ID",
            "Exon region start (bp)",
            "Exon region end (bp)",
            "5' UTR start",
            "5' UTR end",
            "3' UTR start",
            "3' UTR end",
            "Strand",
            "Exon rank in transcript",
        ]
    )
    exon_line = "\t".join(
        [
            "1",
            "ENSG00000176022",
            "ENST00000379198",
            "ENSE00001439793",
            "1167629",
            "1170421",
            "",  # 5' UTR start left empty
            "",  # 5' UTR end left empty
            "1168649",
            "1170421",
            "1",
            "1",
        ]
    )
    exons_handle = [header_line, exon_line]

    ## WHEN parsing the exons in that iterable and fetching the first one
    parsed_exon = next(parse_ensembl_exons(exons_handle))

    ## THEN assert that the identifying fields are correctly parsed
    expected_fields = (
        ("chrom", "1"),
        ("ens_exon_id", "ENSE00001439793"),
        ("transcript", "ENST00000379198"),
        ("gene", "ENSG00000176022"),
    )
    for field, expected_value in expected_fields:
        assert parsed_exon[field] == expected_value

    ## THEN assert that the empty 5' UTR columns are parsed as None
    assert parsed_exon["5_utr_start"] is None
    assert parsed_exon["5_utr_end"] is None
def exons_38(request, exons_38_handle):
    """Get the ensembl exons parsed from exons_38_handle

    Args:
        request: accepted but not used in the body
        exons_38_handle(iterable): passed on to parse_ensembl_exons

    Returns:
        whatever parse_ensembl_exons returns for the handle
    """
    print("")
    parsed_exons = parse_ensembl_exons(exons_38_handle)
    return parsed_exons