def test_parse_transcripts_file(transcripts_handle):
    """Test to parse all ensembl transcripts from a file handle"""
    ## GIVEN a handle with ensembl transcript lines
    ## WHEN parsing the transcripts
    transcripts = parse_transcripts(transcripts_handle)

    ## THEN assert that something was parsed, otherwise the loop below
    ## would pass without checking a single transcript
    assert transcripts

    ## THEN assert they all got the mandatory ids
    for transcript_id in transcripts:
        transcript = transcripts[transcript_id]
        assert transcript['ensembl_transcript_id']
        assert transcript['ensembl_gene_id']
def test_parse_transcripts_data_frame(transcripts_df):
    """Check that transcripts parsed from a data frame carry the mandatory ids"""
    ## GIVEN a data frame with transcript information
    ## WHEN parsing the transcripts
    parsed = parse_transcripts(transcripts_df)

    ## THEN assert they all got the mandatory ids
    last_index = 0
    for last_index, tx_id in enumerate(parsed):
        tx_info = parsed[tx_id]
        assert tx_info['ensembl_transcript_id']
        assert tx_info['ensembl_gene_id']

    ## THEN assert that the loop advanced past the first transcript
    assert last_index > 0
def test_parse_transcripts_data_frame(transcripts_df):
    """Parse ensembl transcripts from a data frame and verify the mandatory ids"""
    ## GIVEN a data frame with transcript information

    ## WHEN parsing the transcripts
    tx_by_id = parse_transcripts(transcripts_df)

    ## THEN assert every parsed transcript has both ensembl ids
    idx = 0
    for idx, ens_tx_id in enumerate(tx_by_id):
        parsed_tx = tx_by_id[ens_tx_id]
        assert parsed_tx['ensembl_transcript_id']
        assert parsed_tx['ensembl_gene_id']

    ## THEN assert the iteration went beyond index 0
    assert idx > 0
def parsed_transcripts(request, transcripts_handle, ensembl_genes):
    """Return parsed ensembl transcripts annotated with gene information

    Transcripts whose ensembl gene id is found in ``ensembl_genes`` get the
    keys 'hgnc_id' and 'primary_transcripts' added. Transcripts without a
    matching gene are kept in the result but left unannotated.
    """
    print('')
    transcripts = parse_transcripts(transcripts_handle)

    for tx_info in transcripts.values():
        gene_obj = ensembl_genes.get(tx_info['ensembl_gene_id'])
        if gene_obj:
            tx_info['hgnc_id'] = gene_obj['hgnc_id']
            # Copy into a set so that membership can be checked later
            primary = gene_obj.get('primary_transcripts', [])
            tx_info['primary_transcripts'] = set(primary)

    return transcripts
def load_transcripts(adapter, transcripts_lines=None, build='37', ensembl_genes=None):
    """Parse ensembl transcripts, link them to genes and load them

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with fetch_ensembl_transcripts if None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from
            the adapter if not given (or empty)

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Genes with ensembl id as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map from ensembl transcript id -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy of the keys since entries are removed on the way
    for tx_id in list(transcripts_dict):
        tx_info = transcripts_dict[tx_id]
        gene_id = tx_info['ensembl_gene_id']
        gene = ensembl_genes.get(gene_id)

        if gene:
            # The gene object holds the correct hgnc id
            tx_info['hgnc_id'] = gene['hgnc_id']
            # Primary transcript information comes from the gene object
            tx_info['primary_transcripts'] = set(gene.get('primary_transcripts', []))
        else:
            # Transcripts of genes that are not in the gene map are skipped
            transcripts_dict.pop(tx_id)
            LOG.debug("Gene %s does not exist in build %s", gene_id, build)

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)
    transcript_objs = []

    with progressbar(transcripts_dict.values(), label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:
            # Choose ONE refseq identifier for the transcript, if there is any:
            # The first category in TRANSCRIPT_CATEGORIES that has identifiers
            # decides. Within that category an identifier that is among the
            # primary transcripts is preferred, otherwise an arbitrary one of
            # the identifiers is taken.
            tx_data['is_primary'] = False
            primary_transcripts = tx_data['primary_transcripts']
            refseq_identifier = None

            for category in TRANSCRIPT_CATEGORIES:
                candidates = tx_data[category]
                if candidates:
                    primary_hits = candidates.intersection(primary_transcripts)
                    ref_seq_transcripts += 1
                    if primary_hits:
                        refseq_identifier = primary_hits.pop()
                        tx_data['is_primary'] = True
                        nr_primary_transcripts += 1
                    else:
                        # NOTE: pop removes the identifier from the category set
                        refseq_identifier = candidates.pop()
                    # Categories with lower precedence are not considered
                    break

            if refseq_identifier:
                tx_data['refseq_id'] = refseq_identifier

            # Build the transcript object
            transcript_objs.append(build_transcript(tx_data, build))

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s', ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs
def load_transcripts(adapter, transcripts_lines=None, build='37', ensembl_genes=None):
    """Load all the transcripts

    Transcript information is from ensembl.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with fetch_ensembl_transcripts if None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from
            the adapter if not given (or empty)

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy of the keys since entries are removed in the loop
    for ens_tx_id in list(transcripts_dict):
        parsed_tx = transcripts_dict[ens_tx_id]
        # Get the ens gene id
        ens_gene_id = parsed_tx['ensembl_gene_id']

        # Fetch the internal gene object to find out the correct hgnc id
        gene_obj = ensembl_genes.get(ens_gene_id)
        # If the gene is non existing in scout we skip the transcript
        if not gene_obj:
            transcripts_dict.pop(ens_tx_id)
            LOG.debug("Gene %s does not exist in build %s", ens_gene_id, build)
            continue

        # Add the correct hgnc id
        parsed_tx['hgnc_id'] = gene_obj['hgnc_id']
        # Primary transcript information is collected from HGNC
        parsed_tx['primary_transcripts'] = set(gene_obj.get('primary_transcripts', []))

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(), label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:

            #################### Get the correct refseq identifier ####################
            # We need to decide one refseq identifier for each transcript, if there
            # are any to choose from. The algorithm is as follows:
            # Categories are visited in the order of TRANSCRIPT_CATEGORIES
            # An identifier that is in 'primary_transcripts' is always preferred
            # (if there are several, the last one visited is used)
            # Else the first identifier that is visited is chosen
            # All refseq identifiers are stored in a "refseq_identifiers" list as well
            tx_data['is_primary'] = False
            primary_transcripts = tx_data['primary_transcripts']
            refseq_identifier = None
            refseq_identifiers = []
            for category in TRANSCRIPT_CATEGORIES:
                identifiers = tx_data[category]
                if not identifiers:
                    continue

                for refseq_id in identifiers:
                    # Add all refseq identifiers to refseq_identifiers
                    refseq_identifiers.append(refseq_id)

                    if refseq_id in primary_transcripts:
                        refseq_identifier = refseq_id
                        tx_data['is_primary'] = True

                    if not refseq_identifier:
                        refseq_identifier = refseq_id

            if refseq_identifier:
                tx_data['refseq_id'] = refseq_identifier
            if refseq_identifiers:
                tx_data['refseq_identifiers'] = refseq_identifiers
                # Count the transcript once, no matter how many identifiers it
                # has, so that the number matches the log message below
                ref_seq_transcripts += 1
            if tx_data['is_primary']:
                # Same here, a transcript is counted as primary only once
                nr_primary_transcripts += 1

            #################### #################### ####################

            # Build the transcript object
            tx_obj = build_transcript(tx_data, build)
            transcript_objs.append(tx_obj)

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s', ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs