def test_parse_transcripts_file(transcripts_handle):
    """Test to parse all ensembl transcripts"""
    ## GIVEN the transcripts_handle fixture
    ## WHEN parsing the transcripts
    transcripts = parse_transcripts(transcripts_handle)

    ## THEN assert that something was parsed, otherwise the loop below
    ## would never run and the test would pass without checking anything
    assert transcripts

    for transcript_id in transcripts:
        transcript = transcripts[transcript_id]
        ## THEN assert they all got the mandatory ids
        assert transcript['ensembl_transcript_id']
        assert transcript['ensembl_gene_id']
def test_parse_transcripts_file(transcripts_handle):
    """Test to parse all ensembl transcripts"""
    ## GIVEN the transcripts_handle fixture
    ## WHEN parsing the transcripts
    transcripts = parse_transcripts(transcripts_handle)

    ## THEN assert the result is not empty; without this check an empty
    ## result would skip the loop and the test would pass vacuously
    assert transcripts

    for transcript_id in transcripts:
        transcript = transcripts[transcript_id]
        ## THEN assert they all got the mandatory ids
        assert transcript['ensembl_transcript_id']
        assert transcript['ensembl_gene_id']
def test_parse_transcripts_data_frame(transcripts_df):
    """Check that transcripts parsed from a data frame carry the mandatory ids"""
    ## GIVEN a data frame with transcript information
    ## WHEN parsing the transcripts
    parsed = parse_transcripts(transcripts_df)

    # Stays 0 when nothing is parsed, so the final assert fails in that case
    last_index = 0
    for last_index, tx_id in enumerate(parsed):
        tx_info = parsed[tx_id]
        ## THEN assert they all got the mandatory ids
        assert tx_info['ensembl_transcript_id']
        assert tx_info['ensembl_gene_id']

    ## THEN assert the loop got past index 0, i.e. more than one transcript
    assert last_index > 0
def test_parse_transcripts_data_frame(transcripts_df):
    """Verify the mandatory ids on transcripts parsed from a data frame"""
    ## GIVEN a data frame with transcript information
    ## WHEN parsing the transcripts
    parsed = parse_transcripts(transcripts_df)
    tx_ids = list(parsed)

    for tx_id in tx_ids:
        tx_info = parsed[tx_id]
        ## THEN assert they all got the mandatory ids
        assert tx_info['ensembl_transcript_id']
        assert tx_info['ensembl_gene_id']

    ## THEN assert that more than one transcript was parsed
    ## (same condition as a final zero-based enumerate index > 0)
    assert len(tx_ids) > 1
Exemplo n.º 5
0
def parsed_transcripts(request, transcripts_handle, ensembl_genes):
    """Parse the ensembl transcripts and annotate them with gene information

    Transcripts whose ensembl gene id has no (truthy) entry in
    ensembl_genes are returned without 'hgnc_id' and
    'primary_transcripts'.
    """
    print('')
    parsed = parse_transcripts(transcripts_handle)
    for tx in parsed.values():
        gene = ensembl_genes.get(tx['ensembl_gene_id'])
        if gene:
            # Copy the gene level information onto the transcript
            tx['hgnc_id'] = gene['hgnc_id']
            tx['primary_transcripts'] = set(
                gene.get('primary_transcripts', []))

    return parsed
Exemplo n.º 6
0
def load_transcripts(adapter,
                     transcripts_lines=None,
                     build='37',
                     ensembl_genes=None):
    """Parse ensembl transcripts, build transcript objects and bulk load them

    Transcripts whose ensembl gene id is missing from ensembl_genes are
    dropped. For the remaining ones a single refseq identifier is picked
    (when any exists) before the transcript object is built.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with fetch_ensembl_transcripts when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from
            the adapter when not given (or empty)

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Genes keyed on ensembl id; a falsy argument triggers the adapter lookup
    if not ensembl_genes:
        ensembl_genes = adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # enst id -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a snapshot of the keys since entries are removed on the way
    for enst_id in list(transcripts_dict):
        tx_info = transcripts_dict[enst_id]
        gene_id = tx_info['ensembl_gene_id']
        # The internal gene object holds the correct hgnc id
        gene = ensembl_genes.get(gene_id)
        if gene:
            tx_info['hgnc_id'] = gene['hgnc_id']
            # Primary transcript information is collected from HGNC
            tx_info['primary_transcripts'] = set(
                gene.get('primary_transcripts', []))
        else:
            # Unknown gene: the transcript is skipped altogether
            del transcripts_dict[enst_id]
            LOG.debug("Gene %s does not exist in build %s", gene_id, build)

    nr_transcripts = len(transcripts_dict)
    nr_with_refseq = 0
    nr_primary = 0
    transcript_objs = []

    with progressbar(transcripts_dict.values(),
                     label="Building transcripts",
                     length=nr_transcripts) as tx_bar:
        for tx_data in tx_bar:
            # Choose ONE refseq identifier per transcript, if there is any.
            # Only the first category in TRANSCRIPT_CATEGORIES that has
            # identifiers is considered:
            #   - an identifier that is also in 'primary_transcripts' is
            #     preferred and marks the transcript as primary
            #   - otherwise an arbitrary identifier of that category is used
            tx_data['is_primary'] = False
            primary_ids = tx_data['primary_transcripts']
            chosen_refseq = None
            for category in TRANSCRIPT_CATEGORIES:
                candidates = tx_data[category]
                if candidates:
                    nr_with_refseq += 1
                    primary_hits = candidates.intersection(primary_ids)
                    if primary_hits:
                        chosen_refseq = primary_hits.pop()
                        tx_data['is_primary'] = True
                        nr_primary += 1
                    else:
                        # NOTE: pop() removes the identifier from the
                        # category set stored in tx_data
                        chosen_refseq = candidates.pop()
                    # Later categories are ignored once one had identifiers
                    break

            if chosen_refseq:
                tx_data['refseq_id'] = chosen_refseq

            transcript_objs.append(build_transcript(tx_data, build))

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s',
             nr_with_refseq)
    LOG.info('Number of primary transcripts: %s', nr_primary)

    return transcript_objs
Exemplo n.º 7
0
def load_transcripts(adapter, transcripts_lines=None, build='37', ensembl_genes=None):
    """Load all the transcripts

    Transcript information is from ensembl. Transcripts whose ensembl gene id
    is not found in ensembl_genes are skipped.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with fetch_ensembl_transcripts when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from the
            adapter when not given (or empty)

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)
    # Iterate over a copy of the keys since entries are removed in the loop
    for ens_tx_id in list(transcripts_dict):
        parsed_tx = transcripts_dict[ens_tx_id]
        # Get the ens gene id
        ens_gene_id = parsed_tx['ensembl_gene_id']

        # Fetch the internal gene object to find out the correct hgnc id
        gene_obj = ensembl_genes.get(ens_gene_id)
        # If the gene is non existing in scout we skip the transcript
        if not gene_obj:
            transcripts_dict.pop(ens_tx_id)
            LOG.debug("Gene %s does not exist in build %s", ens_gene_id, build)
            continue

        # Add the correct hgnc id
        parsed_tx['hgnc_id'] = gene_obj['hgnc_id']
        # Primary transcript information is collected from HGNC
        parsed_tx['primary_transcripts'] = set(gene_obj.get('primary_transcripts', []))

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(), label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:

            #################### Get the correct refseq identifier ####################
            # We need to decide one refseq identifier for each transcript, if there
            # are any to choose from. All categories in TRANSCRIPT_CATEGORIES are
            # visited in order:
            # - An identifier that is in 'primary_transcripts' is always chosen
            #   (the last such identifier wins) and marks the transcript as primary
            # - Otherwise the first identifier that was encountered is chosen
            # All refseq identifiers are stored in the "refseq_identifiers" list
            tx_data['is_primary'] = False
            primary_transcripts = tx_data['primary_transcripts']
            refseq_identifier = None
            refseq_identifiers = []
            for category in TRANSCRIPT_CATEGORIES:
                identifiers = tx_data[category]
                if not identifiers:
                    continue

                for refseq_id in identifiers:
                    # Add all refseq identifiers to refseq_identifiers
                    refseq_identifiers.append(refseq_id)

                    if refseq_id in primary_transcripts:
                        refseq_identifier = refseq_id
                        tx_data['is_primary'] = True

                    if not refseq_identifier:
                        refseq_identifier = refseq_id

            if refseq_identifier:
                tx_data['refseq_id'] = refseq_identifier
            if refseq_identifiers:
                tx_data['refseq_identifiers'] = refseq_identifiers
                # Count the transcript once, no matter how many identifiers it
                # has, so the number matches the log message below
                ref_seq_transcripts += 1
            if tx_data['is_primary']:
                # Same here: one count per primary transcript, not one per
                # matching identifier
                nr_primary_transcripts += 1

            ####################  ####################  ####################
            # Build the transcript object
            tx_obj = build_transcript(tx_data, build)
            transcript_objs.append(tx_obj)

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s', ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs