Exemplo n.º 1
0
def _calculate_gene_start_end(df):
    df['start'] = 0
    df['end'] = 0
    for index, row in df.iterrows():
        if row.db == 'hg19':
            exons = exon_interval_trees()[row.chromosome][row.cut_position]
            for exon in exons:
                if (exon.strand == '+') == (row.Strand == 'sense'):
                    gene = read_gencode()[
                            (read_gencode().gene_id == exon.gene_id) &
                            (read_gencode().feature == 'gene')]
                    if len(gene) != 1:
                        print('found more than one gene for ID: {}'.format(
                            gene))
                    gene = gene.iloc[0]

                    df[index].start = gene.start
                    df[index].end = gene.end
                    break
            else:
                print('no gene found for row {}'.format(row))
        if row.db == 'mm9':
            strand = '+' if row.Strand == 'sense' else '-'
            start, end, gene_ids = mouse_gene_intervals()[row.chromosome + strand][row]
            if len(gene_ids) != 1:
                print('error: found more than one gene: {}, row: {}'.format(gene, row))
            df[index].start = start
            df[index].end = end
Exemplo n.º 2
0
def score(gene_id, guides):
    '''
    Score guides by building azimuth-style features and running the
    cnn38 model on them.

    :gene_id: id of the gene in the gencode table; the first matching row
              provides the sequence name used for the conservation lookup
    :guides: A dataframe with the guides. The columns cut_position, target,
             context, orientation, percent_peptide and aa_cut_position
             are read.
    :returns: an empty list if there are no guides, otherwise the model
              output flattened to one dimension
    '''
    if len(guides) == 0:
        return []

    gencode = read_gencode()
    gene_record = gencode[gencode.gene_id == gene_id].iloc[0]

    def locate(guide):
        # (assembly, sequence name, position) triple for one guide
        return ('hg19', gene_record.seqname, guide['cut_position'])

    conservation_scores = process_dataframe(guides.apply(locate, axis=1))

    # translate guide orientation into the strand labels of the feature
    # table (TODO from the original: "Strand might fail again..")
    strands = []
    for orientation in guides.orientation:
        if orientation == 'FWD':
            strands.append('sense')
        else:
            strands.append('antisense')

    # azimuth-style feature table; the 'dummy' columns and the constant
    # rank are placeholders that only serve as index levels / targets
    features = pd.DataFrame({
        'Sequence': guides.target,
        'Target': 'dummy',
        'Target gene': 'dummy',
        'score_drug_gene_rank': [0.5] * len(guides),
        'drug': 'dummy',
        '30mer': guides.context,
        'Strand': strands,
        'Percent Peptide': guides.percent_peptide,
        'Amino Acid Cut position': guides.aa_cut_position,
    })

    guide_key = ['Sequence', 'Target', 'drug']
    gene_key = ['Sequence', 'Target gene', 'drug']

    Xdf = features[guide_key + ['30mer', 'Strand']].set_index(guide_key)
    gene_position = features[
        gene_key + ['Percent Peptide', 'Amino Acid Cut position']
    ].set_index(gene_key)
    Y = features[gene_key + ['score_drug_gene_rank']].set_index(gene_key)

    # give the conservation scores the same index as the guide features
    conservation_scores.set_index(Xdf.index, inplace=True)

    combined_features, _, _, _ = extract_features(
        Xdf, Y, gene_position, conservation_scores, order=1)
    transformed_features = feature_scaler().transform(combined_features)

    model = cnn38_model()
    model_input = Variable(torch.from_numpy(transformed_features))
    prediction = model(model_input).cpu().data.numpy()
    return prediction.reshape(-1)
Exemplo n.º 3
0
def main():
    '''
    Run generate_exon_guides for every distinct gene id of the gencode
    table and print the accumulated overflow count and mismatch totals.

    With COMPUTATION_CORES > 1 the genes are distributed over a process
    pool (results arrive unordered); otherwise they are processed one after
    another in this process, which is easier to debug.
    '''
    overflow_count = 0
    mismatches = {}
    gene_ids = read_gencode().gene_id.drop_duplicates()

    def absorb(results):
        # fold (overflow count, mismatches) pairs into the running totals
        nonlocal overflow_count
        for partial_overflow_count, partial_mismatches in results:
            overflow_count += partial_overflow_count
            for key in partial_mismatches:
                if key in mismatches:
                    mismatches[key] += partial_mismatches[key]
                else:
                    mismatches[key] = partial_mismatches[key]

    if COMPUTATION_CORES > 1:
        with Pool(COMPUTATION_CORES) as pool:
            absorb(tqdm(
                pool.imap_unordered(generate_exon_guides, gene_ids),
                total=len(gene_ids)))
    else:
        # debuggable: everything runs sequentially in the current process
        absorb(generate_exon_guides(gene_id)
               for gene_id in tqdm(gene_ids, total=len(gene_ids)))

    # report the analysis data
    print('Overflow count: {}'.format(overflow_count))
    print(mismatches)
Exemplo n.º 4
0
def gene_names_similar(gene_a, gene_b):
    '''
    Tell whether the names of two genes are similar.

    Two names count as similar when one of them, with its last character
    removed, occurs as a substring of the other one.

    :gene_a: gene id of the first gene
    :gene_b: gene id of the second gene
    :returns: True if the names are similar, False otherwise
    '''
    gencode = read_gencode()

    def name_of(gene_id):
        # name stored in the first gencode row of that gene
        return gencode[gencode.gene_id == gene_id].iloc[0].gene_name

    first_name = name_of(gene_a)
    second_name = name_of(gene_b)
    return first_name[:-1] in second_name or second_name[:-1] in first_name
Exemplo n.º 5
0
def pdbs_for_gene(gene_id):
    '''
    Collect the PDB entries that belong to the proteins of a gene.

    The swissprot ids of the gene are normalized with normalize_pid and
    matched against the SP_PRIMARY column of the PDB list. Entries without
    any mapping are dropped with a warning.

    :gene_id: gene id as used in the gencode table
    :returns: a dataframe with the columns pdb, swissprot_id and chain.
              If at least one PDB matched, it also has the columns mappings
              (dict with stringified keys), start and end.
    '''
    gencode = read_gencode()
    pdbs = pdb_list()

    gene_rows = gencode.loc[gencode['gene_id'] == gene_id]
    protein_ids = gene_rows['swissprot_id'].drop_duplicates().dropna()

    normalized_pids = [normalize_pid(pid) for pid in protein_ids if pid]
    canonical_pids = np.unique(normalized_pids).astype('O')
    if len(canonical_pids) > 1:
        logging.warning('Gene {} has {} "canonical" protein ids: {}"'.format(
            gene_id, len(canonical_pids), canonical_pids))

    matching_pdbs = pdbs.loc[pdbs.SP_PRIMARY.isin(canonical_pids)]
    gene_pdbs = matching_pdbs[['PDB', 'SP_PRIMARY', 'CHAIN']].copy()
    gene_pdbs.columns = ['pdb', 'swissprot_id', 'chain']

    if len(gene_pdbs) == 0:
        # nothing matched: return the bare three-column frame
        return gene_pdbs

    def mappings_for(entry):
        # mappings from swissprot-coordinate to pdb-index
        return pdb_mappings(entry.pdb, entry.chain, entry.swissprot_id)

    def lowest(mappings):
        return min(mappings.values())

    def highest_plus_one(mappings):
        # largest mapped value plus one
        return max(mappings.values()) + 1

    def with_string_keys(mappings):
        return {str(key): value for key, value in mappings.items()}

    gene_pdbs['mappings'] = gene_pdbs.apply(mappings_for, axis=1)

    empty_mappings = gene_pdbs['mappings'].apply(len) == 0
    if empty_mappings.any():
        logging.warning('No PDB mapping for {}'.format(
            gene_pdbs[empty_mappings].pdb))
        gene_pdbs.drop(gene_pdbs.index[empty_mappings], inplace=True)

    raw_mappings = gene_pdbs['mappings']
    gene_pdbs['start'] = raw_mappings.apply(lowest)
    gene_pdbs['end'] = raw_mappings.apply(highest_plus_one)
    gene_pdbs['mappings'] = raw_mappings.apply(with_string_keys)

    return gene_pdbs
Exemplo n.º 6
0
def test_read_gencode(mocked_gtf, mocked_protein_mapping):
    '''
    Mocking read_gtf_as_dataframe to return a simplified dataframe,
    read_gencode should return only one copy of gene2.

    NOTE(review): the two mocked_* arguments are presumably injected by
    patch decorators that are not visible in this excerpt.
    '''
    df = data.read_gencode()

    # the returned frame must have exactly these columns
    expected_columns = {
        'feature', 'gene_id', 'transcript_id', 'start', 'end', 'exon_id',
        'exon_number', 'gene_name', 'transcript_type', 'strand',
        'gene_type', 'tag', 'protein_id', 'swissprot_id', 'score',
        'seqname', 'source'
    }
    eq_(set(df.columns), expected_columns)

    # gene 'GB' should have zero entries for ENSEMBL
    # but 3 (gene, transcript, exon) for HAVANA
    gb_rows = df[df.gene_id == 'GB']
    eq_(len(gb_rows[gb_rows.source == 'ENSEMBL']), 0)
    eq_(len(gb_rows[gb_rows.source == 'HAVANA']), 3)

    # gene 'GC' should not exist because it has no basic transcripts
    gc_rows = df[df.gene_id == 'GC']
    eq_(len(gc_rows), 0)

    # start should be subtracted (just testing one exon for simplicity),
    # end should remain the same
    first_eb1 = df[df.exon_id == 'EB1'].iloc[0]
    eq_(first_eb1.start, 4)
    eq_(first_eb1.end, 10)

    # swissprot ids should be merged in via the protein id
    pa_swissprot = df[df.protein_id == 'PA'].swissprot_id
    eq_(list(pa_swissprot.drop_duplicates()), ['SP1'])
    pb_swissprot = df[df.protein_id == 'PB'].swissprot_id
    eq_(list(pb_swissprot.drop_duplicates()), ['SP2'])

    # a protein_id without swissprot_id must not lose its row
    assert (df.protein_id == 'PA2').any()