예제 #1
0
def plot_long_RNA_peak(peaks, ax, ce, top_n=10, y_val='log10p'):
    """Draw a bar plot of the strongest long-RNA peaks, one bar per gene name.

    Rows of ``peaks`` whose ``sense_gtype`` contains 'Long RNA' and whose
    ``sample_count`` is at least the module-level ``sample_cutoff`` are
    reduced per ``sense_gname`` with ``pick_lp``. Gene names are then mapped
    through ``NameConversion().convert`` (with any '-NPIPA8' suffix text
    removed), the single largest row by ``y_val`` is kept per converted name,
    and the ``top_n`` largest of those are plotted in descending order.

    Parameters
    ----------
    peaks : pandas.DataFrame
        Peak table; must provide ``sense_gtype``, ``sample_count``,
        ``sense_gname`` and the ``y_val`` column.
    ax : matplotlib axes
        Axes to draw on.
    ce :
        Unused; kept so existing callers keep working.
    top_n : int
        Number of bars to show.
    y_val : str
        Column plotted on the y axis, either 'log10p' or 'pileup'.

    Raises
    ------
    AssertionError
        If ``y_val`` is not one of the two supported columns.
    """
    # Fail fast on an unsupported metric, before any dataframe work is done.
    assert y_val in ('log10p', 'pileup')

    long_rna = peaks[peaks.sense_gtype.str.contains('Long RNA')] \
        .query('sample_count >= %i' % sample_cutoff) \
        .groupby('sense_gname', as_index=False) \
        .apply(pick_lp)

    name_conversion = NameConversion()
    # cat_long_rna_type is expected to provide the `rt` column used below
    # -- TODO confirm against its definition.
    lp = long_rna \
        .assign(picked_RNA_sense=lambda d: d.sense_gname
                .map(name_conversion.convert)
                .str.replace('-NPIPA8', '')) \
        .groupby('picked_RNA_sense') \
        .apply(lambda d: d.nlargest(1, y_val)) \
        .nlargest(top_n, y_val) \
        .pipe(cat_long_rna_type) \
        .sort_values(y_val, ascending=False)

    # One color per bar, looked up from the shared peak-type encoder.
    colors = lp.rt.map(peak_type_ce.encoder).values
    sns.barplot(data=lp, x='picked_RNA_sense', y=y_val, palette=colors, ax=ax)
    ax.legend().set_visible(False)
    ax.set_xlabel('')
    if y_val == 'log10p':
        ax.set_ylabel('-$log_{10}$ p-value', fontsize=20)
    else:
        ax.set_ylabel('Coverage', fontsize=20)
    ax.set_xticklabels(ax.get_xticklabels(),
                       rotation=70,
                       rotation_mode='anchor',
                       ha='right')

    # Legend only lists the peak types that actually appear in the plot.
    used = lp.rt.unique()
    cc_ce = color_encoder()
    cc_ce.encoder = {
        k: v
        for k, v in peak_type_ce.encoder.items() if k in used
    }
    cc_ce.show_legend(ax=ax, frameon=False, fontsize=20)

    # Tint each tick label with the color of its bar.
    for col, xt in zip(colors, ax.get_xticklabels()):
        xt.set_color(col)
예제 #2
0
def plot_pymc_bar(ax):
    """Bar-plot the 15 largest ``delta`` values from ``get_pymc_df()``.

    Only rows with ``bayes_factor > 3`` are considered. Bars are colored by
    the ``is_telo`` column through a fresh ``color_encoder`` and ``delta`` is
    scaled by 100 before plotting. Tick labels are reduced to the text
    after the first ':' of each ``gene_name``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.

    Raises
    ------
    IndexError
        If a tick label contains no ':' separator.
    """
    ce = color_encoder()
    bar_df = get_pymc_df()\
        .query('bayes_factor > 3')\
        .nlargest(15, 'delta')\
        .assign(color=lambda d: ce.fit_transform(d['is_telo']))\
        .assign(delta=lambda d: d.delta * 100)
    bar_df.plot\
        .bar('gene_name', 'delta', color=bar_df.color.tolist(), ax=ax)
    ce.show_legend(ax, frameon=False, fontsize=20, bbox_to_anchor=(0.5, 0.7))

    # Keep only the second ':'-separated field of each label.
    xts = [xt.get_text().split(':')[1] for xt in ax.get_xticklabels()]
    ax.set_xticklabels(xts, rotation=70, rotation_mode='anchor', ha='right')
    ax.set_xlabel('')
    # Raw string: '\D' is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'$\Delta$ % Plus strand')
예제 #3
0
        return prep_order[3]
    else:
        return x


def rename_sample(xs):
    """Return one numbered display name per sample in ``xs``.

    Each sample is labelled with ``label_sample``; the n-th sample that
    receives a given label is named '<label> <n>', counting from 1 in
    input order.
    """
    tally = defaultdict(int)

    def numbered(sample_name):
        # Bump the running count for this label and format the name.
        prep = label_sample(sample_name)
        tally[prep] += 1
        return '%s %i' % (prep, tally[prep])

    return [numbered(sample_name) for sample_name in xs]


# Fixed label -> color mapping, written as explicit pairs so each label
# sits next to its color.
label_ce = color_encoder()
label_ce.encoder = {}
for label, color in (
        ('DNase I', '#d12604'),
        ('DNase I + Exo I', '#ff96cb'),
        ('DNase I + NaOH', '#964b06'),
        ('DNase I + Exo I + NaOH', '#f2a157'),
        ('NaOH', '#4286f4'),
        ('Untreated', 'black'),
        ('Ladder', 'grey'),
        ('Fragmented', '#592782'),
        ("DNase I - 3'P", '#870c47'),
        ('HEK293', 'black'),
):
    label_ce.encoder[label] = color

RNA_type = [
    'Antisense', 'Mt', 'Other ncRNA', 'Other sncRNA', 'Protein coding',
    'Repeats', 'miRNA', 'rRNA', 'snoRNA', 'tRNA', 'Vault RNA', 'Unannotated',
    '5/5.8S rRNA', '18/28S rRNA', 'Mt-tRNA'
예제 #4
0
import numpy as np
import re
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold, LeaveOneOut, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sequencing_tools.viz_tools import color_encoder, okabeito_palette, simpsons_palette
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from operator import itemgetter
import os
import pysam
from collections import Counter
# Module-level color encoder fitted with the labels "3' end" / "5' end" and
# the colors 'darkgoldenrod' / 'purple'.
# NOTE(review): presumably fit(labels, colors) pairs them positionally --
# confirm against sequencing_tools.viz_tools.color_encoder.
end_ce = color_encoder()
end_ce.fit(["3' end", "5' end"],['darkgoldenrod','purple'])

def positioning(x):
    """Return the last element of the indexable ``x`` (i.e. ``x[-1]``)."""
    last_item = x[-1]
    return last_item

def count_to_cpm(count_array):
    """Scale ``count_array`` so that its entries sum to one million.

    Each entry is divided by the array total (true division) and the
    result is multiplied by 1e6.
    """
    total = count_array.sum()
    fraction = np.true_divide(count_array, total)
    return fraction * 1e6

def get_end(x):
    """Map a name containing 'head' or 'tail' to an end label.

    Returns "5' N+" when 'head' is in ``x`` and "3' N-" when 'tail' is in
    ``x``; 'head' is tested first, so it wins when both are present.
    NOTE(review): no fallback branch is visible here, so any other input
    would return None implicitly -- confirm that is intended.
    """
    if 'head' in x:
        return "5' N+"
    elif 'tail' in x:
        return "3' N-"
예제 #5
0
from collections import defaultdict
from sequencing_tools.viz_tools import okabeito_palette, \
                        simpsons_palette, \
                        color_encoder
from sequencing_tools.io_tools import ReadPicardRNA
from collections import defaultdict
import re
import glob
import os
from plotting_utils import label_sample, rename_sample, \
                        label_ce, rna_type_ce, \
                        figure_path, work_path
from functools import lru_cache
plt.rc('font', **{'family': 'sans-serif', 'sans-serif': 'Arial'})

small_RNA_ce = color_encoder()
label_order = [
    'Untreated',
    'NaOH',
    'WGS-sim',
    'DNase I',
    'DNase I + Exo I',
    "DNase I - 3'P",
]

# Collect every *.RNA_Metrics file under the filtered-BAM directory,
# dropping any path that contains 'sense'.
metric_path = work_path + '/cfNA/tgirt_map/merged_bam/filtered_bam'
metrics = [
    metric_file
    for metric_file in glob.glob(metric_path + '/*.RNA_Metrics')
    if 'sense' not in metric_file
]


def read_metric(metric):
    """Read one metrics file and return its PCT transcript-strand entries.

    The first six lines of ``metric`` are skipped and a single data row is
    read as a tab-separated table. The row is melted to long form
    (``variable`` / ``value``) and only variables whose name contains both
    'TRANSCRIPT_STRAND_' and 'PCT' are kept.
    """
    table = pd.read_table(metric, skiprows=6, nrows=1)
    long_form = pd.melt(table)
    strand_rows = long_form[
        long_form.variable.str.contains('TRANSCRIPT_STRAND_')
    ]
    return strand_rows[strand_rows.variable.str.contains('PCT')]
예제 #6
0
import matplotlib.pyplot as plt
sys.path.insert(0, '/stor/home/cdw2854/cfNA/peak_callings')
from structural_peaks import PeakAnalyzer, mRNAFilter, GenicIntersect, NameConversion, GeneMapper, TrnaLookAlike
from exon_coverage import ExonFilter
import dask.dataframe as dd
# Global matplotlib defaults: 20pt axis and tick labels, Arial sans-serif.
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('font', **{'family': 'sans-serif', 'sans-serif': 'Arial'})

# Module-level thresholds (sample_cutoff is interpolated into a
# 'sample_count >= N' query elsewhere in this module).
pileup_cutoff = 5
sample_cutoff = 5
# NOTE(review): project_path is assigned twice; the second assignment
# overrides the first, so only the 'yaojun' location is actually used.
project_path = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map'
project_path = '/stor/work/Lambowitz/yaojun/Work/cfNA/tgirt_map'
# Directory holding the annotated MACS2 peak files, derived from project_path.
peak_path = project_path + '/bed_files/merged_bed/MACS2/annotated'
peak_type_ce = color_encoder()
peak_type_ce.encoder = {
    'mRNA': 'purple',
    'Pseudogene': 'darkblue',
    'Exon': 'purple',
    'Intron': '#fca237',
    'Exon-intron': '#7bb73e',
    'Within intron': '#f9b109',
    'Stem-loop': '#f9b109',
    'miRNA': 'darkgreen',
    'rRNA': '#15879b',
    'Mismapped': '#bcbb76',
    'Others': 'black',
    'Intergenic': 'black',
    'tRNA-lookalike': 'red',
    'Full-length intron': '#725001',
예제 #7
0
import numpy as np
from sequencing_tools.viz_tools import color_encoder, okabeito_palette
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize, OneHotEncoder, LabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA, FactorAnalysis
import matplotlib.patches as mpatches
from sklearn.model_selection import LeaveOneOut, KFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from helper_function import *
import glob
import os

# Module-level color_encoder instance, created empty here.
# NOTE(review): presumably populated by the PCA plotting code -- confirm.
pca_ce = color_encoder()


def pca_color(labs, colors):
    """Pair each label in ``labs`` with the color at the same position.

    Returns a dict mapping label -> color; pairing stops at the shorter of
    the two inputs, and a repeated label keeps its last color.
    """
    return dict(zip(labs, colors))


def pca_biplot(train_df, ax):
    pca = PCA(n_components=3)
    train_df = train_df \
        .query('label !="Zero" & label != ""')
    tdf = extract_train_cols(train_df)
    d = pca.fit_transform(StandardScaler().fit_transform(tdf))
    pca_df = pd.DataFrame(d)
    pca_df.columns = ['PC%i' % (int(col) + 1) for col in pca_df.columns]
    pca_df['label'] = train_df.label.values