Example #1
    def distribution_for_half(self):
        # return the cached distribution if it was already computed
        if self._distribution_half is not None:
            return self._distribution_half

        # group prices of active offers by internal condition
        bycond = {}
        for o in self.halfoffer_set.all():
            if o.active:
                cond = map_half_cond_to_internal(o.condition)
                bycond.setdefault(cond, []).append(float(o.price))

        from decimal import Decimal
        from stats import quantile

        dist = {}
        for k, prices in bycond.items():
            if len(prices) > 1:
                # five-number summary: min, 25th, median, 75th, max
                lo = quantile(prices, 0)
                p25 = quantile(prices, 0.25)
                med = quantile(prices, 0.5)
                p75 = quantile(prices, 0.75)
                hi = quantile(prices, 1)
                dist[k] = (len(prices),
                           Decimal("%3.2f" % lo), Decimal("%3.2f" % p25),
                           Decimal("%3.2f" % med), Decimal("%3.2f" % p75),
                           Decimal("%3.2f" % hi))
            else:
                # a single offer: only the median slot is meaningful
                dist[k] = (1, "", "", Decimal("%3.2f" % prices[0]), "", "")
        self._distribution_half = dist
        return dist
Example #2
def get_seq_feature(seq, seq_name, user_id):
    """Build a one-row DataFrame of 11 summary statistics for a sequence."""
    if not seq:
        print('seq is empty!')
        return None
    df = pd.DataFrame()
    df[seq_name + '_mean'] = [np.mean(seq)]
    df[seq_name + '_median'] = [np.median(seq)]
    df[seq_name + '_max'] = [np.max(seq)]
    df[seq_name + '_min'] = [np.min(seq)]
    df[seq_name + '_var'] = [np.var(seq)]
    df[seq_name + '_std'] = [np.std(seq)]
    if len(seq) == 1:
        # quantiles are not meaningful for a single observation
        df[seq_name + '_upquantile'] = [seq[0]]
        df[seq_name + '_downquantile'] = [0]
    else:
        df[seq_name + '_upquantile'] = [sts.quantile(seq, p=0.75)]
        df[seq_name + '_downquantile'] = [sts.quantile(seq, p=0.25)]
    # coefficient of variation (std / mean); undefined for zero mean
    if np.mean(seq) != 0:
        df[seq_name + '_discrete'] = [np.std(seq) / np.mean(seq)]
    else:
        df[seq_name + '_discrete'] = [np.nan]
    # skewness and kurtosis can fail on degenerate input
    try:
        df[seq_name + '_skew'] = [sts.skewness(seq)]
    except Exception:
        df[seq_name + '_skew'] = [np.nan]
    try:
        df[seq_name + '_kurt'] = [sts.kurtosis(seq)]
    except Exception:
        df[seq_name + '_kurt'] = [np.nan]
    df['user_id'] = [user_id]
    return df
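A minimal usage sketch for `get_seq_feature`, assuming the imports the snippet relies on (`pandas`, `numpy`, and the third-party `stats` module as `sts`); `user_logs` is a hypothetical mapping used only for illustration:

import numpy as np
import pandas as pd
import stats as sts  # third-party module providing quantile/skewness/kurtosis

# hypothetical {user_id: sequence} mapping, for illustration only
user_logs = {1001: [3.2, 4.1, 2.8, 5.0], 1002: [7.5]}

frames = [get_seq_feature(seq, 'amount', uid) for uid, seq in user_logs.items()]
features = pd.concat([f for f in frames if f is not None], ignore_index=True)
print(features)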
Example #3
def data_description(index, start, end):
    returns = download_data.get_returns(index, start, end)
    # measures of central tendency
    print('Count:', len(returns))
    print('Mean:', np.mean(returns))
    print('Median:', np.median(returns))
    print('Lower quartile:', sts.quantile(returns, p=0.25))
    print('Upper quartile:', sts.quantile(returns, p=0.75))
    # measures of dispersion
    print('Max:', np.max(returns))
    print('Min:', np.min(returns))
    print('Range:', np.max(returns) - np.min(returns))
    print('Interquartile range:',
          sts.quantile(returns, p=0.75) - sts.quantile(returns, p=0.25))
    print('Standard deviation:', np.std(returns))
    print('Variance:', np.var(returns))
    print('Coefficient of variation:', np.std(returns) / np.mean(returns))
    # measures of skewness and kurtosis
    print('Skewness:', sts.skewness(returns))
    print('Kurtosis:', sts.kurtosis(returns))
    # Kolmogorov-Smirnov test against a normal distribution
    print(st.kstest(returns, 'norm'))
    sns.distplot(returns, bins=100, label='Empirical')
    plt.legend()
    plt.title('Empirical')
    plt.show()
Example #4
def extend_feature(scores):
    """
    Feature construction.

    Args:
        scores: raw features from a sliding window
    Returns:
        The window features extended with summary statistics.
    """
    # copy so the caller's list is not mutated by the appends below
    features = list(scores)
    features.append(np.sum(scores))  # sum
    features.append(np.mean(scores))  # mean
    features.append(np.median(scores))  # median
    # features.append(sts.mode(scores))  # mode
    features.append(sts.quantile(scores, p=0.25))  # lower quartile
    features.append(sts.quantile(scores, p=0.75))  # upper quartile
    features.append(np.max(scores))  # maximum
    features.append(np.min(scores))  # minimum
    features.append(np.max(scores) - np.min(scores))  # range
    features.append(
        sts.quantile(scores, p=0.75) - sts.quantile(scores, p=0.25))  # interquartile range
    features.append(np.var(scores))  # variance
    features.append(np.std(scores) / np.mean(scores))  # coefficient of variation
    features.append(sts.skewness(scores))  # skewness
    features.append(sts.kurtosis(scores))  # kurtosis
    return features
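A quick usage sketch, assuming `numpy` and the same third-party `stats` module as the surrounding examples; the window values are made up for illustration:

import numpy as np
import stats as sts  # third-party module used across these examples

window = [0.2, 0.5, 0.1, 0.9, 0.4]  # one sliding-window slice (made-up values)
feats = extend_feature(window)
print(len(feats), 'features:', feats)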
Example #5
 def test_quantile(self):
     self.assertEqual(stats.quantile([1, 2, 3, 4, 5], 0.2), 2)
Example #6
 def test_quantile_wrong_type(self):
     with self.assertRaises(TypeError) as raised_exception:
         stats.quantile([2, 3, 4], 3)
     self.assertEqual(raised_exception.exception.args[0], "p must be float")
Example #7
 def test_quantile_wrong_range_p(self):
     with self.assertRaises(ValueError) as raised_exception:
         stats.quantile([2, 3, 4], 3.0)
     self.assertEqual(raised_exception.exception.args[0],
                      "p must be in range (0,1)")
Example #8
#!/usr/bin/env python
# encoding=utf-8

"""
    Outlier detection
    Algorithm: Tukey's test (quartile fences)
"""

import stats as sts

data = [1, 4, 8, 90, 98, 44, 35, 56, 2, 41, 11, 24, 23, 45, 500, 150]
print(data)

# quartiles and interquartile range
q1 = sts.quantile(data, p=0.25)
q3 = sts.quantile(data, p=0.75)
print('Lower quartile:', q1)
print('Upper quartile:', q3)
iqr = q3 - q1

# k=1.5: mild outlier fences
k1 = 1.5
g_min_m = q1 - k1 * iqr
g_max_m = q3 + k1 * iqr

# k=3: extreme outlier fences
k2 = 3
g_min_b = q1 - k2 * iqr
g_max_b = q3 + k2 * iqr

# g_min_b <= g_min_m <= g_max_m <= g_max_b
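The script stops at computing the fences. A short follow-up sketch, assuming the `data` list and the four fence variables above, shows how the fences would actually be applied to flag outliers:

# flag values outside the mild (k=1.5) and extreme (k=3) fences
mild = [x for x in data if x < g_min_m or x > g_max_m]
extreme = [x for x in data if x < g_min_b or x > g_max_b]
print('Mild outliers (k=1.5):', mild)
print('Extreme outliers (k=3):', extreme)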
Example #9
import numpy as np
import stats as sts

data = [1, 2, 2, 3]
# measures of central tendency
print('Sum:', np.sum(data))
print('Count:', len(data))
print('Mean:', np.mean(data))
print('Median:', np.median(data))
print('Mode:', sts.mode(data))
print('Lower quartile:', sts.quantile(data, p=0.25))
print('Upper quartile:', sts.quantile(data, p=0.75))
# measures of dispersion
print('Max:', np.max(data))
print('Min:', np.min(data))
print('Range:', np.max(data) - np.min(data))
print('Interquartile range:', sts.quantile(data, p=0.75) - sts.quantile(data, p=0.25))
print('Standard deviation:', np.std(data))
print('Variance:', np.var(data))
print('Coefficient of variation:', np.std(data) / np.mean(data))
# measures of skewness and kurtosis
print('Skewness:', sts.skewness(data))
print('Kurtosis:', sts.kurtosis(data))

# generate two random samples
x = np.random.randint(0, 9, 1000)
y = np.random.randint(0, 9, 1000)

# compute the means
mx = x.mean()
my = y.mean()
Example #10
def main():
    usage = 'usage: %prog [options] <roc_dir>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='targets_file')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide ROC points file')
    else:
        roc_dir = args[0]

    # read target labels
    if options.targets_file:
        target_labels = [
            line.split()[0] for line in open(options.targets_file)
        ]
    else:
        target_labels = [
            'Target %d' % (ti + 1)
            for ti in range(len(glob.glob('%s/roc*.txt' % roc_dir)))
        ]

    #######################################################
    # make all ROC plots
    #######################################################
    target_fpr = []
    target_tpr = []

    for roc_file in glob.glob('%s/roc*.txt' % roc_dir):
        ti = int(roc_file[roc_file.find('roc') + 3:-4]) - 1

        target_fpr.append([])
        target_tpr.append([])
        for line in open(roc_file):
            a = line.split()
            target_fpr[-1].append(float(a[0]))
            target_tpr[-1].append(float(a[1]))

        plt.figure(figsize=(6, 6))

        plt.scatter(target_fpr[-1],
                    target_tpr[-1],
                    s=8,
                    linewidths=0,
                    c=sns_colors[0])

        plt.title(target_labels[ti])
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.grid(True)
        plt.tight_layout()

        out_pdf = '%s.pdf' % os.path.splitext(roc_file)[0]
        plt.savefig(out_pdf)
        plt.close()

    #######################################################
    # multiple ROC curve plot
    #######################################################
    # read AUCs
    target_aucs = [
        float(line.split()[1]) for line in open('%s/aucs.txt' % roc_dir)
    ]

    # choose cells
    auc_targets = [(target_aucs[ti], ti) for ti in range(len(target_aucs))]
    auc_targets.sort()

    fig_quants = [0.05, 0.33, 0.5, 0.67, 0.95]
    auc_target_quants = quantile(auc_targets, fig_quants)

    # plot
    sns.set(style='white', font_scale=1.2)
    plt.figure(figsize=(6, 6))

    si = 0
    for auc, ti in auc_target_quants:
        target_label = '%-9s AUC: %.3f' % (target_labels[ti], target_aucs[ti])
        plt.plot(target_fpr[ti],
                 target_tpr[ti],
                 c=sns_colors[si],
                 label=target_label,
                 linewidth=2.5,
                 alpha=0.8)
        si += 1

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

    plt.xlim((0, 1))
    plt.ylim((0, 1))

    ax = plt.gca()
    ax.xaxis.label.set_fontsize(17)
    ax.yaxis.label.set_fontsize(17)

    # map() is lazy in Python 3, so the lambdas would never run; use loops
    for xl in ax.get_xticklabels():
        xl.set_fontsize(13)
    for yl in ax.get_yticklabels():
        yl.set_fontsize(13)

    ax.grid(True, linestyle=':')
    plt.tight_layout()

    matplotlib.rcParams.update({'font.family': 'monospace'})
    plt.legend(loc=4, fontsize=12)

    plt.savefig('%s/range.pdf' % roc_dir)
    plt.close()
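Note that `quantile` is called here with a list of probabilities (`fig_quants`) and returns one `(auc, ti)` tuple per probability; Examples #12 and #15 use the same list form with `[.25, .5, .75]`. A plausible sketch of that behavior, extending the index-based version pinned down by the tests above (the real module's edge handling may differ):

def quantile(data, p):
    """Index-based quantile; p may be a single probability or a list of them."""
    xs = sorted(data)  # tuples sort lexicographically, so (auc, ti) pairs work
    if isinstance(p, (list, tuple)):
        return [xs[int(pi * len(xs))] for pi in p]
    return xs[int(p * len(xs))]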
Example #11
data[:, :, 65] = walking_12[0:480, 1:7]
data[:, :, 66] = walking_13[0:480, 1:7]
data[:, :, 67] = walking_14[0:480, 1:7]
data[:, :, 68] = walking_15[0:480, 1:7]

# summary statistics per (trial, axis): min, max, mean, median, std, quartiles
dataset = np.zeros((69, 7, 6))
for a in range(6):
    for i in range(69):
        dataset[i, 0, a] = data[:, a, i].min()
        dataset[i, 1, a] = data[:, a, i].max()
        dataset[i, 2, a] = data[:, a, i].mean()
        dataset[i, 3, a] = np.median(data[:, a, i])
        dataset[i, 4, a] = data[:, a, i].std()
        y = data[:, a, i].tolist()
        dataset[i, 5, a] = sts.quantile(y, p=0.25)
        dataset[i, 6, a] = sts.quantile(y, p=0.75)

scatter_matrix = np.zeros((69, 3, 3))
scatter_matrix[:, :, 0] = dataset[:, 0:3, 0]
scatter_matrix[:, :, 1] = dataset[:, 0:3, 1]
scatter_matrix[:, :, 2] = dataset[:, 0:3, 5]

fig = plt.figure()
plt.title('Scatter Plots')

ax1 = fig.add_subplot(9, 9, 1)
ax1.scatter(scatter_matrix[0:9, 0, 0].tolist(),
            scatter_matrix[0:9, 0, 0].tolist(),
            c='b',
            marker='.')
Example #12
def main():
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')


    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length

        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurrence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        gene_list = list(set(gene_te_num.keys()) & set(gene_diffs[spair].keys()))
                
        for fam in count_tes:
            if options.orientation:
                orients = ['+','-']
            else:
                orients = ['+']

            for orient in orients:
                # hash diff values by TE count
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam,orient), 0)
                    else:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam), 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                df = {'TEs':[], 'stat_low':[], 'stat_mid':[], 'stat_hi':[]}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                if len(df['TEs']) > 1:
                    fam_plot = fam[fam.find('/')+1:]

                    if options.orientation:                        
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot)

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df, [out_pdf, options.scale], df_file=out_df)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
Example #13
import numpy as np
import stats as sts

a = [31, 24, 23, 25, 14, 25, 13, 12, 14, 23,
     32, 34, 43, 41, 21, 23, 26, 26, 34, 42,
     43, 25, 24, 23, 24, 44, 23, 14, 52, 32,
     42, 44, 35, 28, 17, 21, 32, 42, 12, 34]
scores = np.array(a)
print('Sum:', np.sum(scores))
print('Count:', len(scores))
print('Mean:', np.mean(scores))
print('Median:', np.median(scores))
print('Mode:', sts.mode(scores))
print('Lower quartile:', sts.quantile(scores, p=0.25))
print('Upper quartile:', sts.quantile(scores, p=0.75))
print('Max:', np.max(scores))
print('Min:', np.min(scores))
print('Range:', np.ptp(scores))
print('Standard deviation:', np.std(scores))
print('Variance:', np.var(scores))
print('Coefficient of variation:', np.std(scores) / np.mean(scores))
print('Skewness:', sts.skewness(scores))
print('Kurtosis:', sts.kurtosis(scores))
Example #14
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    # integer division: these are used as slice indexes below
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))


    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:, ti]))

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print('%3d  %6.2f' % (i, model.coef_[i]), file=coef_out)
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print('%3d  %3d  %6.3f  %6.3f  %6.3f' % cols, file=table_out)
                si += 1

        table_out.close()

        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print('Limiting scores to +-%f' % max_score)
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
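The clamping double loop above is equivalent to a single `np.clip` call, which would be the idiomatic NumPy form (a sketch, assuming `filter_interaction` and `max_score` as defined above):

# clamp all interaction scores to [-max_score, +max_score] at once
filter_interaction_max = np.clip(filter_interaction, -max_score, max_score)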
Example #15
def main():
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-o',
        dest='out_dir',
        default='te_diff_regress',
        help=
        'Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='Plot scale [Default: %default]')

    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-r',
                      dest='orientation',
                      default=False,
                      action='store_true',
                      help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')

    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Default: %default]'
    )

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length

        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        gff.length_filter(ref_gtf,
                          spread_gtf,
                          options.spread_lower,
                          options.spread_upper,
                          verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurrence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf,
                                            options.te_gff,
                                            gene_key='transcript_id',
                                            add_star=True,
                                            stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file,
                                    stat='fold',
                                    max_stat=options.max_stat,
                                    sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        gene_list = list(
            set(gene_te_num.keys()) & set(gene_diffs[spair].keys()))

        for fam in count_tes:
            if options.orientation:
                orients = ['+', '-']
            else:
                orients = ['+']

            for orient in orients:
                # hash diff values by TE count
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id, {}).get(
                            ('*', fam, orient), 0)
                    else:
                        count = gene_te_num.get(gene_id, {}).get(('*', fam), 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                df = {'TEs': [], 'stat_low': [], 'stat_mid': [], 'stat_hi': []}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(
                            count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                if len(df['TEs']) > 1:
                    fam_plot = fam[fam.find('/') + 1:]

                    if options.orientation:
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir,
                                                          sample1, sample2,
                                                          fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir,
                                                        sample1, sample2,
                                                        fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (
                            options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1,
                                                     sample2, fam_plot)

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'],
                                df, [out_pdf, options.scale],
                                df_file=out_df)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
Example #16
def main():
    usage = "usage: %prog [options] <roc_dir>"
    parser = OptionParser(usage)
    parser.add_option("-t", dest="targets_file")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide ROC points file")
    else:
        roc_dir = args[0]

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        target_labels = ["Target %d" % (ti + 1) for ti in range(len(glob.glob("%s/roc*.txt" % roc_dir)))]

    #######################################################
    # make all ROC plots
    #######################################################
    target_fpr = []
    target_tpr = []

    for roc_file in glob.glob("%s/roc*.txt" % roc_dir):
        ti = int(roc_file[roc_file.find("roc") + 3 : -4]) - 1

        target_fpr.append([])
        target_tpr.append([])
        for line in open(roc_file):
            a = line.split()
            target_fpr[-1].append(float(a[0]))
            target_tpr[-1].append(float(a[1]))

        plt.figure(figsize=(6, 6))

        plt.scatter(target_fpr[-1], target_tpr[-1], s=8, linewidths=0, c=sns_colors[0])

        plt.title(target_labels[ti])
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.grid(True)
        plt.tight_layout()

        out_pdf = "%s.pdf" % os.path.splitext(roc_file)[0]
        plt.savefig(out_pdf)
        plt.close()

    #######################################################
    # multiple ROC curve plot
    #######################################################
    # read AUCs
    target_aucs = [float(line.split()[1]) for line in open("%s/aucs.txt" % roc_dir)]

    # choose cells
    auc_targets = [(target_aucs[ti], ti) for ti in range(len(target_aucs))]
    auc_targets.sort()

    fig_quants = [0.05, 0.33, 0.5, 0.67, 0.95]
    auc_target_quants = quantile(auc_targets, fig_quants)

    # plot
    sns.set(style="white", font_scale=1.2)
    plt.figure(figsize=(6, 6))

    si = 0
    for auc, ti in auc_target_quants:
        target_label = "%-9s AUC: %.3f" % (target_labels[ti], target_aucs[ti])
        plt.plot(target_fpr[ti], target_tpr[ti], c=sns_colors[si], label=target_label, linewidth=2.5, alpha=0.8)
        si += 1

    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")

    plt.xlim((0, 1))
    plt.ylim((0, 1))

    ax = plt.gca()
    ax.xaxis.label.set_fontsize(17)
    ax.yaxis.label.set_fontsize(17)

    # map() is lazy in Python 3, so the lambdas would never run; use loops
    for xl in ax.get_xticklabels():
        xl.set_fontsize(13)
    for yl in ax.get_yticklabels():
        yl.set_fontsize(13)

    ax.grid(True, linestyle=":")
    plt.tight_layout()

    matplotlib.rcParams.update({"font.family": "monospace"})
    plt.legend(loc=4, fontsize=12)

    plt.savefig("%s/range.pdf" % roc_dir)
    plt.close()
Example #17
import numpy as np
import stats as sts

scores = [
    31, 24, 23, 25, 14, 25, 13, 12, 14, 23, 32, 34, 43, 41, 21, 23, 26, 26, 34,
    42, 43, 25, 24, 23, 24, 44, 23, 14, 52, 32, 42, 44, 35, 28, 17, 21, 32, 42,
    12, 34
]

print('Sum:', np.sum(scores))
print('Count:', len(scores))
print('Mean:', np.mean(scores))
print('Median:', np.median(scores))
print('Mode:', sts.mode(scores))
print('Lower quartile:', sts.quantile(scores, p=0.25))
print('Upper quartile:', sts.quantile(scores, p=0.75))

print('Max:', np.max(scores))
print('Min:', np.min(scores))
print('Range:', np.max(scores) - np.min(scores))
print('Quartiles:', sts.quantile(scores, p=0.25), sts.quantile(scores, p=0.75))
print('Standard deviation:', np.std(scores))
print('Variance:', np.var(scores))
print('Coefficient of variation:', np.std(scores) / np.mean(scores))

print('Skewness:', sts.skewness(scores))
print('Kurtosis:', sts.kurtosis(scores))
Example #18
 def quantile2(self, data):
     print('Upper quartile:', sts.quantile(data, p=0.75))
Example #19
print("\n\n")
print("*** Test Module <stats> ***")

A = [1, 3, 5, 7, 9, 2, 3, 4, 4, 4, 6, 8, 10, 13, 15, 17]

print("vector A = ", A)
print("sorted A = ", sorted(A))

mean = st.mean(A)
print("A's mean = ", mean)

median = st.median(A)
print("A's median = ", median)

quantile = st.quantile(A, 0.2)
print("A's 20% quantile = ", quantile)

quantile = st.quantile(A, 0.9)
print("A's 90% quantile = ", quantile)

mode = st.mode(A)
print("A's mode = ", mode)

data_range = st.data_range(A)
print("A's range = ", data_range)

variance = st.variance(A)
print("A's variance = ", variance)

standard_deviation = st.standard_deviation(A)
print("A's standard deviation = ", standard_deviation)
Example #20
 def interquartile_range(self, data):
     print('Interquartile range:', sts.quantile(data, p=0.75) - sts.quantile(data, p=0.25))
Example #21
print('Variance:', df['身高'].var())
print('Standard deviation:', df['身高'].std())
print('Range:', df['身高'].max() - df['身高'].min())

print('Skewness:', df['身高'].skew())
print('Kurtosis:', df['身高'].kurt())

import numpy as np
import stats as sts

scores = [1, 2, 2, 2, 5]
# measures of central tendency
print('Sum:', np.sum(scores))
print('Count:', len(scores))
print('Mean:', np.mean(scores))
print('Median:', np.median(scores))
print('Mode:', sts.mode(scores))
print('Lower quartile:', sts.quantile(scores, p=0.25))
print('Upper quartile:', sts.quantile(scores, p=0.75))
# measures of dispersion
print('Max:', np.max(scores))
print('Min:', np.min(scores))
print('Range:', np.max(scores) - np.min(scores))
print('Interquartile range:', sts.quantile(scores, p=0.75) - sts.quantile(scores, p=0.25))
print('Standard deviation:', np.std(scores))
print('Variance:', np.var(scores))
print('Coefficient of variation:', np.std(scores) / np.mean(scores))
# measures of skewness and kurtosis
print('Skewness:', sts.skewness(scores))
print('Kurtosis:', sts.kurtosis(scores))
Example #22
 def quantile1(self, data):
     print('Lower quartile:', sts.quantile(data, p=0.25))