示例#1
0
                          do_not_download=False):
    """
    :param target_dir: example "s3://epionengs/80011001_HCC_Metastase/Methylation/"
    :param outdir: 下载结果的输出路径
    :param match: 匹配文件的表达式
    :param restore: 如果提供该参数,则下载前需要对文件进行还原
    :param threads: 下载的并发数
    :param do_not_download: 如果提供该参数,则不下载
    :return:
    """
    out = os.path.join(outdir, 'target_file.list')
    bucket = target_dir.split('/')[2]
    target_files = get_target_file_path(target_dir, out, match)
    if restore:
        if target_files:
            restore_files(target_files, bucket=bucket)
        else:
            print('Nothing matched!')
            return

    if not do_not_download:
        if restore:
            import time
            time.sleep(3600 * 12)
        download(target_files, outdir, bucket, threads)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), exclude=['pool', 'run_cmd'])
示例#2
0
def run_cmd(cmd):
    """Execute ``cmd`` through the system shell and return its exit status."""
    completed = subprocess.run(cmd, shell=True)
    return completed.returncode


def downBam2Fq(data):
    """
    samtools sort + view + fastq -> down sample bam to fastq

    For every sample and every ratio column one shell pipeline is built
    (``samtools sort -n | samtools view -s <ratio> | samtools fastq``) and the
    pipelines are executed with up to 6 worker threads. Output files are
    written to ``<column>/<sample>.<column>.R1.fq.gz`` / ``.R2.fq.gz``.

    :param data: table whose first column is the sample name, whose second
        column (header must be "path") is the BAM path, and whose remaining
        columns hold the sub-sampling ratios passed to ``samtools view -s``
    :return: None
    """
    table = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python')
    # The sample column became the index, so every column except 'path' is a
    # ratio column. (Slicing with [2:] would silently drop the first one.)
    ratio_cols = [col for col in table.columns if col != 'path']
    for name in ratio_cols:
        # One output directory per ratio column; created once, not per sample.
        os.makedirs(name, exist_ok=True)

    cmd_list = list()
    for sample in table.index:
        bam = table.loc[sample, 'path']
        for name in ratio_cols:
            ratio = table.loc[sample, name]
            fq = f'{name}/{sample}.{name}.R1.fq.gz'
            fq2 = f'{name}/{sample}.{name}.R2.fq.gz'
            cmd = f'samtools sort -n {bam}| samtools view -s {ratio} -O BAM - | samtools fastq -1 {fq} -2 {fq2} - '
            cmd_list.append(cmd)

    with ThreadPoolExecutor(6) as pool:
        exit_codes = list(pool.map(run_cmd, cmd_list))
    # Report failed pipelines instead of discarding their exit status.
    for cmd, code in zip(cmd_list, exit_codes):
        if code != 0:
            print(f'WARN: command exited with status {code}: {cmd}')


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `downBam2Fq` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['downBam2Fq'])
    all_cls = {x.strip().split('\t')[1] for x in open(parent_info)}
    cls_lst = []
    reason_lst = []
    for cls_set, reason in result:
        if len(cls_set) > 1:
            cls_depth = map(max_distance_to_root, [tree for x in range(len(cls_set))], list(cls_set))
            selected = sorted(zip(list(cls_set), cls_depth), key=lambda x:x[1])[-1][0]
            print(selected, 'vs ', cls_set)
            cls_lst.append(selected)
        else:
            if cls_set:
                for each in cls_set:
                    if each not in all_cls:
                        raise Exception(f'分类名{each}不在已知分类里!')
                    cls_lst.append(each)
            else:
                cls_lst.append('')
        reason_lst.append(reason)

    # target_df['OKR_Name'] = cls_lst
    # target_df['myReason'] = reason_lst
    # target_df['other_reportable'] = other_reportable
    raw_table['OKR_Name'] = cls_lst
    raw_table['myReason'] = reason_lst
    raw_table['other_reportable'] = other_reportable
    raw_table.to_excel(out, merge_cells=False, index=False)

if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals())
示例#4
0
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
sns.set(font_scale=0.5)


def heatmap(data, vmin:float=None, vmax:float=None, cmap:str='RdYlBu_r', center:float=None, robust=False, label_size=8,
            pvalue:str=None, annot=True, fmt='', linewidths:float=0, linecolor='white', cbar=True,
            annot_fontsize=6, square=False, xticklabels='auto', yticklabels='auto', out='heatmap.png', dpi=300):
    """
    Draw a seaborn heatmap from a delimited table and save it to ``out``.

    :param data: path of the table to plot; first row is the header, first
        column is the row index, the separator is sniffed automatically
    :param pvalue: optional path of a table read the same way as ``data``;
        its values (rounded to 3 decimals) are appended to each cell
        annotation as ``(p=...)``. Only used when ``annot`` is truthy
    :param annot: if truthy, annotate every cell with its value rounded to
        3 decimals
    :param annot_fontsize: font size of the cell annotations
    :param label_size: font size applied to the x tick labels
    :param out: output image path
    :param dpi: resolution used when saving the figure
    :return: None

    ``vmin``, ``vmax``, ``cmap``, ``center``, ``robust``, ``fmt``,
    ``linewidths``, ``linecolor``, ``cbar``, ``square``, ``xticklabels`` and
    ``yticklabels`` are forwarded unchanged to ``seaborn.heatmap``.
    """
    data = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python')
    if annot:
        # Column-wise Series.map(str) is the element-wise conversion that
        # works on every pandas version (DataFrame.applymap is deprecated).
        annot = data.round(3).apply(lambda col: col.map(str))
        if pvalue is not None:
            pvalues = pd.read_csv(pvalue, header=0, index_col=0, sep=None, engine='python')
            annot += '\n(p=' + pvalues.round(3).apply(lambda col: col.map(str)) + ')'
    ax = sns.heatmap(data, vmin=vmin, vmax=vmax, cmap=cmap, center=center, robust=robust, annot_kws={"size": annot_fontsize},
                    annot=annot, fmt=fmt, linewidths=linewidths, linecolor=linecolor,
                    cbar=cbar, square=square, xticklabels=xticklabels, yticklabels=yticklabels)
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=label_size)
    # Honour the caller's dpi instead of a hard-coded 300.
    plt.savefig(out, dpi=dpi, bbox_inches='tight')
    plt.close()


# heatmap.__doc__ = sns.heatmap.__doc__


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `heatmap` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['heatmap'])

示例#5
0
    tickvals = []
    for ind, each_file in enumerate(files):
        df = pd.read_csv(each_file, index_col=0, sep='\t')
        x_list = [x + max(x_list) for x in range(df.shape[0])]
        labels += list(df.index)
        tickvals += x_list
        y_list = df.iloc[:, 0]
        trace = go.Scatter(x=x_list,
                           y=y_list,
                           fill='tozeroy',
                           fillcolor='lightgrey',
                           line=dict(color='lightgrey'),
                           text="lightgrey",
                           hoverinfo='text')
        traces.append(trace)
    print(labels)
    layout = go.Layout(xaxis=dict(
        tickvals=tickvals,
        ticktext=labels,
        showticklabels=True,
        dtick=1,
    ))

    fig = go.Figure(data=traces, layout=layout)
    plt(fig, filename=prefix + '.html')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['area_plot'])
    hg38_ids = avenio['Sample ID']+':'+avenio['Genomic Position']+':'+avenio['Allele Fraction']
    avenio['hg19siteID'] = [converter[x] if x in converter else x for x in hg38_ids]
    avenio = avenio.set_index('hg19siteID')
    result = hot_df.join(avenio)
    result.to_excel(out)


def match_avenio_hot(hot_table, avenio_table):
    """
    Keep the rows of the avenio table that match an entry of the hot table
    and write them to ``in_avenio_hot.xlsx``.

    An avenio row matches when some hot row has the same ``Gene`` and its
    ``pHgvs`` value, with parentheses removed, is a substring of the avenio
    row's ``Amino Acid Change``.

    :param hot_table: tab-separated file with at least ``Gene`` and ``pHgvs``
    :param avenio_table: tab-separated file with at least ``Gene`` and
        ``Amino Acid Change``
    :return: None
    """
    avenio = pd.read_csv(avenio_table, header=0, sep='\t')
    hot = pd.read_csv(hot_table, header=0, sep='\t')
    hit_inds = []
    for ind, row in avenio.iterrows():
        gene = row['Gene']
        phgvs = row['Amino Acid Change']
        for _, r in hot.iterrows():
            if gene != r['Gene']:
                continue
            try:
                matched = r['pHgvs'].replace('(', '').replace(')', '') in phgvs
            except (AttributeError, TypeError):
                # A missing value (read as a float NaN) on either side cannot
                # be compared as text; report the pair and keep scanning.
                print(r['Gene'], r['pHgvs'], phgvs)
                continue
            if matched:
                # One hit is enough for this avenio row; stop scanning.
                hit_inds.append(ind)
                break
    result = avenio.loc[hit_inds]
    result.to_excel('in_avenio_hot.xlsx')


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing the
    # five functions named in `include` (presumably exposed as command-line
    # sub-commands; confirm against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['cross_map', 'extract_hot', 'match_avenio_hot', 'process_avenio_result', 'annotation'])
示例#7
0
def boxplots(df,
             x,
             y,
             hue=None,
             out='boxplots.html',
             ncols=3,
             y_range=(0, 0),
             dot=False):
    """
    Draw one box plot, or one per level of ``hue``, into a grid and save it
    as an HTML file.

    :param df: a DataFrame, or the path of a delimited table (separator is
        sniffed; missing values are filled with 0 when read from a path)
    :param x: forwarded to ``boxplot`` as its second positional argument
    :param y: forwarded to ``boxplot`` as its third positional argument
    :param hue: optional column name; one plot is drawn per unique value,
        titled with that value
    :param out: output HTML path
    :param ncols: number of grid columns
    :param y_range: ``(start, end)`` of a y range applied to the plots; the
        default ``(0, 0)`` means "no explicit range"
    :param dot: forwarded to ``boxplot``
    :return: None
    """
    if isinstance(df, str):
        df = pd.read_csv(df, sep=None, engine='python').fillna(0)
    # Only the all-zero default means "no range". Using all() here would also
    # discard legitimate ranges that start or end at 0, such as (0, 100).
    if y_range is not None and any(y_range):
        y_range = Range1d(*y_range)
    else:
        y_range = None

    if hue:
        plots = [
            boxplot(df.loc[df[hue] == level], x, y,
                    title=level, y_range=y_range, dot=dot)
            for level in df[hue].unique()
        ]
    else:
        # Apply a requested range to the single plot as well; when none was
        # requested, leave boxplot's own default untouched.
        extra = {} if y_range is None else {'y_range': y_range}
        plots = [boxplot(df, x, y, dot=dot, **extra)]

    fig = gridplot(plots,
                   toolbar_location='left',
                   sizing_mode='stretch_width',
                   ncols=ncols)
    output_file(out, title="boxplot")
    save(fig)


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `boxplots` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['boxplots'])
示例#8
0
        table.columns = [
            new_name_dict[x] if x in new_name_dict else x
            for x in table.columns
        ]
    if new_row_name:
        new_name_dict = dict(x.strip().split('\t')[:2]
                             for x in open(new_row_name))
        table.index = [
            new_name_dict[x] if x in new_name_dict else x for x in table.index
        ]
    if sum_table:
        table = table.sum(axis=axis)
        print(table)
    if describe_table:
        table = table.describe()
        print(table)
    if sort_by:
        table.sort_values(by=sort_by,
                          inplace=True,
                          axis=axis,
                          ascending=not descending)
    if out_name.endswith('.csv'):
        table.to_csv(out_name)
    else:
        table.to_csv(out_name, sep='\t')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['table_knife'])
示例#9
0
                    corr, pval = stats.pearsonr(data.loc[p1], data.loc[p2])
                    direct = 'positive' if corr > 0 else 'negative'
                    if abs(corr) >= corr_cutoff and pval <= pval_cutoff:
                        if reg_direct:
                            regulation = 'unknown'
                            if (p1, p2) in direct_dict:
                                regulation = direct_dict[(p1, p2)]
                                if regulation == '-->':
                                    regulation = 'up'
                                else:
                                    'down'
                            elif (p2, p1) in direct_dict:
                                regulation = direct_dict[(p2, p1)]
                                if regulation == '-->':
                                    regulation = 'down'
                                else:
                                    'up'
                            f.write(
                                f'{p1}\t{p2}\t{corr}\t{direct}\t{pval}\t{regulation}\n'
                            )
                        else:
                            f.write(f'{p1}\t{p2}\t{corr}\t{direct}\t{pval}\n')
                except Exception as e:
                    print(e)
                    print(p1, p2)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['set_node_attrs', 'pair_corr'])
示例#10
0
                 geneid2symbol=None,
                 population=None,
                 correct='fdr_bh',
                 alpha=0.05,
                 top=20,
                 show_gene_limit=6,
                 only_plot_sig=False):
    gene2go = gene2go or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.ensembl.gene2go.txt"
    obo = obo or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/go-basic.obo"
    geneid2symbol = geneid2symbol or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.ensembl.id2symbol.txt"
    population = population or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.gene.list"
    for each in study:
        enrich(gene2go=gene2go,
               study=each,
               obo=obo,
               population=population,
               geneid2symbol=geneid2symbol,
               correct=correct,
               alpha=alpha,
               top=top,
               goea_out=None,
               only_plot_sig=only_plot_sig,
               dag_out=None,
               dpi=300,
               show_gene_limit=show_gene_limit)


if __name__ == '__main__':
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['enrich', 'enrich_batch'])
示例#11
0
    with gzip.open(fastq, 'rb') as f:
        while True:
            line = f.readline()
            if not line:
                break
            if line.startswith(b'@'):
                header, detail = line.strip().decode().split()
                lane = '{:0>3d}'.format(int(header.split(':')[3]))
                read_num, filtered, ctrl_num, indexes = detail.split(':')
                indexes = tuple(sorted(indexes.split('+')))
                sample = pair_index_dict.get(indexes)
                if sample:
                    w_id = sample+f'_R{read_num}'
                    if w_id not in write_obj_dict:
                        write_obj = gzip.open(f'{sample}_S1_L{lane}_R{read_num}_001.fastq.gz', 'w')
                        write_obj_dict[w_id] = write_obj
                    else:
                        write_obj = write_obj_dict[w_id]
                    write_obj.write(line)
                    write_obj.write(f.readline())
                    write_obj.write(f.readline())
                    write_obj.write(f.readline())
    # close writing obj
    for _, obj in write_obj_dict.items():
        obj.close()


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['split_fastq', 'run_splitter'])
示例#12
0
                    if on_base:
                        result.setdefault(sample,
                                          dict())[tuple(row[:2])] = row[2]
                    else:
                        depths.append(depth)
                if not on_base:
                    row = [contig, start + 1, end, median_high(depths)]
                    result.setdefault(sample, dict())[tuple(row[:3])] = row[3]

    data = pd.DataFrame(result)
    data.index.names = ['chr', 'start', 'end'
                        ] if not on_base else ['chr', 'pos']
    data = data.sort_index()
    if not on_base:
        bed_info = pd.read_table(bed, header=None, comment='#')
        if not one_based:
            bed_info.iloc[:, 1] = bed_info.iloc[:, 1] + 1
        bed_info = bed_info.set_index(list(bed_info.columns[:3]))
        bed_info.index.names = ['chr', 'start', 'end']
        data = bed_info.join(data)

    if out.endswith('xlsx'):
        data.to_excel(out, merge_cells=False)
    else:
        data.to_csv(out, sep='\t')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['depth'])
示例#13
0
    with open(vcf_lst) as f:
        for line in f:
            vcfs.append(line.strip())

    results = list()
    with Pool(p_num) as executor:
        for vcf in vcfs:
            future = executor.submit(target_mutate_status, vcf, target_genes)
            results.append(future)

    merge_result = dict()
    for each in results:
        merge_result.update(each.result())

    df = pd.DataFrame(merge_result)
    df.index.name = 'Genes'
    gene_mutated_ratio = df.apply(
        lambda x: round(sum(bool(v) for v in x) / df.shape[0], 3), axis=0)
    df.loc['mutated_ratio'] = gene_mutated_ratio
    sample_mutated_ratio = df.apply(
        lambda x: round(sum(bool(v) for v in x) / df.shape[1], 3), axis=1)
    df['mutated_ratio'] = sample_mutated_ratio
    order = sorted(df.index)
    df = df.loc[order]
    df.to_csv('stats.csv')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['stat_multi_vcf', 'filter_vcf'])
示例#14
0
文件: bar3d.py 项目: gudeqing/biodev
    lg1 = plt.Rectangle((-1, 0), 1, 1, fc="grey")
    lg2 = plt.Rectangle((-1, 0), 1, 1, fc="lightgreen")
    lg3 = plt.Rectangle((-1, 0), 1, 1, fc="yellow")
    lg4 = plt.Rectangle((-1, 0), 1, 1, fc="pink")
    lg5 = plt.Rectangle((-1, 0), 1, 1, fc="red")
    ax.legend([lg1, lg2, lg3, lg4, lg5],
              ['<0.005', '<0.01', '<0.02', '<0.03', '>=0.03'])
    # adjust tick label to axis distance
    ax.tick_params(axis='both', which='major', pad=0.1)

    plt.autoscale(enable=True, axis='both', tight=True)
    plt.savefig(out, dpi=300)


def bar3d_html(data):
    """
    Render a table as a 3D bar chart and write it to ``bar3d.html``.

    :param data: path of a delimited table (separator sniffed); the first row
        is the header and the first column is the row index. Row labels are
        used for the x axis, column labels for the y axis and cell values for
        the z axis.
    :return: None
    """
    raw = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python')

    # One [row label, column label, value] triple per cell, column by column.
    points = []
    for col in raw.columns:
        for row in raw.index:
            points.append([row, col, raw.loc[row, col]])

    chart = Bar3D()
    chart = chart.add(
        "",
        points,
        xaxis3d_opts=opts.Axis3DOpts(raw.index, type_="category"),
        yaxis3d_opts=opts.Axis3DOpts(raw.columns, type_="category"),
        zaxis3d_opts=opts.Axis3DOpts(type_="value"),
    )
    chart = chart.set_global_opts(visualmap_opts=opts.VisualMapOpts(max_=20), )
    chart.render('bar3d.html')


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds. Only
    # `bar3dplot` is listed in `include`; `bar3d_html` is not.
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['bar3dplot'])
示例#15
0

def converting(query, hgnc_custom=None, out='query_result.txt',
               prior_known_pair=None, symbol2id=False):
    """
    converting ensembl id to symbol or reverse

    :param query: file holding the list of ids/symbols to look up
    :param hgnc_custom: path of an HGNC custom download
        (https://www.genenames.org/download/custom/), e.g.
        "/nfs2/database/HGNC/custom.txt". An existing file is used as is.
        When omitted, or when the given path does not exist yet, the table is
        downloaded from genenames.org (to ``hgnc.info.txt`` if omitted).
    :param out: output file name
    :param prior_known_pair: optional two-column file of already known
        ensembl id / symbol pairs; if provided, it is used in preference for
        the conversion
    :param symbol2id: bool, set it to convert symbols to ids instead
    :return: whatever ``ParseHGNC.converting`` returns
    """
    import os
    from urllib.request import urlretrieve

    # Never overwrite a table the caller already has on disk; download only
    # when no usable file was supplied.
    need_download = hgnc_custom is None or not os.path.exists(hgnc_custom)
    hgnc_custom = hgnc_custom if hgnc_custom is not None else 'hgnc.info.txt'
    if need_download:
        urlretrieve('https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit', hgnc_custom)
    parser = ParseHGNC(hgnc_custom)
    return parser.converting(query=query, symbol2id=symbol2id, out=out, known_pair=prior_known_pair)


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `converting` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['converting'])








示例#16
0
def merge(vcfs: tuple, names: tuple, out_prefix='merged'):
    """
    Outer-join several tab-separated variant tables and write two reports.

    Each table is read with ``#`` as the comment character and its first six
    columns as the index. Its last column is renamed to the matching entry of
    ``names``; every other column gets ``<name>.`` as a prefix.

    Outputs:
      * ``<out_prefix>.detail.xls``: the full outer join
      * ``<out_prefix>.simplified.xls``: per table, one ``<name>.INFO``
        column holding ``<value after first '='>:<last ':' field of INFO>:
        <3rd and 4th ':' fields of the sample column>``, followed by the
        ``FILTER`` columns

    :param vcfs: paths of the tables to merge
    :param names: one label per table, same order and length as ``vcfs``
    :param out_prefix: prefix of the two output files
    :return: None
    :raises ValueError: if no table is given
    """
    assert len(vcfs) == len(names)
    if not vcfs:
        raise ValueError('merge() needs at least one table')
    # A tuple used as a DataFrame key is looked up as ONE column label, so
    # column selection below must use a list.
    names = list(names)

    tables = []
    for vcf, name in zip(vcfs, names):
        d = pd.read_table(vcf, comment='#', index_col=[0, 1, 2, 3, 4, 5])
        d.columns = [name + '.' + x for x in d.columns[:-1]] + [name]
        tables.append(d)

    # merge
    e = tables[0]
    for other in tables[1:]:
        e = e.join(other, how='outer')
    e.to_csv(f'{out_prefix}.detail.xls', sep='\t')

    # simplify
    def short_info(x):
        # Non-string cells are the gaps introduced by the outer join.
        if not isinstance(x, str):
            return None
        fields = x.split(':')
        return fields[0].split('=')[1] + ':' + fields[-1]

    def third_and_fourth(x):
        if not isinstance(x, str):
            return None
        fields = x.split(':')
        return fields[2] + ':' + fields[3]

    def cellwise(frame, func):
        # Element-wise map that works on every pandas version
        # (DataFrame.applymap is deprecated).
        return frame.apply(lambda col: col.map(func))

    info = cellwise(e[[x for x in e.columns if x.endswith('INFO')]], short_info)
    af_dp = cellwise(e[names], third_and_fourth)
    filter_ = e[[x for x in e.columns if x.endswith('FILTER')]]
    # Align the sample columns with the INFO columns so they add cell by cell.
    af_dp.columns = info.columns
    result = (info + ':' + af_dp).join(filter_)
    result.to_csv(f'{out_prefix}.simplified.xls', sep='\t')


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `merge` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['merge'])
示例#17
0
            else:
                raise Exception(f'{sample} 只在一组vcf中出现')
            if not var_dict and (not var_dict2):
                print(f'WARN: No hotspot mutation detected in {sample}')
            for mutation in mutations:
                if mutation in var_dict:
                    af = var_dict[mutation]
                else:
                    af = 0
                if mutation in var_dict2:
                    af2 = var_dict2[mutation]
                else:
                    af2 = 0
                if mutation in var_dict and (mutation in var_dict2):
                    consistent = 'yes'
                else:
                    consistent = 'no'
                fw.write(f'{sample}\t{mutation}\t{af}\t{af2}\t{consistent}\n')
    if sample_info:
        sample_info_df = pd.read_csv(sample_info, header=0, index_col=0, sep=None, engine='python')
        mutation_df = pd.read_csv(out_file, header=0, index_col=0, sep=None, engine='python')
        final_df = mutation_df.join(sample_info_df)
        final_df.to_excel(out_file[:-3]+'xlsx')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['batch_extract_hotspot'])


示例#18
0
    fig, axes = plt.subplots(nrows=baseline)
    if baseline == 1:
        axes = [axes]
    for i, ax in enumerate(axes):
        indexes = [i] + list(range(baseline, raw.shape[0]))
        data = raw.iloc[indexes, :]
        starts = data.iloc[:, 1]
        ends = data.iloc[:, 2]
        names = data.iloc[:, 3]
        colors = get_color_pool(len(starts))
        if not same_height:
            heights = range(len(starts))
        else:
            heights = [0] * len(starts)
        ax.set_yticks(heights)
        ax.set_ylim(-0.1, len(starts) + 0.1)
        ax.set_xlim(starts.iloc[0] - 1, ends.iloc[0] + 1)
        ax.hlines(heights, starts, ends, colors=colors, linewidth=2)
        ax.grid(linestyle='-', linewidth=0.5)
        ax.set_yticklabels([str(x) for x in names], rotation=0, fontsize=6)
        # coord = sorted(pd.concat([starts, ends]))
        # ax.set_xticks(coord)
        # ax.set_xticklabels([str(x) for x in coord], rotation=90, fontsize=6)
    plt.tight_layout()
    plt.savefig(out)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['range_lines'])
示例#19
0
    def generate_new_color(existing_colors, pastel_factor=0.5):
        """
        Draw up to 100 random candidates and return the one whose closest
        distance to ``existing_colors`` is largest.

        Candidates whose summed absolute difference from (1, 1, 1) is below
        0.1 are rejected. With no existing colors, the first accepted
        candidate is returned. Returns None if every candidate was rejected.
        """
        white = np.array([1, 1, 1])
        farthest = None
        chosen = None
        for _ in range(0, 100):
            candidate = get_random_color(pastel_factor=pastel_factor)
            # exclude some colors
            if np.absolute(np.array(candidate) - white).sum() < 0.1:
                continue
            if not existing_colors:
                return candidate
            nearest = min(color_distance(candidate, known)
                          for known in existing_colors)
            if not farthest or nearest > farthest:
                farthest, chosen = nearest, candidate
        return chosen

    color_pool = []
    for i in range(0, n):
        color_pool.append(generate_new_color(color_pool, pastel_factor=0.9))
    color_pool = [(int(x * 255), int(y * 255), int(z * 255))
                  for x, y, z in color_pool]
    color_pool = sorted(color_pool, key=lambda x: (x[0], x[1], x[2]))
    return colorlover.to_rgb(color_pool)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['pca'])
示例#20
0
            ld = dict(zip(fields, lst[:10]))
            if int(ld['e5']) - int(ld['s5']) != 1:
                raise Exception('断点坐标怎么不是相差1')
            # 第一个断点时,正链取上游序列
            if ld['s1'] == '+':
                left = gn.fetch(ld['c5'], int(ld['s5']) - extend, int(ld['e5']))
                # print(len(left))
            else:
                left = gn.fetch(ld['c5'], int(ld['s5']), int(ld['e5']) + extend)
                left = reverse_complement(left)

            # 第二个断点时,正链取下游序列
            if ld['s2'] == '+':
                right = gn.fetch(ld['c3'], int(ld['s3']), int(ld['e3']) + extend)
                # print(len(right))
            else:
                right = gn.fetch(ld['c3'], int(ld['s3']) - extend, int(ld['e3']))
                right = reverse_complement(right)

            # print(left, right)
            b1 = ':'.join([lst[0], lst[-2], lst[2]])
            b2 = ':'.join([lst[3], lst[-1], lst[5]])
            fw.write(f'>{ld["name"]} {b1}--{b2}\n')
            fw.write(left+'|'+right+'\n')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), )

示例#21
0
    :param top: 需要给两个值, 空格分隔即可. 若 top[0]<=1,则根据比例计算, 否则直接用该数;第二个数top[1]与第一个同理。
    :param index_col: 行索引号, 可以指定多列
    :param header: 列索引号, 可以指定多行
    :return:
    """
    exp = pd.read_csv(exp_matrix,
                      index_col=index_col,
                      header=header,
                      sep=None,
                      engine='python')
    exp = exp.loc[exp.sum(axis=1) > 0]
    rexp = exp.rank(ascending=False)
    select = list(top)

    if float(top[0]) <= 1:
        select[0] = int(exp.shape[0] * top[0])
    if float(top[1]) <= 1:
        select[1] = int(exp.shape[1] * top[1])
    select = [int(x) for x in select]
    ind = rexp.apply(lambda x: sum(y <= select[0] for y in x) >= select[1],
                     axis=1)
    top_exp = exp.loc[ind]
    out_name = 'top{}.exp.{}_in_{}samples.csv'.format(top_exp.shape[0],
                                                      select[0], select[1])
    top_exp.to_csv(out_name)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['top_rank_gene'])
示例#22
0
                   group_order:list=None, out='multiline.png'):
    if sample_group is not None:
        group_df = pd.read_csv(sample_group, index_col=0, header=0, sep=None, engine='python')
        group_names = set(group_df.iloc[:, 0])
        if group_order is None:
            group_names = sorted(list(group_names))
        else:
            group_names = group_order
        if type(data) == str:
            data = pd.read_csv(data, index_col=index_col, header=0, sep=None, engine='python')
        centered = data.sub(data.mean(axis=1), axis=0)
        mean_centered = pd.DataFrame()
        for name in group_names:
            target_samples = list(group_df.loc[group_df.iloc[:, 0]==name].index)
            mean_centered[name] = centered[target_samples].mean(axis=1)
        # print(geometric.mean(axis=1))
        plot_data = mean_centered
    else:
        if type(data) == str:
            plot_data = pd.read_csv(data, index_col=index_col, header=0, sep=None, engine='python')
        else:
            plot_data = data

    _plot_multiline(plot_data, out=out, annotate_at_end=annotate_at_end)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['plot_exp_lines'])

示例#23
0
import os
import pandas as pd


def merge_star_alignment_stat(log_files: list, outfile=None):
    """
    Collect the ``description | value`` lines of several log files into one
    table (one row per file) and write it as CSV.

    The sample name is the file's base name up to its first dot. The first
    five lines of every file are skipped; of the rest, only lines containing
    ``|`` are used, split at the first ``|`` into a column name and a value.
    Values are kept as text.

    :param log_files: paths of the log files to merge
    :param outfile: CSV path; defaults to ``star_alignment_stat.csv``
    :return: the merged DataFrame, indexed by sample
    """
    results = list()
    for logfile in log_files:
        sample = os.path.basename(logfile).split('.', 1)[0]
        with open(logfile) as fr:
            # Skip the five leading lines of the log.
            for _ in range(5):
                fr.readline()
            result = dict(sample=sample)
            for line in fr:
                if '|' not in line:
                    continue
                # Split on the first '|' only, so a value that itself
                # contains '|' does not break the unpacking.
                desc, value = line.split('|', 1)
                result[desc.strip()] = value.strip()
            results.append(result)
    if results:
        df = pd.DataFrame(results)
    else:
        # No input files: still produce a well-formed, empty table.
        df = pd.DataFrame(columns=['sample'])
    df = df.set_index('sample')
    outfile = 'star_alignment_stat.csv' if outfile is None else outfile
    df.to_csv(outfile)
    return df


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `merge_star_alignment_stat` (presumably exposed as a command-line
    # sub-command; confirm against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['merge_star_alignment_stat'])
示例#24
0
    diff = ImageChops.add(diff, diff, 2.0, -90)
    bbox = diff.getbbox()
    if bbox:
        img = img.crop(bbox)
    img.save(path)
    return path


def pdf2img(img, zoom_ratio=2, rotate=0, out_format='png', trim_white=True):
    """
    Extract the image of the first page of a PDF.

    :param img: input passed to ``pdf2png`` (presumably the PDF path; confirm
        against ``pdf2png``)
    :param zoom_ratio: forwarded to ``pdf2png``
    :param rotate: forwarded to ``pdf2png``
    :param out_format: forwarded to ``pdf2png``
    :param trim_white: if truthy, the converted result is additionally passed
        through ``trim_white_around``
    :return: the result of ``pdf2png``, or of ``trim_white_around`` when
        trimming is requested
    """
    converted = pdf2png(img,
                        zoom_ratio=zoom_ratio,
                        rotate=rotate,
                        out_format=out_format)
    return trim_white_around(converted) if trim_white else converted


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `pdf2img` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['pdf2img'])
示例#25
0
    if table.columns[0] == 'Unnamed: 0':
        table.columns = ['index'] + list(table.columns[1:])
    # print(table.head())
    if index_cols:
        table.set_index(index_cols, inplace=True)
    for each in files[1:]:
        each_table = pd.read_csv(each, index_col=None, header=0, sep=None, engine='python')
        if each_table.columns[0] == 'Unnamed: 0':
            each_table.columns = ['index'] + list(each_table.columns[1:])
        if index_cols:
            each_table.set_index(index_cols, inplace=True)
        table = table.join(each_table, how=how)
    table.columns = [x.strip() for x in table.columns]
    if new_col_name is not None:
        new_name_df = pd.read_csv(new_col_name, index_col=None, header=None, sep=None, engine='python')
        new_name_dict = dict(zip(new_name_df.iloc[0], new_name_df.iloc[1]))
        table.columns = [new_name_dict[x] for x in table.columns]
    out_index = True
    if not index_cols:
        out_index = False
    if not out.endswith('.csv'):
        table.to_csv(out, sep='\t', index=out_index)
    else:
        table.to_csv(out, out_index)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['merge_data'])

示例#26
0
                    venn.venn(tmp_dict, cmap="tab10", fmt="{size}\n{percentage:.2f}%", fontsize=9)
                else:
                    venn.venn(tmp_dict, cmap="tab10")
                out_name = out_prefix + '.{}.venn.{}'.format(name, graph_format)
                plt.savefig(out_name, dpi=300)
                plt.close()

            else:
                print('venn for {}?'.format(groups))
                print('venn only support 2-6 sets')

    # intersection plot
    if venn_list is None:
        if len(venn_set_dict) <= 8:
            plot(from_contents(venn_set_dict), sum_over=False, sort_categories_by=None, show_counts=True)
            plt.savefig('{}.upSet.{}'.format(out_prefix, graph_format), dpi=300)
            plt.close()
    else:
        for group, name in zip(venn_list, venn_names):
            groups = group.split(',')
            tmp_dict = {x: y for x, y in venn_set_dict.items() if x in groups}
            if len(tmp_dict) > 1:
                plot(from_contents(tmp_dict), sum_over=False, sort_categories_by=None, show_counts=True)
                plt.savefig('{}.{}.upSet.{}'.format(out_prefix, name, graph_format), dpi=300)
                plt.close()


if __name__ == '__main__':
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['run'])
示例#27
0
import PyPDF2


def pdf2txt(pdf, page_range=(0, 1), out='pdf.txt'):
    """
    Extract the text of a range of PDF pages into a text file.

    :param pdf: path of the PDF file
    :param page_range: ``(start, stop)`` page indices, 0-based, ``stop``
        exclusive; a ``stop`` beyond the last page is clamped to the page
        count
    :param out: path of the text file to write
    :return: None
    """
    with open(pdf, 'rb') as f, open(out, 'w') as f2:
        pdfReader = PyPDF2.PdfFileReader(f)
        n_pages = pdfReader.numPages
        print(" No. Of Pages :", n_pages)
        # Clamp the end of the range so that asking for more pages than the
        # document has does not abort with an IndexError part-way through.
        start, stop = page_range[0], min(page_range[1], n_pages)
        for i in range(start, stop):
            pageObject = pdfReader.getPage(i)
            f2.write(pageObject.extractText())


if __name__ == '__main__':
    # Script entry point: hand this module's namespace to xcmds, listing only
    # `pdf2txt` (presumably exposed as a command-line sub-command; confirm
    # against the xcmds documentation).
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['pdf2txt'])

示例#28
0
            gridplot.map_diag(plt.hist)
        elif diag_kind == 'scatter':
            gridplot.map_diag(sns.scatterplot)
        elif diag_kind == 'kde':
            gridplot.map(sns.kdeplot)
        else:
            gridplot.map_diag(sns.scatterplot)
        for ax, col in zip(np.diag(gridplot.axes), tdata.columns):
            ax.set_xlabel(col)
        # lower
        gridplot.map_lower(sns.scatterplot)
        gridplot.map_lower(sns.regplot, scatter=False)
        gridplot.map_lower(corr_annotate, method=corr_method)

        # upper
        gridplot.map_upper(corr_annotate,
                           method=corr_method,
                           x_pos=0.15,
                           y_pos=0.6,
                           font_size=14)

        plt.savefig(prefix +
                    f'.{group_name}.common{tdata.shape[0]}genes.{fig_format}',
                    dpi=300)
        plt.close()


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['pair_corr'])
示例#29
0
                                      ascending=False)
    stat_data = stat_data.transpose()
    stat_data.to_csv(os.path.join(outdir, 'detected_gene_type.stat.csv'))
    # plot
    df = stat_data
    order = sorted(df.index)
    df = df.loc[order]
    # print(df.head())
    colors = get_color_pool(df.shape[1])
    color_dict = dict(zip(df.columns, colors))
    data = [
        go.Bar(x=df.index,
               y=df[x] / df.sum(axis=1),
               name=x,
               marker=dict(color=color_dict[x])) for x in df.columns
    ]
    layout = go.Layout(
        title="Gene type distribution",
        # xaxis=dict(title='Sample'),
        barmode='stack',
    )
    fig = go.Figure(data=data, layout=layout)
    prefix = "GeneTypeDistribution"
    draw(fig, prefix=prefix, outdir=outdir)
    return stat_data


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['detected_gene_type_stat'])
示例#30
0
    if sample_group:
        group = pd.read_csv(sample_group,
                            header=0,
                            index_col=0,
                            sep=None,
                            engine='python')
    else:
        group = pd.DataFrame({'All': {k: k for k in samples}})
    group.index.name = 'Sample'
    group_names = group.columns
    group.reset_index('Sample', inplace=True)
    data = data.merge(group, on='Sample')
    for name in group_names:
        hue = None if sample_group is None else name
        if x_col.lower() == 'gene':
            ax = sns.boxplot(x='Gene', y='Expression', hue=hue, data=data)
        else:
            ax = sns.boxplot(x='Sample', y='Expression', hue=hue, data=data)
        ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=6, rotation=90)
        if xlabel:
            ax.set(xlabel=xlabel)
        plt.savefig(f'{prefix}{name}.boxplot.png',
                    dpi=300,
                    bbox_inches='tight')
        plt.close()


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['expr_box_plot'])