do_not_download=False): """ :param target_dir: example "s3://epionengs/80011001_HCC_Metastase/Methylation/" :param outdir: 下载结果的输出路径 :param match: 匹配文件的表达式 :param restore: 如果提供该参数,则下载前需要对文件进行还原 :param threads: 下载的并发数 :param do_not_download: 如果提供该参数,则不下载 :return: """ out = os.path.join(outdir, 'target_file.list') bucket = target_dir.split('/')[2] target_files = get_target_file_path(target_dir, out, match) if restore: if target_files: restore_files(target_files, bucket=bucket) else: print('Nothing matched!') return if not do_not_download: if restore: import time time.sleep(3600 * 12) download(target_files, outdir, bucket, threads) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), exclude=['pool', 'run_cmd'])
def run_cmd(cmd):
    """Run ``cmd`` through the shell and return its exit status."""
    return subprocess.call(cmd, shell=True)


def downBam2Fq(data):
    """
    Down-sample BAM files to paired FASTQ (samtools sort + view + fastq).

    For every sample and every ratio column one shell pipeline is built:
    name-sort the BAM, sub-sample it with ``samtools view -s <ratio>`` and
    convert it to ``<column>/<sample>.<column>.R1.fq.gz`` / ``.R2.fq.gz``.
    The pipelines are executed with up to 6 worker threads.

    :param data: path to a table with a header row. The first column is the
        sample name, the column whose header is "path" is the BAM path, and
        every other column is a sampling ratio. The header of a ratio column
        is used as the output directory name and inside the FASTQ file names.
    :return: None. Pipelines that exit with a non-zero status are reported on
        stdout instead of being silently ignored.
    """
    table = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python')
    # With index_col=0 the sample column is no longer part of `columns`, so
    # everything except "path" is a ratio column. The previous `columns[2:]`
    # slice silently skipped the first ratio column.
    ratio_columns = [col for col in table.columns if col != 'path']

    cmd_list = []
    for sample in table.index:
        bam = table.loc[sample, 'path']
        for name in ratio_columns:
            ratio = table.loc[sample, name]
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(str(name), exist_ok=True)
            fq = f'{name}/{sample}.{name}.R1.fq.gz'
            fq2 = f'{name}/{sample}.{name}.R2.fq.gz'
            # NOTE: the command line is interpreted by the shell, so paths
            # containing spaces or shell metacharacters are not supported.
            cmd = f'samtools sort -n {bam}| samtools view -s {ratio} -O BAM - | samtools fastq -1 {fq} -2 {fq2} - '
            cmd_list.append(cmd)

    with ThreadPoolExecutor(6) as pool:
        # Materialise the iterator so exit codes are actually collected.
        exit_codes = list(pool.map(run_cmd, cmd_list))

    for cmd, code in zip(cmd_list, exit_codes):
        if code != 0:
            print(f'WARN: command exited with status {code}: {cmd}')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['downBam2Fq'])
all_cls = {x.strip().split('\t')[1] for x in open(parent_info)} cls_lst = [] reason_lst = [] for cls_set, reason in result: if len(cls_set) > 1: cls_depth = map(max_distance_to_root, [tree for x in range(len(cls_set))], list(cls_set)) selected = sorted(zip(list(cls_set), cls_depth), key=lambda x:x[1])[-1][0] print(selected, 'vs ', cls_set) cls_lst.append(selected) else: if cls_set: for each in cls_set: if each not in all_cls: raise Exception(f'分类名{each}不在已知分类里!') cls_lst.append(each) else: cls_lst.append('') reason_lst.append(reason) # target_df['OKR_Name'] = cls_lst # target_df['myReason'] = reason_lst # target_df['other_reportable'] = other_reportable raw_table['OKR_Name'] = cls_lst raw_table['myReason'] = reason_lst raw_table['other_reportable'] = other_reportable raw_table.to_excel(out, merge_cells=False, index=False) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals())
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
sns.set(font_scale=0.5)


def heatmap(data, vmin:float=None, vmax:float=None, cmap:str='RdYlBu_r', center:float=None, robust=False,
            label_size=8, pvalue:str=None, annot=True, fmt='', linewidths:float=0, linecolor='white',
            cbar=True, annot_fontsize=6, square=False, xticklabels='auto', yticklabels='auto',
            out='heatmap.png', dpi=300):
    """
    Draw a seaborn heatmap from a table file and save it as an image.

    :param data: path to the table to plot; first row is the header, first
        column is the row index, the delimiter is sniffed.
    :param pvalue: optional path to a table read the same way as ``data``;
        when given (and ``annot`` is truthy) each cell annotation becomes
        "<value>\\n(p=<pvalue>)", both rounded to 3 decimals.
    :param annot: if truthy, annotate every cell with its value rounded to 3
        decimals; otherwise the value is passed to ``sns.heatmap`` unchanged.
    :param label_size: font size applied to the x tick labels.
    :param annot_fontsize: font size of the cell annotations.
    :param out: output image path.
    :param dpi: resolution used when saving the figure.
    :param vmin, vmax, cmap, center, robust, fmt, linewidths, linecolor, cbar,
        square, xticklabels, yticklabels: forwarded to ``sns.heatmap``.
    :return: None
    """
    def _as_text(frame):
        # Column-wise Series.map(str) is the element-wise str() conversion
        # that the deprecated DataFrame.applymap(str) used to perform.
        return frame.round(3).apply(lambda column: column.map(str))

    data = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python')
    if annot:
        annot = _as_text(data)
        if pvalue is not None:
            pvalues = pd.read_csv(pvalue, header=0, index_col=0, sep=None, engine='python')
            annot += '\n(p=' + _as_text(pvalues) + ')'
    ax = sns.heatmap(data, vmin=vmin, vmax=vmax, cmap=cmap, center=center, robust=robust,
                     annot_kws={"size": annot_fontsize}, annot=annot, fmt=fmt,
                     linewidths=linewidths, linecolor=linecolor, cbar=cbar, square=square,
                     xticklabels=xticklabels, yticklabels=yticklabels)
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=label_size)
    # Honour the caller's dpi; it used to be hard-coded to 300 here.
    plt.savefig(out, dpi=dpi, bbox_inches='tight')
    plt.close()


if __name__ == '__main__':
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['heatmap'])
tickvals = [] for ind, each_file in enumerate(files): df = pd.read_csv(each_file, index_col=0, sep='\t') x_list = [x + max(x_list) for x in range(df.shape[0])] labels += list(df.index) tickvals += x_list y_list = df.iloc[:, 0] trace = go.Scatter(x=x_list, y=y_list, fill='tozeroy', fillcolor='lightgrey', line=dict(color='lightgrey'), text="lightgrey", hoverinfo='text') traces.append(trace) print(labels) layout = go.Layout(xaxis=dict( tickvals=tickvals, ticktext=labels, showticklabels=True, dtick=1, )) fig = go.Figure(data=traces, layout=layout) plt(fig, filename=prefix + '.html') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['area_plot'])
hg38_ids = avenio['Sample ID']+':'+avenio['Genomic Position']+':'+avenio['Allele Fraction'] avenio['hg19siteID'] = [converter[x] if x in converter else x for x in hg38_ids] avenio = avenio.set_index('hg19siteID') result = hot_df.join(avenio) result.to_excel(out) def match_avenio_hot(hot_table, avenio_table): avenio = pd.read_csv(avenio_table, header=0, sep='\t') hot = pd.read_csv(hot_table, header=0, sep='\t') hit_inds = [] for ind, row in avenio.iterrows(): gene = row['Gene'] phgvs = row['Amino Acid Change'] for i, r in hot.iterrows(): try: if gene == r['Gene'] and r['pHgvs'].replace('(', '').replace(')', '') in phgvs: if ind in hit_inds: continue else: hit_inds.append(ind) except: print(r['Gene'], r['pHgvs'], phgvs) result = avenio.loc[hit_inds] result.to_excel('in_avenio_hot.xlsx') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['cross_map', 'extract_hot', 'match_avenio_hot', 'process_avenio_result', 'annotation'])
def boxplots(df, x, y, hue=None, out='boxplots.html', ncols=3, y_range=(0, 0), dot=False):
    """
    Draw one box plot, or one per ``hue`` level, and save them as a grid.

    :param df: a DataFrame, or a path to a table. A path is read with a
        sniffed delimiter and missing values are replaced by 0.
    :param x: forwarded to ``boxplot`` as its ``x`` argument.
    :param y: forwarded to ``boxplot`` as its ``y`` argument.
    :param hue: optional column name; when given, one plot is drawn per unique
        value of that column and titled with the value.
    :param out: path of the HTML file to write.
    :param ncols: number of columns in the plot grid.
    :param y_range: (start, end) of a y axis shared by all plots. The default
        (0, 0) means "not set"; ``boxplot`` then receives no fixed range.
    :param dot: forwarded to ``boxplot``.
    :return: None
    """
    if isinstance(df, str):
        df = pd.read_csv(df, sep=None, engine='python').fillna(0)

    # Only (0, 0) (or an empty/None value) means "unset". The previous
    # all(y_range) test also discarded valid ranges touching 0, e.g. (0, 5).
    shared_range = Range1d(*y_range) if y_range and any(y_range) else None

    if hue:
        plots = [
            boxplot(df.loc[df[hue] == level], x, y, title=level, y_range=shared_range, dot=dot)
            for level in df[hue].unique()
        ]
    else:
        # Pass the range only when one was requested so the unset case keeps
        # calling boxplot exactly as before.
        range_kwargs = {} if shared_range is None else {'y_range': shared_range}
        plots = [boxplot(df, x, y, dot=dot, **range_kwargs)]

    fig = gridplot(plots, toolbar_location='left', sizing_mode='stretch_width', ncols=ncols)
    output_file(out, title="boxplot")
    save(fig)


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['boxplots'])
table.columns = [ new_name_dict[x] if x in new_name_dict else x for x in table.columns ] if new_row_name: new_name_dict = dict(x.strip().split('\t')[:2] for x in open(new_row_name)) table.index = [ new_name_dict[x] if x in new_name_dict else x for x in table.index ] if sum_table: table = table.sum(axis=axis) print(table) if describe_table: table = table.describe() print(table) if sort_by: table.sort_values(by=sort_by, inplace=True, axis=axis, ascending=not descending) if out_name.endswith('.csv'): table.to_csv(out_name) else: table.to_csv(out_name, sep='\t') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['table_knife'])
corr, pval = stats.pearsonr(data.loc[p1], data.loc[p2]) direct = 'positive' if corr > 0 else 'negative' if abs(corr) >= corr_cutoff and pval <= pval_cutoff: if reg_direct: regulation = 'unknown' if (p1, p2) in direct_dict: regulation = direct_dict[(p1, p2)] if regulation == '-->': regulation = 'up' else: 'down' elif (p2, p1) in direct_dict: regulation = direct_dict[(p2, p1)] if regulation == '-->': regulation = 'down' else: 'up' f.write( f'{p1}\t{p2}\t{corr}\t{direct}\t{pval}\t{regulation}\n' ) else: f.write(f'{p1}\t{p2}\t{corr}\t{direct}\t{pval}\n') except Exception as e: print(e) print(p1, p2) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['set_node_attrs', 'pair_corr'])
geneid2symbol=None, population=None, correct='fdr_bh', alpha=0.05, top=20, show_gene_limit=6, only_plot_sig=False): gene2go = gene2go or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.ensembl.gene2go.txt" obo = obo or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/go-basic.obo" geneid2symbol = geneid2symbol or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.ensembl.id2symbol.txt" population = population or "/nfs2/database/Human_gene_go_kegg_annot/NewAnnot/hsa.gene.list" for each in study: enrich(gene2go=gene2go, study=each, obo=obo, population=population, geneid2symbol=geneid2symbol, correct=correct, alpha=alpha, top=top, goea_out=None, only_plot_sig=only_plot_sig, dag_out=None, dpi=300, show_gene_limit=show_gene_limit) if __name__ == '__main__': from xcmds.xcmds import xcmds xcmds(locals(), include=['enrich', 'enrich_batch'])
with gzip.open(fastq, 'rb') as f: while True: line = f.readline() if not line: break if line.startswith(b'@'): header, detail = line.strip().decode().split() lane = '{:0>3d}'.format(int(header.split(':')[3])) read_num, filtered, ctrl_num, indexes = detail.split(':') indexes = tuple(sorted(indexes.split('+'))) sample = pair_index_dict.get(indexes) if sample: w_id = sample+f'_R{read_num}' if w_id not in write_obj_dict: write_obj = gzip.open(f'{sample}_S1_L{lane}_R{read_num}_001.fastq.gz', 'w') write_obj_dict[w_id] = write_obj else: write_obj = write_obj_dict[w_id] write_obj.write(line) write_obj.write(f.readline()) write_obj.write(f.readline()) write_obj.write(f.readline()) # close writing obj for _, obj in write_obj_dict.items(): obj.close() if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['split_fastq', 'run_splitter'])
if on_base: result.setdefault(sample, dict())[tuple(row[:2])] = row[2] else: depths.append(depth) if not on_base: row = [contig, start + 1, end, median_high(depths)] result.setdefault(sample, dict())[tuple(row[:3])] = row[3] data = pd.DataFrame(result) data.index.names = ['chr', 'start', 'end' ] if not on_base else ['chr', 'pos'] data = data.sort_index() if not on_base: bed_info = pd.read_table(bed, header=None, comment='#') if not one_based: bed_info.iloc[:, 1] = bed_info.iloc[:, 1] + 1 bed_info = bed_info.set_index(list(bed_info.columns[:3])) bed_info.index.names = ['chr', 'start', 'end'] data = bed_info.join(data) if out.endswith('xlsx'): data.to_excel(out, merge_cells=False) else: data.to_csv(out, sep='\t') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['depth'])
with open(vcf_lst) as f: for line in f: vcfs.append(line.strip()) results = list() with Pool(p_num) as executor: for vcf in vcfs: future = executor.submit(target_mutate_status, vcf, target_genes) results.append(future) merge_result = dict() for each in results: merge_result.update(each.result()) df = pd.DataFrame(merge_result) df.index.name = 'Genes' gene_mutated_ratio = df.apply( lambda x: round(sum(bool(v) for v in x) / df.shape[0], 3), axis=0) df.loc['mutated_ratio'] = gene_mutated_ratio sample_mutated_ratio = df.apply( lambda x: round(sum(bool(v) for v in x) / df.shape[1], 3), axis=1) df['mutated_ratio'] = sample_mutated_ratio order = sorted(df.index) df = df.loc[order] df.to_csv('stats.csv') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['stat_multi_vcf', 'filter_vcf'])
lg1 = plt.Rectangle((-1, 0), 1, 1, fc="grey") lg2 = plt.Rectangle((-1, 0), 1, 1, fc="lightgreen") lg3 = plt.Rectangle((-1, 0), 1, 1, fc="yellow") lg4 = plt.Rectangle((-1, 0), 1, 1, fc="pink") lg5 = plt.Rectangle((-1, 0), 1, 1, fc="red") ax.legend([lg1, lg2, lg3, lg4, lg5], ['<0.005', '<0.01', '<0.02', '<0.03', '>=0.03']) # adjust tick label to axis distance ax.tick_params(axis='both', which='major', pad=0.1) plt.autoscale(enable=True, axis='both', tight=True) plt.savefig(out, dpi=300) def bar3d_html(data): raw = pd.read_csv(data, header=0, index_col=0, sep=None, engine='python') data = [(i, j, raw.loc[j, i]) for i in raw.columns for j in raw.index] c = (Bar3D().add( "", [[d[1], d[0], d[2]] for d in data], xaxis3d_opts=opts.Axis3DOpts(raw.index, type_="category"), yaxis3d_opts=opts.Axis3DOpts(raw.columns, type_="category"), zaxis3d_opts=opts.Axis3DOpts(type_="value"), ).set_global_opts(visualmap_opts=opts.VisualMapOpts(max_=20), )) c.render('bar3d.html') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['bar3dplot'])
def converting(query, hgnc_custom=None, out='query_result.txt', prior_known_pair=None, symbol2id=False):
    """
    Convert Ensembl ids to gene symbols, or the reverse.

    :param query: file listing the ids / symbols to look up.
    :param hgnc_custom: HGNC custom download table (see
        https://www.genenames.org/download/custom/), e.g.
        "/nfs2/database/HGNC/custom.txt". If omitted, a fresh copy is
        downloaded to "hgnc.info.txt". If a path is given and the file already
        exists it is used as is; it is only downloaded when missing.
    :param out: output file name.
    :param prior_known_pair: optional two-column file of already known
        Ensembl id / symbol pairs; when provided it is used preferentially.
    :param symbol2id: set to True to convert symbols to ids.
    :return: whatever ``ParseHGNC.converting`` returns.
    """
    import os
    from urllib.request import urlretrieve

    hgnc_url = (
        'https://www.genenames.org/cgi-bin/download/custom?'
        'col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym'
        '&col=gd_aliases&col=gd_pub_ensembl_id&status=Approved'
        '&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort'
        '&format=text&submit=submit'
    )

    if hgnc_custom is None:
        # No table supplied: keep the old behaviour of always refreshing the
        # default local copy.
        hgnc_custom = 'hgnc.info.txt'
        need_download = True
    else:
        # Never overwrite a table the caller explicitly pointed us at.
        need_download = not os.path.exists(hgnc_custom)

    if need_download:
        urlretrieve(hgnc_url, hgnc_custom)

    parser = ParseHGNC(hgnc_custom)
    return parser.converting(query=query, symbol2id=symbol2id, out=out, known_pair=prior_known_pair)


if __name__ == '__main__':
    from xcmds.xcmds import xcmds
    xcmds(locals(), include=['converting'])
def _map_cells(frame, func):
    """Apply ``func`` to every cell of ``frame`` (replacement for applymap)."""
    return frame.apply(lambda column: column.map(func))


def _brief_info(value):
    """Reduce 'key=A:...:Z' to 'A:Z'; non-string cells (e.g. NaN) become None."""
    if not isinstance(value, str):
        return None
    fields = value.split(':')
    return fields[0].split('=')[1] + ':' + fields[-1]


def _third_and_fourth(value):
    """Keep the 3rd and 4th ':'-separated fields; non-string cells become None."""
    if not isinstance(value, str):
        return None
    fields = value.split(':')
    return fields[2] + ':' + fields[3]


def merge(vcfs: tuple, names: tuple, out_prefix='merged'):
    """
    Outer-join several VCF-like tables and write a detailed and a brief table.

    Each file is read tab-separated with '#' as the comment character and its
    first six columns as the row index. The remaining columns are renamed to
    ``<name>.<column>``, except the last one, which becomes ``<name>``.

    Outputs:
      * ``<out_prefix>.detail.xls``: the full outer join (tab-separated).
      * ``<out_prefix>.simplified.xls``: per sample, the shortened INFO cell
        joined by ':' with the 3rd and 4th fields of the sample column
        (presumably AF and DP -- depends on the FORMAT layout of the input),
        followed by the FILTER columns.

    :param vcfs: paths of the tables to merge.
    :param names: one sample name per entry of ``vcfs``, in the same order.
    :param out_prefix: prefix of the two output files.
    :return: None
    :raises AssertionError: if ``vcfs`` and ``names`` differ in length.
    """
    assert len(vcfs) == len(names), 'vcfs and names must have the same length'

    tables = []
    for vcf, name in zip(vcfs, names):
        table = pd.read_table(vcf, comment='#', index_col=[0, 1, 2, 3, 4, 5])
        table.columns = [name + '.' + col for col in table.columns[:-1]] + [name]
        tables.append(table)

    merged = tables[0]
    for table in tables[1:]:
        merged = merged.join(table, how='outer')
    merged.to_csv(f'{out_prefix}.detail.xls', sep='\t')

    info_columns = [col for col in merged.columns if col.endswith('INFO')]
    filter_columns = [col for col in merged.columns if col.endswith('FILTER')]

    info = _map_cells(merged[info_columns], _brief_info)
    # list(...) is required: indexing with a tuple makes pandas look for one
    # column whose label is that tuple and raises KeyError.
    af_dp = _map_cells(merged[list(names)], _third_and_fourth)
    # Give both frames the same labels so the string concatenation aligns.
    af_dp.columns = info.columns

    result = (info + ':' + af_dp).join(merged[filter_columns])
    result.to_csv(f'{out_prefix}.simplified.xls', sep='\t')


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['merge'])
else: raise Exception(f'{sample} 只在一组vcf中出现') if not var_dict and (not var_dict2): print(f'WARN: No hotspot mutation detected in {sample}') for mutation in mutations: if mutation in var_dict: af = var_dict[mutation] else: af = 0 if mutation in var_dict2: af2 = var_dict2[mutation] else: af2 = 0 if mutation in var_dict and (mutation in var_dict2): consistent = 'yes' else: consistent = 'no' fw.write(f'{sample}\t{mutation}\t{af}\t{af2}\t{consistent}\n') if sample_info: sample_info_df = pd.read_csv(sample_info, header=0, index_col=0, sep=None, engine='python') mutation_df = pd.read_csv(out_file, header=0, index_col=0, sep=None, engine='python') final_df = mutation_df.join(sample_info_df) final_df.to_excel(out_file[:-3]+'xlsx') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['batch_extract_hotspot'])
fig, axes = plt.subplots(nrows=baseline) if baseline == 1: axes = [axes] for i, ax in enumerate(axes): indexes = [i] + list(range(baseline, raw.shape[0])) data = raw.iloc[indexes, :] starts = data.iloc[:, 1] ends = data.iloc[:, 2] names = data.iloc[:, 3] colors = get_color_pool(len(starts)) if not same_height: heights = range(len(starts)) else: heights = [0] * len(starts) ax.set_yticks(heights) ax.set_ylim(-0.1, len(starts) + 0.1) ax.set_xlim(starts.iloc[0] - 1, ends.iloc[0] + 1) ax.hlines(heights, starts, ends, colors=colors, linewidth=2) ax.grid(linestyle='-', linewidth=0.5) ax.set_yticklabels([str(x) for x in names], rotation=0, fontsize=6) # coord = sorted(pd.concat([starts, ends])) # ax.set_xticks(coord) # ax.set_xticklabels([str(x) for x in coord], rotation=90, fontsize=6) plt.tight_layout() plt.savefig(out) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['range_lines'])
def generate_new_color(existing_colors, pastel_factor=0.5): max_distance = None best_color = None for i in range(0, 100): color = get_random_color(pastel_factor=pastel_factor) # exclude some colors if np.absolute(np.array(color) - np.array([1, 1, 1])).sum() < 0.1: continue if not existing_colors: return color best_distance = min( [color_distance(color, c) for c in existing_colors]) if not max_distance or best_distance > max_distance: max_distance = best_distance best_color = color return best_color color_pool = [] for i in range(0, n): color_pool.append(generate_new_color(color_pool, pastel_factor=0.9)) color_pool = [(int(x * 255), int(y * 255), int(z * 255)) for x, y, z in color_pool] color_pool = sorted(color_pool, key=lambda x: (x[0], x[1], x[2])) return colorlover.to_rgb(color_pool) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['pca'])
ld = dict(zip(fields, lst[:10])) if int(ld['e5']) - int(ld['s5']) != 1: raise Exception('断点坐标怎么不是相差1') # 第一个断点时,正链取上游序列 if ld['s1'] == '+': left = gn.fetch(ld['c5'], int(ld['s5']) - extend, int(ld['e5'])) # print(len(left)) else: left = gn.fetch(ld['c5'], int(ld['s5']), int(ld['e5']) + extend) left = reverse_complement(left) # 第二个断点时,正链取下游序列 if ld['s2'] == '+': right = gn.fetch(ld['c3'], int(ld['s3']), int(ld['e3']) + extend) # print(len(right)) else: right = gn.fetch(ld['c3'], int(ld['s3']) - extend, int(ld['e3'])) right = reverse_complement(right) # print(left, right) b1 = ':'.join([lst[0], lst[-2], lst[2]]) b2 = ':'.join([lst[3], lst[-1], lst[5]]) fw.write(f'>{ld["name"]} {b1}--{b2}\n') fw.write(left+'|'+right+'\n') if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), )
:param top: 需要给两个值, 空格分隔即可. 若 top[0]<=1,则根据比例计算, 否则直接用该数;第二个数top[1]与第一个同理。 :param index_col: 行索引号, 可以指定多列 :param header: 列索引号, 可以指定多行 :return: """ exp = pd.read_csv(exp_matrix, index_col=index_col, header=header, sep=None, engine='python') exp = exp.loc[exp.sum(axis=1) > 0] rexp = exp.rank(ascending=False) select = list(top) if float(top[0]) <= 1: select[0] = int(exp.shape[0] * top[0]) if float(top[1]) <= 1: select[1] = int(exp.shape[1] * top[1]) select = [int(x) for x in select] ind = rexp.apply(lambda x: sum(y <= select[0] for y in x) >= select[1], axis=1) top_exp = exp.loc[ind] out_name = 'top{}.exp.{}_in_{}samples.csv'.format(top_exp.shape[0], select[0], select[1]) top_exp.to_csv(out_name) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['top_rank_gene'])
group_order:list=None, out='multiline.png'): if sample_group is not None: group_df = pd.read_csv(sample_group, index_col=0, header=0, sep=None, engine='python') group_names = set(group_df.iloc[:, 0]) if group_order is None: group_names = sorted(list(group_names)) else: group_names = group_order if type(data) == str: data = pd.read_csv(data, index_col=index_col, header=0, sep=None, engine='python') centered = data.sub(data.mean(axis=1), axis=0) mean_centered = pd.DataFrame() for name in group_names: target_samples = list(group_df.loc[group_df.iloc[:, 0]==name].index) mean_centered[name] = centered[target_samples].mean(axis=1) # print(geometric.mean(axis=1)) plot_data = mean_centered else: if type(data) == str: plot_data = pd.read_csv(data, index_col=index_col, header=0, sep=None, engine='python') else: plot_data = data _plot_multiline(plot_data, out=out, annotate_at_end=annotate_at_end) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['plot_exp_lines'])
import os
import pandas as pd


def merge_star_alignment_stat(log_files: list, outfile=None):
    """
    Collect the "description | value" lines of several log files in one table.

    The sample name is the log file's base name up to its first '.'. The
    first five lines of every log are skipped; from the remaining lines each
    one containing '|' is split into a stripped description (column name) and
    a stripped value (kept as text).

    :param log_files: paths of the log files to merge.
    :param outfile: CSV path to write; 'star_alignment_stat.csv' when None.
    :return: DataFrame indexed by sample, one column per description.
    """
    def _read_one(path):
        record = dict(sample=os.path.basename(path).split('.', 1)[0])
        with open(path) as handle:
            # Drop the five leading lines before looking for key/value pairs.
            for _ in range(5):
                handle.readline()
            for text in handle:
                if '|' not in text:
                    continue
                key, val = text.split('|')
                record[key.strip()] = val.strip()
        return record

    records = [_read_one(path) for path in log_files]
    df = pd.DataFrame(records).set_index('sample')
    if outfile is None:
        outfile = 'star_alignment_stat.csv'
    df.to_csv(outfile)
    return df


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['merge_star_alignment_stat'])
diff = ImageChops.add(diff, diff, 2.0, -90) bbox = diff.getbbox() if bbox: img = img.crop(bbox) img.save(path) return path def pdf2img(img, zoom_ratio=2, rotate=0, out_format='png', trim_white=True): """ 把第一页的pdf图片抠出来 :param img: :param zoom_ratio: :param rotate: :param out_format: :param trim_white: :return: """ img = pdf2png(img, zoom_ratio=zoom_ratio, rotate=rotate, out_format=out_format) if trim_white: img = trim_white_around(img) return img if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['pdf2img'])
if table.columns[0] == 'Unnamed: 0': table.columns = ['index'] + list(table.columns[1:]) # print(table.head()) if index_cols: table.set_index(index_cols, inplace=True) for each in files[1:]: each_table = pd.read_csv(each, index_col=None, header=0, sep=None, engine='python') if each_table.columns[0] == 'Unnamed: 0': each_table.columns = ['index'] + list(each_table.columns[1:]) if index_cols: each_table.set_index(index_cols, inplace=True) table = table.join(each_table, how=how) table.columns = [x.strip() for x in table.columns] if new_col_name is not None: new_name_df = pd.read_csv(new_col_name, index_col=None, header=None, sep=None, engine='python') new_name_dict = dict(zip(new_name_df.iloc[0], new_name_df.iloc[1])) table.columns = [new_name_dict[x] for x in table.columns] out_index = True if not index_cols: out_index = False if not out.endswith('.csv'): table.to_csv(out, sep='\t', index=out_index) else: table.to_csv(out, out_index) if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['merge_data'])
venn.venn(tmp_dict, cmap="tab10", fmt="{size}\n{percentage:.2f}%", fontsize=9) else: venn.venn(tmp_dict, cmap="tab10") out_name = out_prefix + '.{}.venn.{}'.format(name, graph_format) plt.savefig(out_name, dpi=300) plt.close() else: print('venn for {}?'.format(groups)) print('venn only support 2-6 sets') # intersection plot if venn_list is None: if len(venn_set_dict) <= 8: plot(from_contents(venn_set_dict), sum_over=False, sort_categories_by=None, show_counts=True) plt.savefig('{}.upSet.{}'.format(out_prefix, graph_format), dpi=300) plt.close() else: for group, name in zip(venn_list, venn_names): groups = group.split(',') tmp_dict = {x: y for x, y in venn_set_dict.items() if x in groups} if len(tmp_dict) > 1: plot(from_contents(tmp_dict), sum_over=False, sort_categories_by=None, show_counts=True) plt.savefig('{}.{}.upSet.{}'.format(out_prefix, name, graph_format), dpi=300) plt.close() if __name__ == '__main__': from xcmds.xcmds import xcmds xcmds(locals(), include=['run'])
import PyPDF2


def pdf2txt(pdf, page_range=(0, 1), out='pdf.txt'):
    """
    Write the extracted text of a range of PDF pages to a text file.

    :param pdf: path of the PDF to read.
    :param page_range: (first, stop) page indices; pages first .. stop - 1 are
        extracted, in order.
    :param out: path of the text file to write.
    :return: None. The page count of the PDF is printed to stdout.
    """
    with open(pdf, 'rb') as pdf_handle, open(out, 'w') as text_handle:
        reader = PyPDF2.PdfFileReader(pdf_handle)
        print(" No. Of Pages :", reader.numPages)
        page_no, stop = page_range[0], page_range[1]
        while page_no < stop:
            page = reader.getPage(page_no)
            text_handle.write(page.extractText())
            page_no += 1


if __name__ == '__main__':
    from xcmds import xcmds
    xcmds.xcmds(locals(), include=['pdf2txt'])
gridplot.map_diag(plt.hist) elif diag_kind == 'scatter': gridplot.map_diag(sns.scatterplot) elif diag_kind == 'kde': gridplot.map(sns.kdeplot) else: gridplot.map_diag(sns.scatterplot) for ax, col in zip(np.diag(gridplot.axes), tdata.columns): ax.set_xlabel(col) # lower gridplot.map_lower(sns.scatterplot) gridplot.map_lower(sns.regplot, scatter=False) gridplot.map_lower(corr_annotate, method=corr_method) # upper gridplot.map_upper(corr_annotate, method=corr_method, x_pos=0.15, y_pos=0.6, font_size=14) plt.savefig(prefix + f'.{group_name}.common{tdata.shape[0]}genes.{fig_format}', dpi=300) plt.close() if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['pair_corr'])
ascending=False) stat_data = stat_data.transpose() stat_data.to_csv(os.path.join(outdir, 'detected_gene_type.stat.csv')) # plot df = stat_data order = sorted(df.index) df = df.loc[order] # print(df.head()) colors = get_color_pool(df.shape[1]) color_dict = dict(zip(df.columns, colors)) data = [ go.Bar(x=df.index, y=df[x] / df.sum(axis=1), name=x, marker=dict(color=color_dict[x])) for x in df.columns ] layout = go.Layout( title="Gene type distribution", # xaxis=dict(title='Sample'), barmode='stack', ) fig = go.Figure(data=data, layout=layout) prefix = "GeneTypeDistribution" draw(fig, prefix=prefix, outdir=outdir) return stat_data if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['detected_gene_type_stat'])
if sample_group: group = pd.read_csv(sample_group, header=0, index_col=0, sep=None, engine='python') else: group = pd.DataFrame({'All': {k: k for k in samples}}) group.index.name = 'Sample' group_names = group.columns group.reset_index('Sample', inplace=True) data = data.merge(group, on='Sample') for name in group_names: hue = None if sample_group is None else name if x_col.lower() == 'gene': ax = sns.boxplot(x='Gene', y='Expression', hue=hue, data=data) else: ax = sns.boxplot(x='Sample', y='Expression', hue=hue, data=data) ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=6, rotation=90) if xlabel: ax.set(xlabel=xlabel) plt.savefig(f'{prefix}{name}.boxplot.png', dpi=300, bbox_inches='tight') plt.close() if __name__ == '__main__': from xcmds import xcmds xcmds.xcmds(locals(), include=['expr_box_plot'])