def first_work(task_file):
    """Define the various parameters and expose them as global variables."""
    global task_list, data_path, temp_path, result_path
    global organism, organism_genome, organism_genome_gtf, organism_genome_index
    global fasterq_dump, star, feature_counts
    global cpu_use, cache_load, remove

    # ==================== Locate the working directories ====================
    distribution_path, task_name = os.path.split(task_file)
    srr_data = os.path.dirname(distribution_path)
    data_path = os.path.join(srr_data, 'data')
    temp_path = os.path.join(srr_data, 'temp', task_name)
    result_path = os.path.join(srr_data, 'result', task_name)
    make_dir(temp_path)
    make_dir(result_path)
    finished = {
        i + '.sra'
        for walk in os.walk(os.path.join(srr_data, 'result'))
        for i in walk[2]
        if i.startswith('SRR')
    }
    # Drop the SRRs that have already been processed from task_list
    with open(task_file) as f:
        task_list = [i.strip() for i in f if i.strip() not in finished]

    # ==================== Locate the genome files ====================
    organism = '_'.join(task_name.split('_')[:-1])
    organism_genome = os.path.join(GENOME, organism)
    organism_genome_index = os.path.join(GENOME_INDEX, organism)
    for file in os.listdir(organism_genome):
        if file.endswith('.gtf'):
            organism_genome_gtf = os.path.join(organism_genome, file)
            break
    else:
        print('missing gtf file for %s: %s' % (organism, organism_genome))
        exit()

    # ==================== Locate the software ====================
    star = STAR
    fasterq_dump = FASTERQ_DUMP
    feature_counts = FEATURE_COUNTS

    # ==================== Other parameters ====================
    if TIANHE:
        cpu_use = TIANHE_CPU_PER_WORKER
    else:
        cpu_use = CPU_PER_WORKER
    cache_load_organism = {i.replace(' ', '_') for i in CACHE_LOAD}
    cache_load = organism in cache_load_organism
    remove = REMOVE

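
# first_work() and the functions below rely on a make_dir() helper that is defined
# elsewhere in this pipeline. A minimal sketch of what it is assumed to do (create the
# directory, including parents, and ignore the case where it already exists); the
# fallback is only bound when no make_dir is already in scope, so it never shadows the
# real helper.
if 'make_dir' not in globals():
    def make_dir(path):
        # os.makedirs with exist_ok=True is idempotent: it creates any missing parent
        # directories and does nothing if the target already exists.
        os.makedirs(path, exist_ok=True)
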
def make_srr_dir(srr_data):
    """Create the directory structure."""
    # First check whether the SRR_data folder contains any .sra files,
    # then fall back to SRR_data/data
    srr_list = [i for i in os.listdir(srr_data) if i.endswith('.sra')]
    if len(srr_list) == 0:
        data_path = os.path.join(srr_data, 'data')
        srr_list = [i for i in os.listdir(data_path) if i.endswith('.sra')]
        if len(srr_list) == 0:
            print('%s and %s: neither folder contains a single SRR file!' % (srr_data, data_path))
            exit()
    for dir_name in ['data', 'script', 'distribution', 'temp', 'result']:
        dir_path = os.path.join(srr_data, dir_name)
        # If the following three folders already exist they must be removed and recreated,
        # otherwise whatever is left inside them would interfere with processing.
        # result, of course, must not be recreated, or the earlier work would be wasted.
        if dir_name in ['script', 'distribution', 'temp']:
            if os.path.exists(dir_path):
                os.system('rm -r ' + dir_path)
        print('Creating directory: %s' % os.path.abspath(dir_path))
        make_dir(dir_path)

def srr_pool(srr_data):
    """Move every file under result/ whose name starts with 'SRR' into all_result_path."""
    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    make_dir(all_result_path)
    print('Moving files')
    for dir_path, _, _ in os.walk(result_path):
        abs_all_result_path = os.path.abspath(all_result_path)
        abs_dir_path = os.path.abspath(dir_path)
        if abs_dir_path != abs_all_result_path:
            file_list = [srr for srr in os.listdir(dir_path) if srr.startswith('SRR')]
            if len(file_list) > 0:
                print(f'{abs_dir_path}/*\t>\t{abs_all_result_path}/')
                command = f'mv {abs_dir_path}/* {abs_all_result_path}/'
                os.system(command)
    if len(os.listdir(all_result_path)) == 0:
        text = f'No data found in {result_path}!'
        print(add_color(text, 'red'))
        exit()

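
# srr_pool() and the functions below colour their console messages with an add_color()
# helper defined elsewhere. A minimal sketch, assuming it simply wraps the (stringified)
# argument in ANSI escape codes; the fallback is only bound when the real helper is not
# already in scope.
if 'add_color' not in globals():
    def add_color(text, color):
        # %s also covers the non-string arguments (ints, sets) passed in by the callers above.
        codes = {'red': '31', 'green': '32', 'yellow': '33'}
        return '\033[%sm%s\033[0m' % (codes.get(color, '0'), text)
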
def download_srr(srr_list, output_dir):
    """Download the data with a producer-consumer model."""
    print('Starting download')
    make_dir(output_dir)  # create the output folder

    # Start the producer
    queue_size = 100
    queue = Queue(queue_size)
    pro = Process(target=producer, args=(srr_list, queue, PRINT_DETAIL, queue_size))
    pro.start()

    try:
        # Start the consumers
        consumer_list = []
        new_consumer = partial(consumer, path=output_dir)
        for consumer_name in range(1, N_DOWNLOAD + 1):
            con = Process(target=new_consumer, args=(consumer_name, queue, PRINT_DETAIL, wget_srr))
            con.start()
            consumer_list.append(con)
        # Wait for the consumers to finish
        for con in consumer_list:
            con.join()
    except KeyboardInterrupt:
        print(add_color('On Windows, press Ctrl+C twice: the parent and all child processes stop immediately', 'red'))
        print(add_color('On Linux, press Ctrl+C twice: the program exits once the child processes finish their current task', 'red'))

    # Report any SRRs that were not downloaded
    finished = {i.split('.')[0] for i in os.listdir(output_dir)}
    error = set(srr_list) - finished
    if error:
        print('%s SRRs were not downloaded' % add_color(len(error), 'red'))
        print(add_color(error, 'red'))
    else:
        print(add_color('All SRRs have been downloaded', 'green'))
    # The child processes have to be stopped from outside
    print('Press Ctrl+C to exit')

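
# download_srr() above assumes producer()/consumer() helpers and a wget_srr() download
# function that are defined elsewhere in the pipeline. Below is a minimal sketch of the
# producer-consumer contract it appears to rely on, inferred purely from the call sites:
# the producer fills the queue and then sends one poison pill (None) per consumer, and
# each consumer keeps pulling SRR ids until it sees a pill. How wget_srr is invoked
# (here as work(srr, path)) and what producer does with queue_size are assumptions; the
# fallbacks are only bound when the real helpers are not already in scope.
if 'producer' not in globals():
    def producer(srr_list, queue, print_detail, queue_size):
        # queue_size is accepted for signature compatibility; the bounded Queue itself
        # already provides the back-pressure in this sketch.
        for srr in srr_list:
            queue.put(srr)
            if print_detail:
                print('queued %s' % srr)
        for _ in range(N_DOWNLOAD):
            queue.put(None)  # poison pill: one per consumer

if 'consumer' not in globals():
    def consumer(name, queue, print_detail, work, path):
        while True:
            srr = queue.get()
            if srr is None:  # poison pill received, this worker is done
                break
            if print_detail:
                print('worker %s is downloading %s' % (name, srr))
            work(srr, path)  # assumed call convention for wget_srr
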
def integrate_srr(gse_dict, srr_data, gse_data, print_detail=PRINT_DETAIL):
    """Integrate SRRs by GSE."""
    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    all_result_srr = set(os.listdir(all_result_path))
    for gse_count, (gse, srr_list) in enumerate(gse_dict.items(), 1):
        print(f'===================={gse_count}/{len(gse_dict)}====================')
        finished_srr = [srr for srr in srr_list if srr in all_result_srr]

        # ============ Progress report ============
        completion = f'({len(finished_srr)}/{len(srr_list)})'
        if len(finished_srr) == 0:
            text = f'None of the SRRs in {gse} have been processed {completion}!'
            print(add_color(text, 'red'))
        else:
            # Report the SRR status of each GSE
            error_srr = set(srr_list) - set(finished_srr)
            if len(error_srr) == 0:
                text = f'All SRRs in {gse} have been processed {completion}!'
                print(add_color(text, 'green'))
            else:
                text = f'Some SRRs in {gse} are missing {completion}!'
                print(add_color(text, 'yellow'))
                if print_detail:
                    print(add_color(error_srr, 'yellow'))

        # ============ Integrate the GSEs whose completion rate exceeds the threshold ============
        # A GSE is only integrated into an expression matrix when the fraction of
        # processed SRRs reaches this threshold
        if len(finished_srr) / len(srr_list) < THRESHOLD:
            text = f'Too many SRRs missing in {gse}; skipping integration!'
            print(add_color(text, 'yellow'))
        else:
            gse_dir = os.path.join(gse_data, gse)
            make_dir(gse_dir)
            matrix_file = os.path.join(gse_dir, 'matrix.csv')
            # If this GSE already has a matrix.csv file, check whether the processed SRRs
            # are already written to it, to avoid integrating them again
            if os.path.exists(matrix_file):
                with open(matrix_file, 'r', encoding='utf8') as f:
                    matrix_srr = [srr for srr in f.readline().strip().split(',') if srr]
                if set(matrix_srr) == set(finished_srr):
                    print('Found an already integrated matrix.csv file!')
                    continue

            # Read the gene list from one of the files
            # (verified: every featureCounts output lists the same genes in the same order)
            with open(os.path.join(all_result_path, finished_srr[0]), 'r', encoding='utf8') as f:
                # Skip the first two lines
                f.readline()
                f.readline()
                genes_length = []
                genes_list = []
                for line in f:
                    fields = line.strip().split('\t')
                    genes_length.append(int(fields[-2]))
                    genes_list.append(fields[0])

            # Integrate the SRRs of this GSE into an expression matrix with multiple processes
            print('Starting %d processes to integrate the SRR data!' % N_INTEGRATION)
            n_worker, srr_per_worker = distribute_srr(finished_srr, n_worker=N_INTEGRATION)
            pool = Pool(processes=n_worker)
            new_integration_worker = partial(integration_worker, all_result_path=all_result_path)
            result = pool.map(new_integration_worker, srr_per_worker)
            pool.close()
            pool.join()  # release the worker processes before moving on to the next GSE
            gse_data_dict = {key: every_dict[key] for every_dict in result for key in every_dict}
            # Build a DataFrame holding the expression matrix
            gse_matrix = pd.DataFrame(gse_data_dict, index=genes_list)
            if VALUE == 'RPKM':
                cells_numi = gse_matrix.sum(axis=0)
                gse_matrix = gse_matrix.div(cells_numi, axis=1).div(genes_length, axis=0) * 10**9
            elif VALUE == 'TPM':
                foo = gse_matrix.div(genes_length, axis=0) * 1000
                foo_numi = foo.sum(axis=0)
                gse_matrix = foo.div(foo_numi) * 10**6
            print('Integration finished, saving the data!')
            with open(matrix_file, 'w', encoding='utf8') as f:
                # TPM/RPKM values only go down to about three decimal places,
                # so the file is saved with three decimals
                gse_matrix.to_csv(f, sep=',', header=True, index=True, float_format='%.3f')
            text = 'Saved successfully: %s' % os.path.abspath(matrix_file)
            print(add_color(text, 'green'))

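
# integrate_srr() above assumes two helpers defined elsewhere: distribute_srr(), which
# splits the finished SRRs into roughly equal chunks and returns the number of workers
# actually needed together with those chunks, and integration_worker(), which parses one
# chunk of featureCounts outputs and returns {srr_name: [count, ...]} with the counts in
# gene order. A minimal sketch under those assumptions (the file layout is inferred from
# the gene-list parsing above); the fallbacks only apply when the real helpers are absent.
if 'distribute_srr' not in globals():
    def distribute_srr(srr_list, n_worker):
        n_worker = min(n_worker, len(srr_list))
        chunks = [srr_list[i::n_worker] for i in range(n_worker)]
        return n_worker, chunks

if 'integration_worker' not in globals():
    def integration_worker(srr_chunk, all_result_path):
        counts = {}
        for srr in srr_chunk:
            with open(os.path.join(all_result_path, srr), encoding='utf8') as f:
                f.readline()  # featureCounts comment line
                f.readline()  # column header line
                # the last tab-separated column of each data row is the read count
                counts[srr] = [int(line.rstrip('\n').split('\t')[-1]) for line in f]
        return counts
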
def split_gse(gse_data, gse_organism_dict, organism_genes_dict, coding_data, ncoding_data):
    """Split each expression matrix into a coding file and a non-coding file."""
    print('Splitting each expression matrix into coding and non-coding files')
    make_dir(coding_data)
    make_dir(ncoding_data)
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        gse_file = os.path.join(gse_dir, 'matrix.csv')
        if os.path.isdir(gse_dir) and not os.path.isfile(gse_file):
            # The folder exists but contains no matrix.csv: report an error
            text = f'{gse_file} does not exist'
            print(add_color(text, 'red'))
        else:
            if gse not in gse_organism_dict:
                text = f'GSE_info has no organism information for {gse}!'
                print(add_color(text, 'red'))
                continue
            organism = gse_organism_dict[gse].replace(' ', '_')
            if organism not in organism_genes_dict:
                text = f'{gse}: GENE_info has no gene information for {organism}!'
                print(add_color(text, 'red'))
                continue
            file_size = '%.3fM' % (os.path.getsize(gse_file) / (10**6))
            text = f'Processing: {gse} {organism} {file_size} ({count}/{len(all_gse_data)})'
            print(add_color(text, 'yellow'))
            coding = organism_genes_dict[organism]['coding']
            ncoding = organism_genes_dict[organism]['ncoding']
            with open(gse_file) as f:
                matrix_data = pd.read_csv(f, index_col=0)
            # Decide which row names of the matrix are coding genes and which are non-coding
            coding_genes = [gene for gene in matrix_data.index if gene in coding]
            ncoding_genes = [gene for gene in matrix_data.index if gene in ncoding]
            # Save the coding-gene matrix
            if coding_genes:
                print('Found %d coding genes' % len(coding_genes))
                coding_dir = os.path.join(coding_data, gse)
                coding_file = os.path.join(coding_dir, 'matrix.csv')
                make_dir(coding_dir)
                with open(coding_file, 'w') as f:
                    foo = matrix_data.loc[coding_genes, :]
                    foo.to_csv(f, sep=',')
            else:
                text = f'{gse_file}: no coding genes found'
                print(add_color(text, 'yellow'))
            # Save the non-coding-gene matrix
            if ncoding_genes:
                print('Found %d non-coding genes' % len(ncoding_genes))
                ncoding_dir = os.path.join(ncoding_data, gse)
                ncoding_file = os.path.join(ncoding_dir, 'matrix.csv')
                make_dir(ncoding_dir)
                with open(ncoding_file, 'w') as f:
                    foo = matrix_data.loc[ncoding_genes, :]
                    foo.to_csv(f, sep=',')
            else:
                text = f'{gse_file}: no non-coding genes found'
                print(add_color(text, 'yellow'))
            text = f'Finished: {gse}'
            print(add_color(text, 'green'))
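

# A sketch of how the functions above might be chained together in a driver script.
# All paths and the example gse_dict are placeholders; the real pipeline derives them
# from its own GSE_info / GENE_info tables and configuration constants.
if __name__ == '__main__':
    srr_data = '/path/to/SRR_data'                 # placeholder project root
    gse_data = os.path.join(srr_data, 'GSE_data')  # placeholder output folder
    make_srr_dir(srr_data)   # build the data/script/distribution/temp/result layout
    srr_pool(srr_data)       # gather finished featureCounts results into all_result
    example_gse_dict = {'GSE00000': ['SRR0000001', 'SRR0000002']}  # placeholder GSE -> SRR map
    integrate_srr(example_gse_dict, srr_data, gse_data)
    # split_gse(...) would then split each matrix into coding / non-coding files,
    # given the organism and gene-type dictionaries built from GSE_info / GENE_info.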