Пример #1
0
def first_work(task_file):
    """Resolve paths, genome files, tools and runtime options for one task.

    Everything is published through module-level globals because the
    worker functions of this module read them directly.

    Parameters
    ----------
    task_file : str
        Path to a task list file (one SRR name per line) living under
        ``<srr_data>/distribution/``; its file name encodes the organism
        as ``<organism>_<suffix>``.
    """

    global task_list, data_path, temp_path, result_path
    global organism, organism_genome, organism_genome_gtf, organism_genome_index
    global fasterq_dump, star, feature_counts
    global cpu_use, cache_load, remove

    # ==================== resolve the directory layout ====================
    distribution_path, task_name = os.path.split(task_file)
    srr_data = os.path.dirname(distribution_path)
    data_path = os.path.join(srr_data, 'data')
    temp_path = os.path.join(srr_data, 'temp', task_name)
    result_path = os.path.join(srr_data, 'result', task_name)
    make_dir(temp_path)
    make_dir(result_path)
    # Any SRR* file already sitting anywhere under result/ counts as done.
    finished = {
        name + '.sra'
        for walk in os.walk(os.path.join(srr_data, 'result'))
        for name in walk[2]
        if name.startswith('SRR')
    }
    # Drop tasks that are already finished.  The line must be stripped
    # BEFORE the membership test: the previous code compared the raw
    # line (including its trailing newline) against `finished`, so no
    # finished task was ever filtered out.  A context manager closes
    # the file handle deterministically.
    with open(task_file) as f:
        task_list = [line.strip() for line in f
                     if line.strip() not in finished]
    # ==================== locate the genome files ====================
    organism = '_'.join(task_name.split('_')[:-1])
    organism_genome = os.path.join(GENOME, organism)
    organism_genome_index = os.path.join(GENOME_INDEX, organism)

    for file in os.listdir(organism_genome):
        if file.endswith('.gtf'):
            organism_genome_gtf = os.path.join(organism_genome, file)
            break
    else:  # no .gtf found: the annotation is mandatory, so bail out
        print('missed gtf file of %s: %s' % (organism, organism_genome))
        exit()
    # ==================== locate the external tools ====================
    star = STAR
    fasterq_dump = FASTERQ_DUMP
    feature_counts = FEATURE_COUNTS
    # ==================== other runtime options ====================
    cpu_use = TIANHE_CPU_PER_WORKER if TIANHE else CPU_PER_WORKER
    # CACHE_LOAD holds organism names with spaces; normalise them to the
    # underscore form used everywhere else before the lookup.
    cache_load_organism = {i.replace(' ', '_') for i in CACHE_LOAD}
    cache_load = organism in cache_load_organism
    remove = REMOVE
Пример #2
0
def make_srr_dir(srr_data):
    """Create the working directory structure under *srr_data*.

    Aborts (via ``exit()``) when neither *srr_data* nor
    *srr_data*/data contains a single ``.sra`` file.
    """
    import shutil  # local import: only needed for the rebuild step

    # First look for .sra files directly in srr_data, then in srr_data/data.
    srr_list = [i for i in os.listdir(srr_data) if i.endswith('.sra')]
    if not srr_list:
        data_path = os.path.join(srr_data, 'data')
        srr_list = [i for i in os.listdir(data_path) if i.endswith('.sra')]
        if not srr_list:
            print('%s和%s:文件夹下一个SRR的文件都没有!' % (srr_data, data_path))
            exit()
    for dir_name in ['data', 'script', 'distribution', 'temp', 'result']:
        dir_path = os.path.join(srr_data, dir_name)
        # script/distribution/temp must be rebuilt from scratch, otherwise
        # stale content would interfere with the new run.  data and result
        # are kept: deleting result would throw away finished work.
        if dir_name in ['script', 'distribution', 'temp']:
            if os.path.exists(dir_path):
                # shutil.rmtree instead of `os.system('rm -r ...')`:
                # portable, no shell involved, safe with spaces in the
                # path, and raises on failure instead of silently
                # returning a non-zero status.
                shutil.rmtree(dir_path)
        print('创建文件夹:%s' % os.path.abspath(dir_path))
        make_dir(dir_path)
Пример #3
0
def srr_pool(srr_data):
    """Gather every SRR*-named result file into ``result/all_result``.

    Walks ``<srr_data>/result`` and moves each file whose name starts
    with ``SRR`` into the shared pool directory; exits when the pool
    ends up empty.
    """
    import shutil  # local import: used for the per-file moves

    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    make_dir(all_result_path)
    # Loop invariant: resolve the destination path once, not per directory.
    abs_all_result_path = os.path.abspath(all_result_path)

    print('正在移动文件')
    for dir_path, _, _ in os.walk(result_path):
        abs_dir_path = os.path.abspath(dir_path)
        if abs_dir_path == abs_all_result_path:
            continue  # never move the pool onto itself
        srr_files = [srr for srr in os.listdir(dir_path)
                     if srr.startswith('SRR')]
        if srr_files:
            print(f'{abs_dir_path}/*\t>\t{abs_all_result_path}/')
            # Move exactly the SRR* files that were counted.  The old
            # `mv {dir}/*` shell command moved EVERYTHING in the
            # directory (contradicting this function's contract) and
            # broke on paths containing spaces.
            for srr in srr_files:
                shutil.move(os.path.join(abs_dir_path, srr),
                            os.path.join(abs_all_result_path, srr))

    if len(os.listdir(all_result_path)) == 0:
        text = f'{result_path}中没有数据!'
        print(add_color(text, 'red'))
        exit()
Пример #4
0
def download_srr(srr_list, output_dir):
    """Download SRR data with a producer/consumer multi-process model.

    One producer process feeds accession ids from *srr_list* into a
    bounded queue; N_DOWNLOAD consumer processes drain it and download
    into *output_dir* (presumably via `wget_srr` — the exact argument
    contract of `consumer` is defined elsewhere; TODO confirm).
    """

    print('开始下载')
    make_dir(output_dir)  # make sure the download target exists

    # Start the producer; the queue is bounded so the producer blocks
    # instead of buffering the whole task list in memory.
    queue_size = 100
    queue = Queue(queue_size)
    pro = Process(target=producer,
                  args=(srr_list, queue, PRINT_DETAIL, queue_size))
    pro.start()

    try:
        # Start the consumer processes, numbered 1..N_DOWNLOAD.
        consumer_list = []
        new_consumer = partial(consumer, path=output_dir)
        for consumer_name in range(1, N_DOWNLOAD + 1):
            con = Process(target=new_consumer,
                          args=(consumer_name, queue, PRINT_DETAIL, wget_srr))
            con.start()
            consumer_list.append(con)
        # Wait until every consumer has finished.
        for con in consumer_list:
            con.join()
    except KeyboardInterrupt:
        print(add_color('如果是Windows下的话按两下Ctrl+C, 父进程和子进程全部马上结束', 'red'))
        print(add_color('如果是Linux下的话按两下Ctrl+C,等待子进程完成最后一个任务才会退出', 'red'))

    # Error report: an SRR counts as downloaded when a file whose base
    # name (text before the first '.') matches it exists in output_dir.
    finished = {i.split('.')[0] for i in os.listdir(output_dir)}
    error = set(srr_list) - finished
    if error:
        print('%s个SRR没有下载' % add_color(len(error), 'red'))
        print(add_color(error, 'red'))
    else:
        print(add_color('所有SRR都被下载', 'green'))
    # NOTE(review): the producer process is never joined here; the
    # program relies on the user stopping it manually (see hint below).
    print('按Ctrl+C退出')
Пример #5
0
def integrate_srr(gse_dict, srr_data, gse_data, print_detail=PRINT_DETAIL):
    """Integrate per-SRR featureCounts results into one matrix per GSE.

    Parameters
    ----------
    gse_dict : dict
        Maps each GSE id to the list of SRR ids belonging to it.
    srr_data : str
        Root directory; finished SRR files live in ``result/all_result``.
    gse_data : str
        Output root; one ``<gse>/matrix.csv`` is written per integrated GSE.
    print_detail : bool
        Whether to print the ids of the missing SRRs of incomplete GSEs.
    """

    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    all_result_srr = set(os.listdir(all_result_path))

    for gse_count, (gse, srr_list) in enumerate(gse_dict.items(), 1):
        print(f'===================={gse_count}/{len(gse_dict)}====================')
        finished_srr = [srr for srr in srr_list if srr in all_result_srr]

        # ============ progress report ============
        completion = f'({len(finished_srr)}/{len(srr_list)})'
        if len(finished_srr) == 0:
            text = f'{gse}中的SRR完全没有被处理{completion}!'
            print(add_color(text, 'red'))
        else:
            # Per-GSE completeness information
            error_srr = set(srr_list) - set(finished_srr)
            if len(error_srr) == 0:
                text = f'{gse}中的SRR处理完全{completion}!'
                print(add_color(text, 'green'))
            else:
                text = f'{gse}中的SRR缺失{completion}!'
                print(add_color(text, 'yellow'))
                if print_detail:
                    print(add_color(error_srr, 'yellow'))

            # ============ integrate GSEs whose completeness beats THRESHOLD ============
            # Only build an expression matrix when enough of the GSE's
            # SRRs have been processed.
            if len(finished_srr) / len(srr_list) < THRESHOLD:
                text = f'{gse}中SRR缺失过多,不予整合!'
                print(add_color(text, 'yellow'))
            else:
                gse_dir = os.path.join(gse_data, gse)
                make_dir(gse_dir)
                matrix_file = os.path.join(gse_dir, 'matrix.csv')

                # If a matrix.csv already exists, check whether its
                # columns match the finished SRRs so the same GSE is
                # not integrated twice.
                if os.path.exists(matrix_file):
                    with open(matrix_file, 'r', encoding='utf8') as f:
                        matrix_srr = [srr for srr in f.readline().strip().split(',') if srr]
                        if set(matrix_srr) == set(finished_srr):
                            print('已找到整合好的matrix.csv文件!')
                            continue

                # Read the gene list from one arbitrary finished file;
                # featureCounts emits genes in the same order and count
                # for every sample (verified by the original author).
                with open(os.path.join(all_result_path, finished_srr[0]), 'r', encoding='utf8') as f:
                    # skip the two featureCounts header lines
                    f.readline()
                    f.readline()
                    genes_length = []
                    genes_list = []
                    for line in f:
                        fields = line.strip().split('\t')  # split once per line
                        genes_length.append(int(fields[-2]))
                        genes_list.append(fields[0])

                # Merge the per-SRR columns in parallel.
                print('开启%d个进程整合SRR数据!' % N_INTEGRATION)
                n_worker, srr_per_worker = distribute_srr(finished_srr, n_worker=N_INTEGRATION)
                new_integration_worker = partial(integration_worker, all_result_path=all_result_path)
                # Context manager so the worker processes are always
                # released: the previous code leaked one Pool per GSE.
                with Pool(processes=n_worker) as pool:
                    result = pool.map(new_integration_worker, srr_per_worker)
                gse_data_dict = {key: every_dict[key] for every_dict in result for key in every_dict}
                gse_matrix = pd.DataFrame(gse_data_dict, index=genes_list)  # raw count matrix
                if VALUE == 'RPKM':
                    cells_numi = gse_matrix.sum(axis=0)
                    gse_matrix = gse_matrix.div(cells_numi, axis=1).div(genes_length, axis=0) * 10**9
                elif VALUE == 'TPM':
                    foo = gse_matrix.div(genes_length, axis=0) * 1000
                    foo_numi = foo.sum(axis=0)
                    gse_matrix = foo.div(foo_numi) * 10**6
                print('整合完毕,保存数据中!')

                with open(matrix_file, 'w', encoding='utf8') as f:
                    # TPM/RPKM values bottom out around three decimal
                    # places, so three decimals are enough on disk.
                    gse_matrix.to_csv(f, sep=',', header=True, index=True, float_format='%.3f')
                    text = '保存成功:%s' % os.path.abspath(matrix_file)
                    print(add_color(text, 'green'))
Пример #6
0
def split_gse(gse_data, gse_organism_dict, organism_genes_dict, coding_data,
              ncoding_data):
    """Split each GSE expression matrix into coding / non-coding files.

    Parameters
    ----------
    gse_data : str
        Directory with one sub-directory per GSE, each holding matrix.csv.
    gse_organism_dict : dict
        GSE id -> organism name (space-separated form).
    organism_genes_dict : dict
        organism -> {'coding': genes, 'ncoding': genes}.
    coding_data, ncoding_data : str
        Output roots; a ``<gse>/matrix.csv`` is written under each.
    """

    print('切分表达矩阵获得编码和非编码两个文件')
    make_dir(coding_data)
    make_dir(ncoding_data)
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        gse_file = os.path.join(gse_dir, 'matrix.csv')
        # Guard clause: report and skip when matrix.csv is missing.
        # This also covers `gse` being a stray plain file — the old
        # isdir/else structure crashed on open() in that case.
        if not os.path.isfile(gse_file):
            text = f'不存在{gse_file}'
            print(add_color(text, 'red'))
            continue
        if gse not in gse_organism_dict:
            text = f'GSE_info中没有{gse}的物种信息!'
            print(add_color(text, 'red'))
            continue

        organism = gse_organism_dict[gse].replace(' ', '_')
        if organism not in organism_genes_dict:
            text = f'{gse}: GENE_info中没有{organism}的基因信息!'
            print(add_color(text, 'red'))
            continue

        file_size = '%.3fM' % (os.path.getsize(gse_file) / (10**6))
        text = f'正在处理: {gse} {organism} {file_size} ({count}/{len(all_gse_data)})'
        print(add_color(text, 'yellow'))
        # Sets make the per-gene membership tests below O(1) even when
        # the gene collections come in as lists.
        coding = set(organism_genes_dict[organism]['coding'])
        ncoding = set(organism_genes_dict[organism]['ncoding'])
        with open(gse_file) as f:
            matrix_data = pd.read_csv(f, index_col=0)

        # Partition the matrix rows into coding and non-coding genes.
        coding_genes = [
            gene for gene in matrix_data.index if gene in coding
        ]
        ncoding_genes = [
            gene for gene in matrix_data.index if gene in ncoding
        ]
        # Save the coding-gene matrix
        if coding_genes:
            print('找到%d个Coding genes' % len(coding_genes))
            coding_dir = os.path.join(coding_data, gse)
            coding_file = os.path.join(coding_dir, 'matrix.csv')
            make_dir(coding_dir)
            with open(coding_file, 'w') as f:
                matrix_data.loc[coding_genes, :].to_csv(f, sep=',')
        else:
            text = f'{gse_file}: 未发现Coding genes'
            print(add_color(text, 'yellow'))

        # Save the non-coding-gene matrix
        if ncoding_genes:
            print('找到%d个Non coding genes' % len(ncoding_genes))
            ncoding_dir = os.path.join(ncoding_data, gse)
            ncoding_file = os.path.join(ncoding_dir, 'matrix.csv')
            make_dir(ncoding_dir)
            with open(ncoding_file, 'w') as f:
                matrix_data.loc[ncoding_genes, :].to_csv(f, sep=',')
        else:
            text = f'{gse_file}: 未发现Non coding genes'
            print(add_color(text, 'yellow'))

        text = f'处理完毕: {gse}'
        print(add_color(text, 'green'))