def get_chapter(driver, root_url, chapter_num, manga_name):
    """Fetch every page of one chapter into the manga download folder.

    Pages are requested one after another, starting at page 1. Each request
    is submitted to a thread pool and its result is awaited before the next
    page is attempted.

    Args:
        driver: Passed through unchanged to ``get_page``.
        root_url: Base URL; the chapter URL is ``{root_url}/{chapter_num}``.
        chapter_num: Chapter identifier used in the URL and directory name.
        manga_name: Name of the per-manga directory under Downloads/Manga.
    """
    chapter_url = f'{root_url}/{chapter_num}'
    path = f'{home}/Downloads/Manga/{manga_name}/Chapter_{chapter_num}/'
    create_dir(path)

    collected = []
    current_page = 1
    print(f'getting chapter {chapter_num}...')
    with concurrent.futures.ThreadPoolExecutor() as pool:
        while True:
            outcome = pool.submit(
                get_page, driver, chapter_url, current_page,
                chapter_num, manga_name, path,
            ).result()
            # '404' marks the end of the chapter.
            if outcome == '404':
                break
            # 'unknow_error' retries the same page (the retry is unbounded);
            # anything else is a page result, so move on to the next page.
            if outcome != 'unknow_error':
                collected.append(outcome)
                current_page += 1
    print(f'chapter {chapter_num} successfully getted!')
def main():
    """Run the fast_circ sub-command selected on the command line.

    Options are parsed with docopt from the module docstring. The working
    directory is taken from ``--output`` and created unless it denotes the
    current directory. Depending on which of ``parse``, ``annotate`` or
    ``denovo`` is set, ``options`` is rewritten in place with per-step
    ``--output``/``--bed``/``--tophat``/``--abs`` paths and the matching
    ``*_command`` helpers are invoked in sequence.
    """
    options = docopt(__doc__)
    command_log = 'fast_circ.py parameters: ' + ' '.join(sys.argv)
    work_dir = options['--output']
    # The original compared the string literal 'work_dir' with './', which is
    # always true, so './' was never treated as the current directory.
    if work_dir not in ('.', './'):
        create_dir(work_dir)

    if options['parse']:
        # parse fusion reads from <fusion> file
        options['--bed'] = '%s/back_spliced_junction.bed' % work_dir
        parse_command(options, command_log)
        # annotate circular RNAs
        options['--output'] = ('%s/circularRNA_known.txt'
                               % options['--output'])
        annotate_command(options, command_log)
    elif options['annotate']:
        # align fusion reads
        options['--output'] = '%s/alignment' % work_dir
        options['--bed'] = '%s/back_spliced_junction.bed' % work_dir
        align_command(options, command_log)
        # annotate circular RNAs
        # NOTE(review): '--output' was just pointed at <work_dir>/alignment,
        # so the annotation file lands inside the alignment directory,
        # unlike the parse branch -- confirm this is intended.
        options['--output'] = ('%s/circularRNA_known.txt'
                               % options['--output'])
        annotate_command(options, command_log)
    elif options['denovo']:
        # align fusion reads
        options['--output'] = '%s/alignment' % work_dir
        options['--bed'] = '%s/back_spliced_junction.bed' % work_dir
        align_command(options, command_log)
        # de novo assemble circular RNAs
        options['--tophat'] = '%s/alignment/tophat' % work_dir
        options['--output'] = '%s/assemble' % work_dir
        assemble_command(options, command_log)
        # fetch AS events of circular RNAs
        options['--output'] = '%s/denovo' % work_dir
        options['--abs'] = '%s/abs' % work_dir
        denovo_command(options, work_dir, command_log)
def download_manga_from_list():
    """Download every image listed in ``download_list.json``.

    The JSON file provides ``manga_name`` and a ``chapters`` list. Only the
    first key/value pair of each chapter entry is used: the key is the
    chapter id (its second ``_``-separated field is shown in progress
    messages) and the value is the iterable of images. Images are handed to
    ``download_img`` through a thread pool, numbered from 1 within each
    chapter.
    """
    with open('download_list.json', 'r') as list_file:
        manga_data = json.load(list_file)

    manga_name = manga_data['manga_name']
    chapter_entries = manga_data['chapters']
    home = get_home_dir()

    for entry in chapter_entries:
        # Only the first key/value pair of the entry is considered.
        chapter_id, chapter_imgs = list(entry.items())[0]
        path = f'{home}/Downloads/Manga/{manga_name}/{chapter_id}/'
        create_dir(path)

        chapter_number = chapter_id.split('_')[1]
        print(f'Downloading chapter {chapter_number}...')
        # Leaving the `with` block waits for all submitted downloads.
        with concurrent.futures.ThreadPoolExecutor() as pool:
            for index, image in enumerate(chapter_imgs, start=1):
                pool.submit(download_img, image, path, index)
        print(f'Chapter {chapter_number} Downloaded!\n')
def assemble(options):
    """Run the cufflinks-based assembly step.

    Creates the ``--output`` directory, exits if any of the external tools
    ``cufflinks``, ``genePredToGtf`` or ``gtfToGenePred`` cannot be found
    with ``which``, then filters the reference, runs the assembly and
    converts its results.

    Args:
        options: Mapping of command-line options; reads ``--output``,
            ``--tophat``, ``--ref``, ``--thread``, ``--remove-rRNA``,
            ``--max-bundle-frags``, ``--bb`` and ``--chrom-size``.
    """
    # check output directory
    out_dir = create_dir(options['--output'])
    # tophat results are taken as given
    tophat_dir = options['--tophat']

    # every external tool must be available before any work starts
    required_tools = (
        ('cufflinks', 'Cufflinks is required for CIRCexplorer2 assemble!'),
        ('genePredToGtf',
         'genePredToGtf is required for CIRCexplorer2 assemble!'),
        ('gtfToGenePred',
         'gtfToGenePred is required for CIRCexplorer2 assemble!'),
    )
    for binary, missing_msg in required_tools:
        if which(binary) is None:
            sys.exit(missing_msg)

    # the cufflinks directory is the output directory itself
    cufflinks_dir = out_dir
    create_dir(cufflinks_dir)

    # filter ref file
    ref_filter(options['--ref'], tophat_dir, cufflinks_dir)
    # assemble with cufflinks
    cufflinks_assemble(
        tophat_dir,
        cufflinks_dir,
        options['--thread'],
        options['--remove-rRNA'],
        options['--max-bundle-frags'],
    )
    # convert assembly results
    convert_assembly_gtf(
        tophat_dir,
        cufflinks_dir,
        options['--ref'],
        options['--bb'],
        options['--chrom-size'],
    )
def annotate(options):
    """Annotate fusion junctions and then fix them.

    Works inside ``<circ_dir>/annotate``, which is created after
    ``<circ_dir>`` has been validated with ``check_dir``.

    Args:
        options: Mapping of command-line options; reads ``<circ_dir>``,
            ``--ref``, ``--genome`` and ``--no-fix``.
    """
    # validate the circ directory and prepare the annotate sub-directory
    circ_dir = check_dir(options['<circ_dir>'])
    annotate_dir = '%s/annotate' % circ_dir
    create_dir(annotate_dir)

    ref = options['--ref']
    # annotate fusion junctions
    annotate_fusion(ref, annotate_dir)
    # fix fusion junctions
    fix_fusion(ref, options['--genome'], annotate_dir, options['--no-fix'])
def denovo(options):
    """Run the de novo step: annotate, fix and extract novel circRNAs.

    If ``<circ_dir>/cufflinks/transcripts_ref.txt`` exists, its lines starting
    with ``CUFF`` are written to ``<circ_dir>/denovo/combined_ref.txt``
    followed by the full contents of ``--ref``, and that combined file is
    used as the reference; otherwise ``--ref`` is used directly. With
    ``--as`` set, alternative-splicing extraction runs for the type given
    by ``--as-type`` (all types when it is empty).

    Args:
        options: Mapping of command-line options; reads ``<circ_dir>``,
            ``--tophat-dir``, ``--ref``, ``--genome``, ``--no-fix``,
            ``--as``, ``--pAplus``, ``--as-type`` and ``--rpkm``.

    Raises:
        SystemExit: In ``--as`` mode when ``--pAplus`` is missing or is not
            a directory.
    """
    # check output directory
    out_dir = check_dir(options['<circ_dir>'])
    # check tophat results
    if options['--tophat-dir']:
        tophat_dir = check_dir(options['--tophat-dir'])
    else:
        tophat_dir = check_dir(out_dir + '/tophat')
    # prepare denovo directory
    denovo_dir = '%s/denovo' % out_dir
    create_dir(denovo_dir)
    # combine ref files
    cufflinks_ref_path = '%s/cufflinks/transcripts_ref.txt' % out_dir
    if os.path.isfile(cufflinks_ref_path):
        print('Combine %s with %s to create a new ref file!' %
              (options['--ref'], cufflinks_ref_path))
        ref_path = '%s/combined_ref.txt' % denovo_dir
        # Context managers close all three files even if a read or write
        # fails; the original leaked the output and the --ref handle.
        with open(ref_path, 'w') as new_ref_f:
            with open(cufflinks_ref_path, 'r') as cuff_ref:
                for line in cuff_ref:
                    if line.startswith('CUFF'):  # only import novel isoforms
                        new_ref_f.write(line)
            with open(options['--ref'], 'r') as known_ref:
                # stream the known reference instead of loading it whole
                new_ref_f.writelines(known_ref)
    else:
        print('Warning: no cufflinks directory under %s!' % out_dir)
        print('Please run CIRCexplorer2 assembly before this step!')
        ref_path = options['--ref']
    # annotate fusion junctions
    annotate_fusion(ref_path, denovo_dir, denovo_flag=1)
    # fix fusion junctions
    fix_fusion(ref_path, options['--genome'], denovo_dir,
               options['--no-fix'], denovo_flag=1)
    # extract novel circRNAs
    extract_novel_circ(denovo_dir, options['--ref'])
    if options['--as']:
        if options['--pAplus'] and os.path.isdir(options['--pAplus']):
            pAplus_dir = os.path.abspath(options['--pAplus'])
        else:
            sys.exit('You should offer --pAplus option in --as mode!')
        # an empty --as-type selects every analysis below
        as_type = options['--as-type']
        if not as_type or as_type == 'CE':
            # extract cassette exons
            extract_cassette_exon(denovo_dir, tophat_dir, pAplus_dir,
                                  options['--rpkm'])
        if not as_type or as_type == 'RI':
            # extract retained introns
            extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir)
        if not as_type or as_type == 'ASS':
            # characterize A5SS and A3SS
            parse_splice_site(denovo_dir, tophat_dir, pAplus_dir)
def denovo(options):
    """Run the de novo step: annotate, fix and extract novel circRNAs.

    If ``<circ_dir>/cufflinks/transcripts_ref.txt`` exists, its lines starting
    with ``CUFF`` are written to ``<circ_dir>/denovo/combined_ref.txt``
    followed by the full contents of ``--ref``, and that combined file is
    used as the reference; otherwise ``--ref`` is used directly. With
    ``--as`` set, alternative-splicing extraction runs for the type given
    by ``--as-type`` (all types when it is empty).

    Args:
        options: Mapping of command-line options; reads ``<circ_dir>``,
            ``--tophat-dir``, ``--ref``, ``--genome``, ``--no-fix``,
            ``--as``, ``--pAplus``, ``--as-type`` and ``--rpkm``.

    Raises:
        SystemExit: In ``--as`` mode when ``--pAplus`` is missing or is not
            a directory.
    """
    # check output directory
    out_dir = check_dir(options['<circ_dir>'])
    # check tophat results
    if options['--tophat-dir']:
        tophat_dir = check_dir(options['--tophat-dir'])
    else:
        tophat_dir = check_dir(out_dir + '/tophat')
    # prepare denovo directory
    denovo_dir = '%s/denovo' % out_dir
    create_dir(denovo_dir)
    # combine ref files
    cufflinks_ref_path = '%s/cufflinks/transcripts_ref.txt' % out_dir
    if os.path.isfile(cufflinks_ref_path):
        print('Combine %s with %s to create a new ref file!' %
              (options['--ref'], cufflinks_ref_path))
        ref_path = '%s/combined_ref.txt' % denovo_dir
        # Context managers close all three files even if a read or write
        # fails; the original leaked the output and the --ref handle.
        with open(ref_path, 'w') as new_ref_f:
            with open(cufflinks_ref_path, 'r') as cuff_ref:
                for line in cuff_ref:
                    if line.startswith('CUFF'):  # only import novel isoforms
                        new_ref_f.write(line)
            with open(options['--ref'], 'r') as known_ref:
                # stream the known reference instead of loading it whole
                new_ref_f.writelines(known_ref)
    else:
        print('Warning: no cufflinks directory under %s!' % out_dir)
        print('Please run CIRCexplorer2 assembly before this step!')
        ref_path = options['--ref']
    # annotate fusion junctions (third positional argument kept as in the
    # original call)
    annotate_fusion(ref_path, denovo_dir, 1)
    # fix fusion junctions
    fix_fusion(ref_path, options['--genome'], denovo_dir,
               options['--no-fix'], 1)
    # extract novel circRNAs
    extract_novel_circ(denovo_dir, options['--ref'])
    if options['--as']:
        if options['--pAplus'] and os.path.isdir(options['--pAplus']):
            pAplus_dir = os.path.abspath(options['--pAplus'])
        else:
            sys.exit('You should offer --pAplus option in --as mode!')
        # an empty --as-type selects every analysis below
        as_type = options['--as-type']
        if not as_type or as_type == 'CE':
            # extract cassette exons
            extract_cassette_exon(denovo_dir, tophat_dir, pAplus_dir,
                                  options['--rpkm'])
        if not as_type or as_type == 'RI':
            # extract retained introns
            extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir)
        if not as_type or as_type == 'ASS':
            # characterize A5SS and A3SS
            parse_splice_site(denovo_dir, tophat_dir, pAplus_dir)
def check_outdir(out_dir):
    '''
    Prepare the output directory and its essential subdirectories.

    1. Hand out_dir to create_dir (presumably this creates or resets the
       directory -- verify against create_dir)
    2. Create bowtie1_index, bowtie2_index, tophat and tophat_fusion in it

    Returns the absolute path of out_dir.
    '''
    print('Check output directory...')
    create_dir(out_dir)
    dir_path = os.path.abspath(out_dir)
    # os.mkdir raises if a subdirectory already exists
    essential_subdirs = (
        '/bowtie1_index',
        '/bowtie2_index',
        '/tophat',
        '/tophat_fusion',
    )
    for subdir in essential_subdirs:
        os.mkdir(dir_path + subdir)
    return dir_path
def parse(options):
    """Parse fusion junctions produced by a supported aligner.

    Exits when ``-t`` is not one of ``STAR``, ``MapSplice`` or ``segemehl``.
    Otherwise creates the ``--output`` directory and passes ``<fusion>`` and
    ``<output>/fusion_junction.bed`` to the parser matching the aligner.

    Args:
        options: Mapping of command-line options; reads ``-t``,
            ``--output`` and ``<fusion>``.
    """
    aligner = options['-t']
    if aligner not in ('STAR', 'MapSplice', 'segemehl'):
        sys.exit('Error: CIRCexplorer2 parse does not support %s!' % aligner)

    # check output directory
    create_dir(options['--output'])
    junction_bed = os.path.abspath(options['--output']) + '/fusion_junction.bed'

    # pick the parser that matches the aligner
    parsers = {
        'STAR': star_parse,
        'MapSplice': mapsplice_parse,
        'segemehl': segemehl_parse,
    }
    parsers[aligner](options['<fusion>'], junction_bed)
def hisat_to_tophat(bam_f, denovo_dir):
    """Build a directory with ``accepted_hits.bam`` and ``junctions.bed``.

    A time-stamped ``temp<time>`` directory is created under ``denovo_dir``;
    ``bam_f`` is symlinked into it as ``accepted_hits.bam`` and indexed with
    pysam, and ``regtools junctions extract`` writes ``junctions.bed`` next
    to it, with its stderr captured in ``regtools.log``.

    Args:
        bam_f: Path to the BAM file to link.
        denovo_dir: Directory under which the temporary directory is made.

    Returns:
        The value returned by ``create_dir`` for the temporary directory.

    Raises:
        SystemExit: If ``regtools`` is not found or exits with an error.
    """
    # Local import so the block does not depend on a file-level import.
    import subprocess

    if which('regtools') is None:
        # The original message had a backslash continuation inside the
        # literal, which corrupted the emitted text.
        sys.exit('regtools is required "as" analysis when use hisat2 '
                 'mapping results!')
    o_dir = create_dir('%s/temp%f' % (denovo_dir, time.time()))
    bam_link = '%s/accepted_hits.bam' % o_dir
    junctions_bed = '%s/junctions.bed' % o_dir
    log_path = '%s/regtools.log' % o_dir

    os.symlink(os.path.realpath(bam_f), bam_link)
    pysam.index(bam_link)

    # create junctions.bed file
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    regtools_args = ['regtools', 'junctions', 'extract', '-s', '0',
                     '-o', junctions_bed, bam_link]
    print('Creating junctions.bed command:')
    print('%s 2> %s' % (' '.join(regtools_args), log_path))
    with open(log_path, 'w') as log_f:
        # Unlike `os.system(...) >> 8`, the return code is also non-zero
        # when regtools is killed by a signal.
        return_code = subprocess.call(regtools_args, stderr=log_f)
    if return_code:
        sys.exit('Error: cannot create junctions.bed file!')
    return o_dir
def denovo(options):
    """Run the de novo step and the optional AS / ABS analyses.

    If ``<--cuff>/transcripts_ref.txt`` exists, its lines starting with
    ``CUFF`` are written to ``<--output>/combined_ref.txt`` followed by the
    full contents of ``--ref``, and that combined file is used as the
    reference; otherwise ``--ref`` is used directly. Fusion junctions from
    ``--bed`` are annotated into a temporary file, fixed into
    ``<--output>/circularRNA_full.txt``, and novel circRNAs are extracted.

    With ``--as`` set (used as an output directory), ``--pAplus`` and
    ``--tophat`` may each be a directory or a file; a file is converted
    with ``hisat_to_tophat``. With ``--abs`` set, ``analyze_abs`` is run
    into that directory.

    Args:
        options: Mapping of command-line options; reads ``--output``,
            ``--cuff``, ``--ref``, ``--bed``, ``--genome``, ``--no-fix``,
            ``--as``, ``--pAplus``, ``--tophat``, ``--as-type``, ``--rpkm``
            and ``--abs``.

    Raises:
        SystemExit: In ``--as`` mode when ``--pAplus`` or ``--tophat`` is
            neither an existing directory nor an existing file.
    """
    # prepare denovo directory
    denovo_dir = options['--output']
    create_dir(denovo_dir)
    # combine ref files
    cufflinks_ref_path = '%s/transcripts_ref.txt' % options['--cuff']
    if os.path.isfile(cufflinks_ref_path):
        print('Combine %s with %s to create a new ref file!' %
              (options['--ref'], cufflinks_ref_path))
        ref_path = '%s/combined_ref.txt' % denovo_dir
        # Context managers close all three files even if a read or write
        # fails; the original leaked the output and the --ref handle.
        with open(ref_path, 'w') as new_ref_f:
            with open(cufflinks_ref_path, 'r') as cuff_ref:
                for line in cuff_ref:
                    if line.startswith('CUFF'):  # only import novel isoforms
                        new_ref_f.write(line)
            with open(options['--ref'], 'r') as known_ref:
                # stream the known reference instead of loading it whole
                new_ref_f.writelines(known_ref)
    else:
        print('Warning: no cufflinks directory %s!' % options['--cuff'])
        print('Please run CIRCexplorer2 assembly before this step!')
        ref_path = options['--ref']
    out_f = '%s/circularRNA_full.txt' % denovo_dir
    # The temporary annotated fusion file is only needed by the next two
    # calls; the `with` block closes it afterwards (the original never did).
    with tempfile.TemporaryFile(mode='w+') as fusion_tmp:
        # annotate fusion junctions
        annotate_fusion(ref_path, options['--bed'], fusion_tmp,
                        denovo_flag=1)
        # fix fusion junctions
        fix_fusion(ref_path, options['--genome'], fusion_tmp, out_f,
                   options['--no-fix'], denovo_flag=1)
    # extract novel circRNAs
    extract_novel_circ(denovo_dir, options['--ref'])
    if options['--as']:
        create_dir(options['--as'])
        if options['--pAplus'] and os.path.isdir(options['--pAplus']):
            pAplus_dir = os.path.abspath(options['--pAplus'])
        elif options['--pAplus'] and os.path.isfile(options['--pAplus']):
            pAplus_dir = hisat_to_tophat(options['--pAplus'], denovo_dir)
        else:
            sys.exit('You should offer --pAplus option in --as mode!')
        if options['--tophat'] and os.path.isdir(options['--tophat']):
            tophat_dir = os.path.abspath(options['--tophat'])
        elif options['--tophat'] and os.path.isfile(options['--tophat']):
            tophat_dir = hisat_to_tophat(options['--tophat'], denovo_dir)
        else:
            sys.exit('You should offer p(A)minus dir/file in --as mode!')
        # an empty --as-type selects every analysis below
        as_type = options['--as-type']
        if not as_type or as_type == 'CE':
            # extract cassette exons
            extract_cassette_exon(denovo_dir, tophat_dir, pAplus_dir,
                                  options['--as'], options['--rpkm'])
        if not as_type or as_type == 'RI':
            # extract retained introns
            extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir,
                                    options['--as'])
        if not as_type or as_type == 'ASS':
            # characterize A5SS and A3SS
            parse_splice_site(denovo_dir, tophat_dir, pAplus_dir,
                              options['--as'])
    if options['--abs']:
        create_dir(options['--abs'])
        analyze_abs(denovo_dir, options['--genome'], options['--abs'])