예제 #1
0
def extract_marker_gene_reads(bams,out_prefix,cores):
    """Extract reads from ``bams`` and merge the per-worker FASTQ chunks.

    The worker count is the smallest of ``len(bams)``, ``cores`` and
    ``cpu_count()``.  After ``extract_reads.extract_reads`` has run, the
    chunk files ``<out_prefix>.<i>.<kind>.fastq`` (``kind`` being ``single``,
    ``pe1`` or ``pe2``) are concatenated into ``<out_prefix>.<kind>.fastq``
    and the chunk files that exist are removed.

    Args:
        bams: sequence of BAM inputs handed to ``extract_reads.extract_reads``.
        out_prefix: path prefix shared by the chunk files and merged outputs.
        cores: upper bound on the number of workers requested by the caller.
    """
    kinds = ('single', 'pe1', 'pe2')
    N = min(len(bams), cores, cpu_count())
    extract_reads.extract_reads(bamlist=bams, cores=N, prefix=out_prefix)

    for kind in kinds:
        _merge_read_chunks(out_prefix, N, kind)

    # remove temporary files (same order as before: single, pe1, pe2 per chunk)
    leftovers = []
    for i in range(N):
        for kind in kinds:
            chunk = '%s.%d.%s.fastq' % (out_prefix, i, kind)
            if os.path.exists(chunk):
                leftovers.append(chunk)
    # `rm` without operands prints an error and exits non-zero, so only
    # invoke it when there is actually something to delete.
    if leftovers:
        subprocess.call(['rm'] + leftovers)


def _merge_read_chunks(out_prefix, n_chunks, kind):
    """Concatenate the non-empty ``<out_prefix>.<i>.<kind>.fastq`` chunks.

    Nothing is written when every chunk is empty, so no merged file is
    created in that case.
    """
    chunks = []
    for i in range(n_chunks):
        chunk = '%s.%d.%s.fastq' % (out_prefix, i, kind)
        if file_size(chunk) > 0:
            chunks.append(chunk)
    if chunks:
        # `with` guarantees the merged file is closed even if `cat` cannot
        # be started.
        with open('%s.%s.fastq' % (out_prefix, kind), 'w') as merged:
            subprocess.call(['cat'] + chunks, stdout=merged)
예제 #2
0
def extract_bam_reads(bam, verbose=False):
    '''
    Extract the short reads of the input bam file into one FASTA file.

    The reads are written by ``extract_reads.extract_reads`` using a prefix
    made of the current working directory and the BAM base name (extension
    dropped).  The three files it returns are concatenated, piped through
    ``seqtk seq -A`` and stored as ``<prefix>.fasta``; the three intermediate
    files are then deleted.

    bam     -- path of the BAM file to read.
    verbose -- when true, log the BAM base name via ``logging.info``.

    Returns the path of the FASTA file (``<prefix>.fasta``).
    '''
    # Imported here so the function does not depend on the module header;
    # quote() lives in shlex on Python 3 and in pipes on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if verbose:
        logging.info('extract short reads from ' + os.path.basename(bam))

    prefix = os.path.join(os.getcwd(),os.path.splitext(os.path.basename(bam))[0])
    fasta = prefix + '.fasta'
    PE1,PE2,SIN = extract_reads.extract_reads(bamlist=[bam],prefix=prefix)
    # merge -- every path is shell-quoted so that whitespace or shell
    # metacharacters in the BAM name / working directory cannot break or
    # alter the pipeline.
    os.system('cat {} {} {} | seqtk seq -A > {}'.format(
        quote(PE1), quote(PE2), quote(SIN), quote(fasta)))
    # remove temporary file
    call(['rm','-f',PE1,PE2,SIN])
    return fasta
예제 #3
0
def extract_bam_reads(bam, verbose=False):
    '''
    Extract the short reads of the input bam file into one FASTA file.

    The reads are written by ``extract_reads.extract_reads`` using a prefix
    made of the current working directory and the BAM base name (extension
    dropped).  The three files it returns are concatenated, piped through
    ``seqtk seq -A`` and stored as ``<prefix>.fasta``; the three intermediate
    files are then deleted.

    bam     -- path of the BAM file to read.
    verbose -- when true, log the BAM base name via ``logging.info``.

    Returns the path of the FASTA file (``<prefix>.fasta``).
    '''
    # Imported here so the function does not depend on the module header;
    # quote() lives in shlex on Python 3 and in pipes on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if verbose:
        logging.info('extract short reads from ' + os.path.basename(bam))

    prefix = os.path.join(os.getcwd(),os.path.splitext(os.path.basename(bam))[0])
    fasta = prefix + '.fasta'
    PE1,PE2,SIN = extract_reads.extract_reads(bamlist=[bam],prefix=prefix)
    # merge -- every path is shell-quoted so that whitespace or shell
    # metacharacters in the BAM name / working directory cannot break or
    # alter the pipeline.
    os.system('cat {} {} {} | seqtk seq -A > {}'.format(
        quote(PE1), quote(PE2), quote(SIN), quote(fasta)))
    # remove temporary file
    call(['rm','-f',PE1,PE2,SIN])
    return fasta
예제 #4
0
def extract_sample_gene_read(params):
    """Run single-core read extraction for one ``(prefix, bam)`` pair.

    ``params`` must unpack into exactly two items: the output prefix for the
    sample and the BAM file to read.  Taking one packed argument presumably
    lets the function be used with a pool ``map`` -- confirm against callers.

    Always returns ``None``.
    """
    sample_prefix, bam_path = params
    extract_reads.extract_reads(cores=1,
                                prefix=sample_prefix,
                                bamlist=[bam_path])
예제 #5
0
def extract_marker_gene_reads(bams, out_prefix, cores):
    """Extract reads from ``bams`` with a bounded number of workers.

    The worker count passed on to ``extract_reads.extract_reads`` is the
    smallest of the number of BAM files, the requested ``cores`` and the
    value reported by ``cpu_count()``.
    """
    worker_limits = (len(bams), cores, cpu_count())
    n_workers = min(worker_limits)
    extract_reads.extract_reads(
        bamlist=bams,
        prefix=out_prefix,
        cores=n_workers,
    )
예제 #6
0
파일: FUCHS.py 프로젝트: daaaaande/FUCHS
def main():
    """Command-line entry point of the FUCHS pipeline.

    Parses ``sys.argv``, validates the supplied paths (relative paths are
    made absolute, missing output/tmp folders are created) and then runs the
    pipeline steps 1-8 in order, skipping any step listed with ``-s``.
    Progress is appended to ``<outFolder>/<sample>.logfile.<timestamp>``.

    Invalid command lines and missing input files print an error message and
    terminate the process with exit status 1 (``SystemExit``).
    """
    # required packages
    import os
    import sys
    import argparse
    import datetime
    import time

    parser = argparse.ArgumentParser(
        description='Main script of the FUCHS pipeline. '
        'For a detailed help see https://github.com/dieterich-lab/FUCHS '
        'or the included README.rst file.')

    # input
    parser.add_argument(
        '-C',
        '--circIDs',
        dest='circlefile',
        default='none',
        help='Tab-separated file chr:start_end(tab)read1,read2,read3.')
    parser.add_argument(
        '-D',
        '--DCC',
        dest='CircRNACount',
        default='none',
        help=
        'If you mapped with STAR and are using step1 you need to provide a list'
        ' of circle ids (CircRNACount or CircCoordinates from DCC). '
        'You must supply either -C or -D/--DCC.')
    parser.add_argument(
        '-J',
        '--chimericJunctions',
        dest='chimeric_junction',
        default='none',
        help=
        'If you mapped with STAR and are using step1 you need to provide the paired end Chimeric.junction.out file here'
    )
    parser.add_argument(
        '-F',
        '--mate1',
        dest='mate1',
        default='none',
        help=
        'If you mapped with STAR and are using step1 you need to provide the mate1.Chimeric.junction.out file here (optional if ends were mapped separately)'
    )
    parser.add_argument(
        '-R',
        '--mate2',
        dest='mate2',
        default='none',
        help=
        'If you mapped with STAR and are using step1 you need to provide the mate2.Chimeric.junction.out file here (optional if ends were mapped separately)'
    )
    parser.add_argument(
        '-B',
        '--bamfile',
        dest='bamfile',
        required=True,
        help=
        'BAM file containing chimeric reads, linear reads may be in it but are not required.'
    )
    parser.add_argument('-A',
                        '--annotation',
                        dest='bedfile',
                        required=True,
                        help='bed formatted feature file including exons.')
    # output
    parser.add_argument(
        '-O',
        '--outFolder',
        dest='out_folder',
        default='.',
        help=
        'Output folder. There will be a sub folder for the sample containing a BAM file '
        'for each circle.')
    parser.add_argument('-N',
                        '--sampleName',
                        dest='sample',
                        required=True,
                        help='sample name to title every thing.')

    # options
    parser.add_argument(
        '-r',
        '--thresholdReads',
        dest='reads',
        default=5,
        type=int,
        help='Circle has to have at least <r> reads to be analysed.')

    # TODO: default: no multi map
    parser.add_argument(
        '-q',
        '--thresholdMapq',
        dest='mapq',
        default=3,
        type=int,
        help=
        'MAPQ cutoff, only reads passing this threshold will be written to circle BAM file.'
    )
    # TODO: add 0 based info
    parser.add_argument('-c',
                        '--splitCharacter',
                        dest='split_character',
                        default='_',
                        help='feature name separator.')
    parser.add_argument(
        '-e',
        '--exonIndex',
        dest='exon_index',
        default=3,
        type=int,
        help=
        'Field indicating the exon number after splitting feature name by split_character (for the annotation file).'
    )
    parser.add_argument(
        '-p',
        '--annotationFormat',
        dest='ref_platform',
        default='refseq',
        help=
        'Specifies the annotation platform which was used (refseq or ensembl)')
    parser.add_argument(
        '-s',
        '--skipSteps',
        dest='skipped_steps',
        default='none',
        help=
        'Comma separated list of steps that should be skipped (e.g. step3,step4,step6)'
    )
    parser.add_argument(
        '-T',
        '--tmp',
        dest='tmp_folder',
        default='/tmp/',
        help='Folder to store temporary files generated by pybedtools.')

    parser.add_argument('-P',
                        '--cpus',
                        dest='num_cpus',
                        default=4,
                        type=int,
                        help='Number of CPUs used.')

    args = parser.parse_args()

    # parse arguments; the literal string 'none' marks "option not supplied"
    circles = os.path.expanduser(args.circlefile)
    circle_ids = os.path.expanduser(args.CircRNACount)
    paired = os.path.expanduser(
        args.chimeric_junction)  # not the greatest naming scheme
    mate1 = os.path.expanduser(args.mate1)
    mate2 = os.path.expanduser(args.mate2)
    bamfile = os.path.expanduser(args.bamfile)
    bedfile = os.path.expanduser(args.bedfile)
    outfolder = os.path.expanduser(args.out_folder) + '/'
    sample = args.sample
    num_cpus = args.num_cpus

    cutoff_reads = args.reads
    cutoff_mapq = args.mapq
    exon_index = args.exon_index
    split_character = args.split_character
    platform = args.ref_platform
    skipped_steps = args.skipped_steps.split(',')
    tmp_folder = os.path.expanduser(args.tmp_folder) + '/'

    def fail(message):
        """Print ``message`` and stop with a non-zero exit status."""
        print(message)
        sys.exit(1)

    def absolute(path, label):
        """Return ``path`` as an absolute path, reporting any change."""
        if not os.path.isabs(path):
            path = os.path.abspath(os.path.join(os.getcwd(), path))
            print('changed %s to %s\n' % (label, path))
        return path

    def existing_file(path, label):
        """Make ``path`` absolute and abort when it does not exist."""
        path = absolute(path, label)
        if not os.path.exists(path):
            fail('ERROR, no such file or directory: %s' % (path))
        return path

    def optional_file(path, label):
        """Like ``existing_file`` but leaves the 'none' placeholder alone."""
        if path == 'none':
            return path
        return existing_file(path, label)

    def ensured_folder(path, label):
        """Make ``path`` absolute and create the folder when it is missing."""
        path = absolute(path, label)
        if not os.path.isdir(path):
            os.mkdir(path)
        return path

    # start writing down FUCHS time for retracing
    print('Started FUCHS at %s' % (datetime.datetime.now()))
    dt = str(datetime.datetime.now())
    start_time = time.time()
    # make log file
    # TODO
    # test if command line was correct
    if circles == 'none' and circle_ids == 'none':
        fail(
            'ERROR, you need to specify either a -C or -D/--DCC.\nIf you mapped and detected your circRNAs with STAR/DCC you may indicate \n-D CircRNACount, -J Chimeric.junction.out, -F mate1.Chimeric.junction.out and -R mate2.Chimeric.junction.out\nif you used a different program, please supply a circID list using -C.\n'
        )

    if not circles == 'none' and not circle_ids == 'none':
        print(
            'You have indicated both -C and -D, this is not necessary, we will skip step1 (read extraction from the STAR output) and proceed with the circID file\n'
        )
        # Must be an assignment: with the DCC file still set, the junction
        # file check below would abort although -C was supplied.
        circle_ids = 'none'

    if circle_ids == 'none' and 'step1' not in skipped_steps:
        # Step 1 derives the read list from the DCC output.  Without a DCC
        # file there is nothing to derive it from, and running the step
        # would replace the -C file path by '<junction file>.reads.txt'.
        skipped_steps.append('step1')

    if not circle_ids == 'none' and paired == 'none':
        fail(
            'You have indicated that you detected your circRNAs using STAR/DCC with the -D flag, \nhowever you did not specify a Chimeric.junction.out file, this is necessary, \nplease specify at least -J, if you have paired end data also specify -F/-R\n'
        )

    # convert relative paths names to absolute path names
    # (order kept: the output and tmp folders are created before the
    # annotation file is checked)
    circles = optional_file(circles, 'circID file')
    circle_ids = optional_file(circle_ids, 'CircRNACount file')
    paired = optional_file(paired, 'Chimeric.junction.out file')
    mate2 = optional_file(mate2, 'mate2.Chimeric.junction.out file')
    mate1 = optional_file(mate1, 'mate1.Chimeric.junction.out file')
    bamfile = existing_file(bamfile, 'bamfile file')
    outfolder = ensured_folder(outfolder, 'output folder')
    tmp_folder = ensured_folder(tmp_folder, 'tmp folder')
    bedfile = existing_file(bedfile, 'bedfile file')

    accepted_platforms = ('refseq', 'ensembl')
    platform = platform.lower()
    if platform not in accepted_platforms:
        fail(
            'ERROR please specify an accepted annotation platform. Possible options are: refseq or ensembl'
        )

    # print() with a single argument behaves the same on Python 2 and 3
    print("The following analysis steps will be skipped: " +
          '%s' % ', '.join(map(str, skipped_steps)))

    logfile = '%s/%s.logfile.%s' % (outfolder, sample, dt.replace(' ', '_'))

    def log(text, mode='a'):
        """Write ``text`` to the run's log file (appending by default)."""
        with open(logfile, mode) as handle:
            handle.write(text)

    # Step 1: (optional) if DCC was used, extract circle read names from junction file
    log('FUCHS is starting at %s\n\n' % (dt), mode='w')
    log('%s: starting to get readnames from Chimeric.junction.out\n' %
        (datetime.datetime.now()))
    if 'step1' not in skipped_steps:

        circles = "%s.reads.txt" % paired
        if not os.path.isfile(circles):
            import get_readnames_from_DCC as get_readnames
            names = get_readnames.get_readnames_from_DCC(
                circle_ids, paired, mate1, mate2)
            names.run()
        else:
            log('\tskipping get_readnames_from_DCC because %s exists already\n'
                % (circles))

    # Step2 : extract circle reads from sample bam file
    log('\tfinished\n\n%s: starting to extract chimeric reads from bamfile\n' %
        (datetime.datetime.now()))
    if 'step2' not in skipped_steps:
        import extract_reads as extract_reads
        er = extract_reads.extract_reads(cutoff_reads, cutoff_mapq, circles,
                                         bamfile, outfolder, sample,
                                         tmp_folder, num_cpus)
        er.run()

    # Step3 : (optional) get information about possibly rolling circles
    log('\tfinished\n\n%s: starting to get mate pair information\n' %
        (datetime.datetime.now()))
    if 'step3' not in skipped_steps:

        if not os.path.isfile('%s/%s.mate_status.txt' % (outfolder, sample)):
            import get_mate_information as mateinformation
            mi = mateinformation.mate_information(platform, split_character,
                                                  bedfile, outfolder, sample,
                                                  tmp_folder, num_cpus)
            mi.run()
        else:
            log('\tskipping get_mate_information because %s/%s.mate_status.txt exists already\n'
                % (outfolder, sample))

    # Step4 : (optional) find exon skipping events
    log('\tfinished\n\n%s: starting to detect skipped exons\n' %
        (datetime.datetime.now()))
    if 'step4' not in skipped_steps:

        if not os.path.isfile('%s/%s.skipped_exons.bed' % (outfolder, sample)):
            import detect_skipped_exons as skipped_exons
            se = skipped_exons.detect_skipped_exons(outfolder, sample, bedfile,
                                                    tmp_folder, platform,
                                                    num_cpus)
            se.run()
        else:
            log('\tskipping detect_skipped_exons because %s/%s.skipped_exons.bed exists already\n'
                % (outfolder, sample))

    # Step5 : (optional) identify different circles within the same host gene
    log('\tfinished\n\n%s: starting to detect alternative splicing\n' %
        (datetime.datetime.now()))
    if 'step5' not in skipped_steps:

        if not os.path.isfile('%s/%s.alternative_splicing.txt' %
                              (outfolder, sample)):
            import detect_splicing_variants as splicing_variants
            sv = splicing_variants.detect_splicing_variants(
                split_character, platform, circles, bedfile, outfolder, sample,
                tmp_folder, num_cpus)
            sv.run()
        else:
            log('\tskipping detect_splicing_variants because %s/%s.alternative_splicing.txt exists already\n'
                % (outfolder, sample))

    # Step6 : (optional) generate coverage profile for each circle
    # (one transcript per gene, best if most fitting transcript)
    log('\tfinished\n\n%s: starting to generate coverage profiles\n' %
        (datetime.datetime.now()))
    if 'step6' not in skipped_steps:
        if not os.path.isfile('%s/%s.exon_counts.bed' %
                              (outfolder, sample)) and not os.path.isdir(
                                  '%s/%s.coverage_profiles/' %
                                  (outfolder, sample)):
            import get_coverage_profile as coverage_profile
            sv = coverage_profile.get_coverage_profile(exon_index,
                                                       split_character,
                                                       platform, bedfile,
                                                       outfolder, sample,
                                                       tmp_folder, num_cpus)
            sv.run()
        else:
            log('\tskipping get_coverage_profile because %s/%s.exon_counts.bed exists already\n'
                % (outfolder, sample))

    # Step7 : (optional, requires step 6) summarize the coverage profiles
    log('\tfinished\n\n%s: starting to summarize the coverage profiles\n' %
        (datetime.datetime.now()))
    if 'step7' not in skipped_steps:
        if not os.path.isfile(
                '%s/%s.coverage_profiles/coverage_profiles.all_circles.pdf' %
            (outfolder, sample)):
            if os.path.isdir('%s/%s.coverage_profiles/' % (outfolder, sample)):
                os.system(
                    'summarized_coverage_profiles.R %s/%s.coverage_profiles' %
                    (outfolder, sample))
            else:
                # The profiles folder is produced by step 6
                # (get_coverage_profile), not step 5.
                log('\tYou are trying cluster the coverage profiles without '
                    'generating coverage profiles first, please run step 6 (get_coverage_profile)\n'
                    )
        else:
            log('\tskipping summarized_coverage_profiles.R because %s/%s.coverage_profiles/coverage_profiles.all_circles.pdf already exists\n'
                % (outfolder, sample))

    # Step8 : (optional, requires step6) pictures for all circles
    log('\tfinished\n\n%s: starting to visualize the coverage profiles\n' %
        (datetime.datetime.now()))
    if 'step8' not in skipped_steps:
        if os.path.isdir('%s/%s.coverage_profiles/' % (outfolder, sample)):
            files = os.listdir('%s/%s.coverage_profiles' % (outfolder, sample))
            folders = os.listdir(outfolder)
            if '%s.coverage_pictures' % (sample) not in folders:
                os.mkdir('%s/%s.coverage_pictures' % (outfolder, sample))

            def run_r_parallel(f):
                # one R invocation per coverage profile; other files in the
                # folder are ignored
                if f.endswith('.txt'):
                    os.system(
                        'make_coverage_picture.R %s/%s.coverage_profiles/%s %s/%s.coverage_pictures/'
                        % (outfolder, sample, f, outfolder, sample))

            from pathos.multiprocessing import ProcessingPool as Pool

            pool = Pool(num_cpus)
            pool.map(run_r_parallel, files)

        else:
            # The profiles folder is produced by step 6
            # (get_coverage_profile), not step 5.
            log('\tYou are trying to generate coverage pictures '
                'without generating coverage profiles, please run step 6 (get_coverage_profile)\n'
                )

    log('\tfinished\n\n\nFUCHS finished at %s\n\n' %
        (datetime.datetime.now()))
    log("FUCHS took --- %s minutes ---\n\n" % (round(
        (time.time() - start_time) / 60.0)))
def extract_marker_gene_reads(bams,out_prefix,cores):
    """Extract reads from ``bams`` with a bounded number of workers.

    The worker count passed on to ``extract_reads.extract_reads`` is the
    smallest of the number of BAM files, the requested ``cores`` and the
    value reported by ``cpu_count()``.
    """
    worker_limits = (len(bams), cores, cpu_count())
    n_workers = min(worker_limits)
    extract_reads.extract_reads(
        bamlist=bams,
        prefix=out_prefix,
        cores=n_workers,
    )
예제 #8
0
def extract_sample_gene_read(params):
    """Run single-core read extraction for one ``(prefix, bam)`` pair.

    ``params`` must unpack into exactly two items: the output prefix for the
    sample and the BAM file to read.  Taking one packed argument presumably
    lets the function be used with a pool ``map`` -- confirm against callers.

    Always returns ``None``.
    """
    sample_prefix, bam_path = params
    extract_reads.extract_reads(cores=1,
                                prefix=sample_prefix,
                                bamlist=[bam_path])