def __init__(self, orientation, bed, job_name, out_sh, submit, directory):
    """Queue a ``count_tags.py`` run for every ``*sorted.bam`` in *directory*.

    Writes a PBS array-job script via Submitter and optionally submits it.
    Each BAM produces a ``<bam>.count`` output file.
    """
    commands = [
        'count_tags.py --annotation_file {} -f {} -b {} -o {}.count'.format(
            bed, orientation, bam, bam)
        for bam in glob('{}/*sorted.bam'.format(directory))
    ]
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=16, queue='home',
                          walltime='1:00:00', array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh=None, queue_type='PBS', directory='./', submit=True):
    """Convert each ``*sam`` file in *directory* to BAM, keeping MAPQ >= 10 reads.

    Builds one ``samtools view`` command per SAM and hands the list to
    Submitter as an array job.
    """
    commands = []
    for sam in glob('{}/*sam'.format(directory)):
        commands.append(
            'samtools view -bS -q 10 {} > {}.bam'.format(sam, sam))
    submitter = Submitter(queue_type=queue_type, sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=1, queue='home',
                          walltime='1:00:00', array=True, max_running=20)
    submitter.job(submit=submit)
def __init__(self, gtf, job_name, out_sh, submit, directory):
    """Run GTF-guided Cufflinks on every ``*.sorted.bam`` in *directory*."""
    cufflinks_commands = [
        'cufflinks --GTF {0} --GTF-guide --multi-read-correct '
        '--num-threads 8 {1}'.format(gtf, bam)
        for bam in iglob('{}/*.sorted.bam'.format(directory.rstrip('/')))
    ]
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=cufflinks_commands, job_name=job_name,
                          nodes=1, ppn=8, walltime='0:30:00',
                          array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh=None, directory='./', submit=True):
    """Index every ``*sorted.bam`` in *directory* with ``samtools index``.

    Parameters
    ----------
    job_name : str
        Name for the cluster job.
    out_sh : str or None
        Filename of the shell script to write.
    directory : str
        Directory searched (non-recursively) for sorted BAM files.
    submit : bool
        If True, submit the job after writing the script.
    """
    commands = []
    # Renamed loop variable: 'file' shadowed the builtin of the same name.
    for bam in glob('{}/*sorted.bam'.format(directory)):
        commands.append('samtools index {0}'.format(bam))
    sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                    commands=commands, job_name=job_name,
                    nodes=1, ppn=1, queue='home', array=True,
                    max_running=20, walltime='0:30:00')
    sub.job(submit=submit)
def __init__(self, job_name, out_sh=None, directory='./', submit=True):
    """Extract unspliced reads from every ``*bam`` in *directory*.

    Keeps mapped reads whose CIGAR has no 'N' (plus header lines) and
    writes them to ``<bam>.unspliced.bam``.
    """
    unspliced_cmd = ("samtools view -h -F 4 {0} | awk '$6 !~ /N/ || $1 ~ /@/' "
                     "| samtools view -bS - > {0}.unspliced.bam")
    commands = [unspliced_cmd.format(bam)
                for bam in glob('{}/*bam'.format(directory.rstrip('/')))]
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=16, queue='home', array=True,
                          walltime='0:30:00', max_running=10)
    submitter.job(submit=submit)
def __init__(self, job_name, out_sh, directory, submit):
    """Sort every ``*bam`` in *directory* with ``samtools sort``.

    Output files are named ``<bam>.sorted``.
    NOTE(review): ``-m 50000000000`` is per-thread memory; with ``-@ 8``
    that requests ~400 GB total — confirm the target nodes can honor this.
    """
    commands = []
    for bam in glob('{}/*bam'.format(directory)):
        commands.append(
            'samtools sort -@ 8 -m 50000000000 {0} {0}.sorted'.format(bam))
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=8, queue='home', array=True,
                          max_running=10, walltime='0:30:00')
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh, submit, directory):
    """Compute RPKM for every ``*count`` file in *directory*.

    Each count file produces a ``<count>.rpkm`` output via single_RPKM.py.
    """
    commands = [
        'single_RPKM.py -i {} -o {}.rpkm'.format(counts, counts)
        for counts in glob('{}/*count'.format(directory.rstrip('/')))
    ]
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=1, queue='home',
                          walltime='1:00:00', array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, orientation, bed, job_name, out_sh, submit, directory):
    """Count tags over *bed* annotations for each ``*sorted.bam`` in *directory*.

    Writes a PBS array job (one ``count_tags.py`` invocation per BAM,
    output ``<bam>.count``) and optionally submits it.
    """
    count_cmd = 'count_tags.py --annotation_file {} -f {} -b {} -o {}.count'
    commands = []
    for sorted_bam in glob('{}/*sorted.bam'.format(directory)):
        commands.append(count_cmd.format(bed, orientation,
                                         sorted_bam, sorted_bam))
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=16, queue='home',
                          walltime='1:00:00', array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh=None, directory='./', submit=True):
    """Count repeat-element alignments with bowtie for every FASTQ in *directory*.

    Handles both uncompressed ``*fastq`` and gzipped ``*gz`` inputs; each
    input produces ``<input>.repeat_counts`` and unmapped reads in
    ``<input>.norep``.
    """
    # Command templates are kept verbatim from the original pipeline.
    fastq_template = ('bowtie \ -c \ -S \ -q \ -p 16 \ -e 100 \ -l 20 \ --un {0}.norep \ all_ref \ {0} \ | grep -v \"@\" \ | perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \ > {0}.repeat_counts')
    gz_template = ('gunzip -c {0} \ |bowtie \ -c \ -S \ -q \ -p 16 \ -e 100 \ -l 20 \ --un {0}.norep \ all_ref \ - \ | grep -v \"@\" \ | perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \ > {0}.repeat_counts')
    commands = []
    for fastq in glob('{}/*fastq'.format(directory)):
        commands.append(fastq_template.format(fastq))
    for gz in glob('{}/*gz'.format(directory)):
        commands.append(gz_template.format(gz))
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=16, walltime='2:30:00',
                          array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh=None, directory='./', submit=True):
    """Align reads to the repeat index with bowtie and tally aligned counts.

    Plain ``*fastq`` files are fed to bowtie directly; ``*gz`` files are
    streamed through ``gunzip -c`` first. Each input yields
    ``<input>.repeat_counts``; unaligned reads go to ``<input>.norep``.
    """
    # (pattern, command-template) pairs; templates are verbatim originals.
    recipes = (
        ('{}/*fastq'.format(directory),
         'bowtie \ -c \ -S \ -q \ -p 16 \ -e 100 \ -l 20 \ --un {0}.norep \ all_ref \ {0} \ | grep -v \"@\" \ | perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \ > {0}.repeat_counts'),
        ('{}/*gz'.format(directory),
         'gunzip -c {0} \ |bowtie \ -c \ -S \ -q \ -p 16 \ -e 100 \ -l 20 \ --un {0}.norep \ all_ref \ - \ | grep -v \"@\" \ | perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \ > {0}.repeat_counts'),
    )
    commands = []
    for pattern, template in recipes:
        for path in glob(pattern):
            commands.append(template.format(path))
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=16, walltime='2:30:00',
                          array=True, max_running=20)
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh, submit=False, directory='./'):
    """Quality-filter every ``*.fastq.gz`` in *directory* into ``filtered/``.

    Pipeline per file: artifact filter -> quality trim (min length 20,
    quality 30) -> quality filter (q30 in >=90% of bases), gzipped output.
    NOTE(review): output path is ``filtered/<input path>`` relative to the
    working directory — appears to assume cwd == *directory*; confirm.
    """
    # Create the output directory; an existing one is fine.
    try:
        os.mkdir('{}/filtered/'.format(directory.rstrip('/')))
    except OSError:
        pass

    # TODO: the -l argument "20" should be a % of read length
    filter_cmd = ('echo {0}; zcat {0} | fastx_artifacts_filter | '
                  'fastq_quality_trimmer -l 20 -t 30 | '
                  'fastq_quality_filter -q 30 -p 90 -z '
                  '> filtered/{0}')
    commands = [filter_cmd.format(fastq_gz) for fastq_gz in
                iglob('{}/*.fastq.gz'.format(directory.rstrip('/')))]
    submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=2, queue='home', array=True,
                          max_running=20, walltime='1:00:00')
    submitter.write_sh(submit=submit)
def __init__(self, job_name, out_sh=None, queue_type='PBS', directory='./', submit=True):
    """Turn each ``*sam`` in *directory* into a MAPQ>=10-filtered BAM.

    One ``samtools view -bS -q 10`` command per SAM file, submitted as an
    array job through Submitter.
    """
    view_cmd = 'samtools view -bS -q 10 {} > {}.bam'
    commands = [view_cmd.format(sam, sam)
                for sam in glob('{}/*sam'.format(directory))]
    submitter = Submitter(queue_type=queue_type, sh_filename=out_sh,
                          commands=commands, job_name=job_name,
                          nodes=1, ppn=1, queue='home',
                          walltime='1:00:00', array=True, max_running=20)
    submitter.job(submit=submit)
def __init__(self, genome, out_dir='./', directory='./', submit=True, ppn=8, job_name='STAR', out_sh='STAR.sh', walltime='0:30:00', outReadsUnmapped='Fastx', outFilterMismatchNmax=5, outFilterMismatchNoverLmax=0.3, outFilterMultimapNmax=5, outFilterScoreMin=10, outFilterType='BySJout', outSAMattributes='All', outSAMstrandField='intronMotif', clip5pNbases=0, clip3pNbases=0, additional_STAR_args='', extension='.gz'): """Read the fastq files in a directory, assuming that the first 2 underscore-separated parts of a filename are the unique sample ID, then running STAR. Most of these arguments are the defaults in STAR, except: outReadsUnmapped : str 'Fastx' instead of 'None' so the unmapped reads can be remapped to the spikein genomes, for example outFilterMismatchNmax : int 5 instead of 10 outFilterMultimapNmax : int 5 instead of 10 outFilterType : str 'BySJout' instead of 'None', so that all junction reads pass our stringent filter of at least 4bp overhang for annotated and at least 8bp overhang for unannotated outSAMattributes : str 'All' instead of 'None' for more information just in case outSAMstrandField : str 'intronMotif' instead of 'None' for compatibility with Cufflinks """ commands = [] # Make the directory try: os.mkdir(out_dir) except OSError: # It's already there, don't do anything pass # Set of unique sample ids for checking if we've read them all sample_ids = set([]) for read1 in iglob('{}/*R1*{}'.format(directory.rstrip('/'), extension)): # if read1.endswith('gz'): # compressed = True # else: # compressed = False # readFilesCommand = 'zcat' if compressed else 'cat' # Remove trailing "A" and "B" so they get merged sample_id = '_'.join(os.path.basename(read1).split('.')[0].split( '_')[:2]).rstrip( 'ABCDEFGH') if sample_id in sample_ids: continue paired = os.path.isfile(read1.replace('R1', 'R2')) print sample_id, 'paired', paired read1 = ','.join(glob('{}*R1*{}'.format(sample_id, extension))) read2 = read1.replace('R1', 'R2') if paired else "" print 'R1', 
read1 print 'R2', read2 sample_ids.add(sample_id) # print sample_id commands.append('''STAR \ --runMode alignReads \ --runThreadN {0} \ --genomeDir {1} \ --genomeLoad LoadAndRemove \ --readFilesCommand zcat \ --readFilesIn {2} {3} \ --outFileNamePrefix {4}/{5}. \ --outReadsUnmapped {6} \ --outFilterMismatchNmax {7} \ --outFilterMismatchNoverLmax {8} \ --outFilterMultimapNmax {9} \ --outFilterScoreMin {10} \ --outFilterType {11} \ --outSAMattributes {12} \ --outSAMstrandField {13} \ --clip5pNbases {14} \ --clip3pNbases {15} \ {16}'''.format(ppn, genome, read1, read2, out_dir.rstrip('/'), sample_id, outReadsUnmapped, outFilterMismatchNmax, outFilterMismatchNoverLmax, outFilterMultimapNmax, outFilterScoreMin, outFilterType, outSAMattributes, outSAMstrandField, clip5pNbases, clip3pNbases, additional_STAR_args)) sub = Submitter(queue_type='PBS', sh_filename=out_sh, commands=commands, job_name=job_name, nodes=1, ppn=ppn, array=True, max_running=20, queue='home', walltime=walltime) sub.write_sh(submit=submit)
def __init__(self, genome, out_dir='./', directory='./', submit=True, ppn=8, job_name='STAR', out_sh='STAR.sh', walltime='0:30:00', outReadsUnmapped='Fastx', outFilterMismatchNmax=5, outFilterMismatchNoverLmax=0.3, outFilterMultimapNmax=5, outFilterScoreMin=10, outFilterType='BySJout', outSAMattributes='All', outSAMstrandField='intronMotif', clip5pNbases=0, clip3pNbases=0, additional_STAR_args='', extension='.gz'): """Read the fastq files in a directory, assuming that the first 2 underscore-separated parts of a filename are the unique sample ID, then running STAR. Most of these arguments are the defaults in STAR, except: outReadsUnmapped : str 'Fastx' instead of 'None' so the unmapped reads can be remapped to the spikein genomes, for example outFilterMismatchNmax : int 5 instead of 10 outFilterMultimapNmax : int 5 instead of 10 outFilterType : str 'BySJout' instead of 'None', so that all junction reads pass our stringent filter of at least 4bp overhang for annotated and at least 8bp overhang for unannotated outSAMattributes : str 'All' instead of 'None' for more information just in case outSAMstrandField : str 'intronMotif' instead of 'None' for compatibility with Cufflinks """ commands = [] # Make the directory try: os.mkdir(out_dir) except OSError: # It's already there, don't do anything pass # Set of unique sample ids for checking if we've read them all sample_ids = set([]) for read1 in iglob('{}/*R1*{}'.format(directory.rstrip('/'), extension)): # if read1.endswith('gz'): # compressed = True # else: # compressed = False # readFilesCommand = 'zcat' if compressed else 'cat' # Remove trailing "A" and "B" so they get merged sample_id = '_'.join( os.path.basename(read1).split('.')[0].split('_')[:2]).rstrip( 'ABCDEFGH') if sample_id in sample_ids: continue paired = os.path.isfile(read1.replace('R1', 'R2')) print sample_id, 'paired', paired read1 = ','.join(glob('{}*R1*{}'.format(sample_id, extension))) read2 = read1.replace('R1', 'R2') if paired else "" print 'R1', 
read1 print 'R2', read2 sample_ids.add(sample_id) # print sample_id commands.append('''STAR \ --runMode alignReads \ --runThreadN {0} \ --genomeDir {1} \ --genomeLoad LoadAndRemove \ --readFilesCommand zcat \ --readFilesIn {2} {3} \ --outFileNamePrefix {4}/{5}. \ --outReadsUnmapped {6} \ --outFilterMismatchNmax {7} \ --outFilterMismatchNoverLmax {8} \ --outFilterMultimapNmax {9} \ --outFilterScoreMin {10} \ --outFilterType {11} \ --outSAMattributes {12} \ --outSAMstrandField {13} \ --clip5pNbases {14} \ --clip3pNbases {15} \ {16}'''.format(ppn, genome, read1, read2, out_dir.rstrip('/'), sample_id, outReadsUnmapped, outFilterMismatchNmax, outFilterMismatchNoverLmax, outFilterMultimapNmax, outFilterScoreMin, outFilterType, outSAMattributes, outSAMstrandField, clip5pNbases, clip3pNbases, additional_STAR_args)) sub = Submitter(queue_type='PBS', sh_filename=out_sh, commands=commands, job_name=job_name, nodes=1, ppn=ppn, array=True, max_running=20, queue='home', walltime=walltime) sub.write_sh(submit=submit)