def cat_counter_references(counter_references=None, target_dir=curdir,
                           path_to_bowtie2='bowtie2', logger=None, **kwargs):
    if counter_references is None:
        return
    try:
        makedirs(target_dir, mode=0755)
    except OSError:
        pass
    debug('Validating counter-references and building counter-reference index')
    valid_references = validate_references(references=counter_references,
                                           target_dir=target_dir,
                                           path_to_bowtie2=path_to_bowtie2,
                                           logger=logger,
                                           environ_key='SOT_DEFAULT_COUNTER_REFERENCES')
    # concatenate the FASTA records of every valid counter-reference index
    crefs_fa_name = join(target_dir, 'counter_references.fa')
    crefs_fa = open(crefs_fa_name, 'w')
    for ref in valid_references:
        Popen([path_to_bowtie2 + '-inspect', ref], stdout=crefs_fa).wait()
    crefs_fa.close()
    crefs_index = join(target_dir, 'counter_references')
    # bowtie2-build expects the FASTA filename, not the open file object
    args = [path_to_bowtie2 + '-build', crefs_fa_name, crefs_index]
    P = Popen(args, stderr=PIPE)
    stderr = P.communicate()[1]
    if stderr.startswith('Error'):
        critical(stderr)
        critical('No counter-references will be used.')
        return None
    return crefs_index
def validate_references(references=None, path_to_bowtie2='bowtie2', logger=None,
                        environ_key='SOT_DEFAULT_REFERENCES', target_dir=curdir,
                        **kwargs):
    # the target directory may already exist
    try:
        makedirs(target_dir, mode=0755)
    except OSError:
        pass
    debug('Validating references')
    new_references = []
    if references is None:
        if environ_key in environ:
            references = environ[environ_key].split()
        else:
            critical('no reference genomes specified')
            return []
    for r in references:
        bowtie2_index = find_bowtie2_index(r, path_to_bowtie2=path_to_bowtie2)
        if bowtie2_index is None:
            if exists(r):
                debug('Attempting to build bowtie2 index from %s' % r)
                new_index = fasta_to_bowtie2(r, target_dir=target_dir,
                                             path_to_bowtie2=path_to_bowtie2)
                if new_index is not None:
                    new_references.append(new_index)
                    continue
                else:
                    critical('Failed to build bowtie2 index.')
            critical('bowtie2 could not find the index for %s', r)
            critical('we will not align to %s', r)
        else:
            new_references.append(bowtie2_index)
    return new_references
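# Hedged usage sketch (not part of the original module): validate_references
# above falls back to the SOT_DEFAULT_REFERENCES environment variable when no
# references are passed; the genome names below are hypothetical examples.
def _example_validate_from_environment():
    from os import environ
    environ.setdefault('SOT_DEFAULT_REFERENCES', 'hg19 mm9')  # assumed values
    # equivalent to exporting the variable in the shell before running the tool
    return validate_references(references=None)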
def find_bowtie_index(r, path_to_bowtie='bowtie'):
    """check for a bowtie index as given.
    return the index basename if found, else return None
    """
    args = [path_to_bowtie + '-inspect', '-v', '-s', r]
    debug(' '.join(args))
    P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE, cwd=mkdtemp())
    stderr = P.communicate()[1].splitlines()
    if not stderr[0].startswith('Could not locate'):
        for line in stderr:
            if line.startswith('Opening'):
                index_ebwt1 = line[(1 + line.find('"')):line.rfind('"')]
                index_basename = index_ebwt1[0:index_ebwt1.find('.1.ebwt')]
                return index_basename
    # try again relative to the current working directory
    rprime = join(getcwd(), r)
    args = [path_to_bowtie + '-inspect', '-v', '-s', rprime]
    debug(' '.join(args))
    P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE, cwd=mkdtemp())
    stderr = P.communicate()[1].splitlines()
    if not stderr[0].startswith('Could not locate'):
        for line in stderr:
            if line.startswith('Opening'):
                index_ebwt1 = line[(1 + line.find('"')):line.rfind('"')]
                index_basename = index_ebwt1[0:index_ebwt1.find('.1.ebwt')]
                return index_basename
    return None
def find_bowtie2_index(r, path_to_bowtie2='bowtie2'):
    """check for a bowtie2 index as given.
    return the index basename if found, else return None
    """
    args = [path_to_bowtie2 + '-inspect', '-v', '-s', r]
    debug(' '.join(args))
    P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE, cwd=mkdtemp())
    stderr = P.communicate()[1].splitlines()
    if not stderr[0].startswith('Could not locate'):
        for line in stderr:
            if line.startswith('Opening'):
                index_bt2 = line[(1 + line.find('"')):line.rfind('"')]
                index_basename = index_bt2[0:index_bt2.find('.1.bt2')]
                return index_basename
    # also look next to the bowtie2 executable and in its indexes/ directory
    for d in [getcwd(), os.path.split(path_to_bowtie2)[0],
              join(os.path.split(path_to_bowtie2)[0], 'indexes')]:
        rprime = join(d, r)
        args = [path_to_bowtie2 + '-inspect', '-v', '-s', rprime]
        debug(' '.join(args))
        P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE, cwd=mkdtemp())
        stderr = P.communicate()[1].splitlines()
        if not stderr[0].startswith('Could not locate'):
            for line in stderr:
                if line.startswith('Opening'):
                    index_bt2 = line[(1 + line.find('"')):line.rfind('"')]
                    index_basename = index_bt2[0:index_bt2.find('.1.bt2')]
                    return index_basename
    return None
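# Hedged sketch (illustrative only): how the two finders above recover an
# index basename from bowtie2-inspect's stderr; the quoted path is a made-up
# example of the "Opening ..." line printed while an index is being read.
def _example_parse_inspect_line(line='Opening "/data/indexes/hg19.1.bt2" for reading'):
    index_bt2 = line[(1 + line.find('"')):line.rfind('"')]   # /data/indexes/hg19.1.bt2
    return index_bt2[0:index_bt2.find('.1.bt2')]              # /data/indexes/hg19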
def remove_reads(parsed_filename, remove_all=False, sam_out=False,
                 debug=False, **kwargs):
    '''
    note: you must be looking at a sorted file, or this won't work
    '''
    if sam_out:
        write_opts = 'w'
    else:
        write_opts = 'wb'
    with pysam.Samfile(parsed_filename.mapped_file) as mapped:
        if remove_all:
            reads_to_remove = set([read.qname for read in mapped])
        else:
            reads_to_remove = set([read.qname for read in mapped
                                   if is_mapped(read)])
    if debug:
        scripter.debug('Found {!s} reads in {!s}'.format(
                       len(reads_to_remove), parsed_filename.mapped_file))
    with pysam.Samfile(parsed_filename.input_file) as bam_file:
        with pysam.Samfile(parsed_filename.output_file, write_opts,
                           template=bam_file) as out_bam_file:
            for read in bam_file:
                if read.qname not in reads_to_remove:
                    out_bam_file.write(read)
    return
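# Hedged sketch: is_mapped() is not defined in this section; a minimal
# stand-in, assuming it simply inverts pysam's is_unmapped flag on a read.
def _example_is_mapped(read):
    return not read.is_unmapped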
def __init__(self, filename, sam_out=False, *args, **kwargs):
    super(SubtractBamFilenameParser, self).__init__(filename, sam_out=sam_out,
                                                    *args, **kwargs)
    # lstrip, not rstrip: splitext returns the extension with a leading dot
    fext = os.path.splitext(filename)[1].lstrip(os.extsep)
    if not (fext == 'sam' or fext == 'bam'):
        raise InvalidFileException
    if not self.is_dummy_file:
        # check for the mapped_file
        input_dir_parts = self.input_dir.split(os.sep)
        glob_path = ['mapped', input_dir_parts[0], '*'] + \
                    input_dir_parts[2:] + \
                    [os.path.basename(self.input_file)]
        potential_filenames = glob.glob(os.sep.join(glob_path))
        if len(potential_filenames) == 1:
            self.mapped_file = potential_filenames[0]
        elif len(potential_filenames) == 0:
            raise scripter.Usage('Could not find mapped file')
        else:
            raise scripter.Usage('Ambiguous mapped file', *potential_filenames)
        scripter.debug('Mapped file will be', self.mapped_file)
        if sam_out:
            self.output_file = os.sep.join([self.output_dir,
                                            self.with_extension('sam')])
        else:
            self.output_file = os.sep.join([self.output_dir,
                                            self.with_extension('bam')])
        scripter.debug('Output file will be', self.output_file)
def __init__(self, filename, *args, **kwargs):
    super(BowtieFilenameParser, self).__init__(filename, *args, **kwargs)
    open_func, format = discover_file_format(filename)
    # self.split_file = False
    self.format = format
    self.open_func = open_func
    self.second_file = None
    if format == 'SAM' or format == 'BAM':
        self.use_pysam = True
        # try to open the file so we're sure it works
        f = pysam.Samfile(filename)
        aread = f.next()
        self.paired_end = aread.is_paired
        del f, aread
        self.fastq_source = 'Unknown'
    elif format == 'FASTQ':
        self.use_pysam = False
        self.check_paired_end()
        if len(self.protoname.split('.')) > 6:
            self.fastq_source = self.protoname.split('.')[6]
        else:
            self.fastq_source = 'Unknown'
    else:
        if self.second_file is None:
            scripter.debug('Skipping file %s with dubious format', filename)
            raise scripter.InvalidFileException
        else:
            scripter.debug('Skipping files %s, %s with dubious format',
                           filename, self.second_file)
            raise scripter.InvalidFileException
def fasta_to_bowtie(fasta_file, target_dir=curdir, path_to_bowtie='bowtie'):
    """given a filename, makes a bowtie index if that file is a FASTA file
    returns the index basename, or None on failure
    """
    if exists(fasta_file):
        f = open(fasta_file, 'rU')
        for line in f:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                args = [path_to_bowtie + '-build', fasta_file,
                        join(target_dir, fasta_file)]
                debug(' '.join(args))
                P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE)
                stderr = P.communicate()[1]
                if len(stderr.splitlines()) == 0:
                    return join(getcwd(), target_dir, fasta_file)
                elif stderr.splitlines()[0].startswith('Error'):
                    return None
                else:
                    return join(getcwd(), target_dir, fasta_file)
    return None
def fasta_to_bwa(fasta_file, target_dir=curdir, path_to_bwa='bwa', **kwargs):
    """given a filename, makes a bwa index if that file is a FASTA file
    returns the FASTA filename on success, or None on failure
    """
    # target_dir is accepted for interface compatibility with the other index
    # builders; bwa index writes its files alongside the FASTA file
    if exists(fasta_file):
        f = open(fasta_file, 'rU')
        for line in f:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                args = [path_to_bwa, 'index', fasta_file]
                debug(' '.join(args))
                P = Popen(args, stdout=open(devnull, 'w'), stderr=PIPE)
                stderr = P.communicate()[1]
                if len(stderr.splitlines()) == 0:
                    return fasta_file
                elif stderr.splitlines()[0].startswith('Error'):
                    return None
                else:
                    return fasta_file
    return None
def validate_references(references=None, path_to_bwa='bwa', logger=None,
                        environ_key='SOT_DEFAULT_REFERENCES', target_dir=curdir,
                        **kwargs):
    ## Make the output directory, complain if we fail
    #if os.path.exists(target_dir):
    #    debug('Output directory %s already exists', target_dir)
    #else:
    #    debug('Creating directory "%s"', target_dir)
    #    makedirs(target_dir, mode=0755)
    #    if not os.path.exists(target_dir):
    #        raise IOError('Could not create directory %s' % target_dir)
    debug('Validating references')
    new_references = []
    if references is None:
        if environ_key in environ:
            references = environ[environ_key].split()
        else:
            critical('no reference genomes specified')
            return []
    for r in references:
        if exists(r):
            if not all(map(exists, [r + '.amb', r + '.ann', r + '.bwt',
                                    r + '.pac', r + '.sa'])):
                info('Attempting to build bwa index from %s' % r)
                new_index = fasta_to_bwa(r, target_dir=target_dir,
                                         path_to_bwa=path_to_bwa)
                if new_index is not None:
                    new_references.append(new_index)
                    continue
                else:
                    critical('Failed to build bwa index.')
            else:
                debug('Found bwa index for %s' % r)
                new_references.append(r)
        else:
            critical('bwa could not find the reference %s', r)
            critical('we will not align to %s', r)
    return new_references
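# Hedged sketch (illustrative only): the completeness test used above, factored
# into a helper; bwa index produces the .amb/.ann/.bwt/.pac/.sa files checked here.
def _example_has_bwa_index(r):
    return all(exists(r + ext) for ext in ('.amb', '.ann', '.bwt', '.pac', '.sa'))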
def __init__(self, filename, include_width_in_name=False, target=None,
             motif_file='unknown_motif', genome=None, *args, **kwargs):
    fext = splitext(filename)[1].lstrip(extsep)
    if fext == 'bed':
        self.is_bed = True
        self.is_xls = False
    elif fext == 'xls':
        self.is_bed = False
        self.is_xls = True
    else:
        raise InvalidFileException
    motif_name = sub(r'\W', '_', abspath(motif_file))
    target = target + sep + motif_name
    super(PeaksFilenameParser, self).__init__(filename, target=target,
                                              *args, **kwargs)
    self.fasta_file = None
    for file_extension in ['fa', 'fasta', 'FA', 'FASTA']:
        fasta_file = join(self.input_dir,
                          extsep.join([self.protoname, file_extension]))
        debug("Trying", fasta_file)
        if exists(fasta_file):
            self.fasta_file = fasta_file
            debug("Using", fasta_file)
            break
    if self.fasta_file is None:
        warning('Could not find the FASTA file for %s', self.input_file)
        if genome is None:
            raise Usage("Could not find the FASTA file for ", self.input_file,
                        " and no genome was specified")
        else:
            t = try_to_find_genome(genome)
            if t is None:
                raise Usage("Could not find the FASTA file for ",
                            self.input_file,
                            " and failed to use %s" % genome)
            else:
                fasta_file = join(self.input_dir, '%s.fa' % self.protoname)
                debug('Creating FASTA file %s for %s using %s',
                      fasta_file, self.input_file, genome)
                input_fhd = open(self.input_file, 'rU')
                fasta_fhd = open(fasta_file, 'w')
                twobit_reader(t, input_stream=input_fhd, write=fasta_fhd.write)
                fasta_fhd.close()
                self.fasta_file = fasta_file
def __init__(self, filename, controls={}, *args, **kwargs):
    if not os.path.splitext(filename)[1] == '.bam':
        raise scripter.InvalidFileException(filename)
    super(BAMFilenameParser, self).__init__(filename, *args, **kwargs)
    sample = self.protoname
    control_files = [v[1] for v in controls.values()]
    # check controls: look up by sample name first, then by full input path
    if controls.has_key(sample):
        sample_name, control = controls[sample]
        scripter.debug('%s has control %s', sample, control)
        if control is None:
            self.control_file = None
        else:
            self.control_file = os.path.join(self.input_dir, control + '.bam')
    elif controls.has_key(self.input_file):
        sample_name, control = controls[self.input_file]
        scripter.debug('%s has control %s', self.input_file, control)
        if control is None:
            self.control_file = None
        else:
            self.control_file = control
    elif sample in control_files or self.input_file in control_files:
        scripter.debug('%s is a control, aborting', sample)
        raise scripter.InvalidFileException
    else:
        scripter.debug('%s has no control indicated, continuing anyway', sample)
        # not in setup.txt, make an entry in controls
        self.control_file = None
        sample_name = sample
        controls[sample] = (sample, None)
    self.sample_name = sample_name
    self.output_dir = os.path.join(self.output_dir, sample_name)
def __init__(self, filename, verbose=False, *args, **kwargs):
    super(BarcodeFilenameParser, self).__init__(filename, *args, **kwargs)
    protoname = self.protoname
    # check for old-style names ending in .all
    if os.path.splitext(protoname)[1] == os.extsep + 'all':
        protoname = protoname[0:-4]
    # check if this is a paired-end file
    # if so, grab its partner
    input_file = self.input_file
    illumina_name = os.path.basename(input_file)
    # try new style first
    new_info = get_new_pair_info(illumina_name)
    if new_info is not None:
        scripter.debug('NOTICE: Detected new-style paired read file.')
        read = new_info[0]
        if read == 'R2':
            scripter.debug('This is the second file, ignoring it.')
            raise scripter.InvalidFileException(input_file)
        elif read == 'R1':
            second_file = os.path.join(self.input_dir, new_info[1])
            try:
                scripter.assert_path(second_file)
                scripter.debug('Found %s', second_file)
                self.second_file = second_file
                self.protoname2 = os.path.splitext(
                                      os.path.basename(second_file))[0]
                paired_end = True
            except IOError:
                scripter.debug('Failed to find paired end file')
                paired_end = False
        else:
            scripter.debug('Failed to find paired end')
            paired_end = False
    elif illumina_name.count('_') >= 3:
        scripter.debug('NOTICE: Detected paired read file.')
        iln_parts = illumina_name.split('_')
        if iln_parts[2] == '1':
            scripter.debug('Attempting to find second file.')
            second_file = os.sep.join([self.input_dir,
                                       '_'.join(iln_parts[0:2] + ['2'] +
                                                iln_parts[3:])])
            try:
                scripter.assert_path(second_file)
                scripter.debug('Found %s', second_file)
                self.second_file = second_file
                self.protoname2 = os.path.splitext(
                                      os.path.basename(second_file))[0]
                paired_end = True
            except IOError:
                scripter.debug('Failed to find paired end file')
                paired_end = False
        elif iln_parts[2] == '2':
            scripter.debug('This is the second file, ignoring it.')
            raise scripter.InvalidFileException(input_file)
        else:
            scripter.debug('Failed to find paired end')
            paired_end = False
    else:
        paired_end = False
    self.paired_end = paired_end
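# Hedged sketch: how the old-style branch above derives the mate's filename by
# swapping the third underscore-separated field; the filename is a made-up example.
def _example_old_style_mate(illumina_name='s_7_1_sequence.txt'):
    iln_parts = illumina_name.split('_')
    return '_'.join(iln_parts[0:2] + ['2'] + iln_parts[3:])  # -> 's_7_2_sequence.txt'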
def check_paired_end(self):
    # check if this is a paired-end file
    # if so, grab its partner
    seqfile_name = os.path.basename(self.input_file)
    pair_info = get_pair_info(seqfile_name)
    if pair_info is None:
        pair_info = get_new_pair_info(seqfile_name)
        # if pair_info is not None: self.split_file = True
    if pair_info is not None:
        pair_index = pair_info[0]
        second_name = pair_info[1]
        new_name = pair_info[2]
        scripter.debug('NOTICE: Detected paired read file.')
        if pair_index == '1':
            scripter.debug('Attempting to find second file.')
            self.second_file = os.sep.join([self.input_dir, second_name])
            self.protoname = os.path.splitext(new_name)[0]
            scripter.debug('Found %s', self.second_file)
            try:
                scripter.assert_path(self.second_file)
                self.paired_end = True
            except IOError:
                scripter.debug('Failed to find paired end file')
                self.paired_end = False
        elif pair_index == '2':
            scripter.debug('This is the second file, ignoring it.')
            raise scripter.InvalidFileException
        else:
            scripter.debug('Failed to find paired end')
            self.paired_end = False
    else:
        scripter.debug('This file contains single-end reads.')
        self.paired_end = False
def main():
    """ runs the main checkmyclones script """
    e = scripter.Environment(doc=__doc__, version=VERSION, handle_files=False)
    parser = e.argument_parser
    parser.add_argument('--path-to-gbdb', default='/gbdb',
                        help='Location of "gbdb" or 2bit files. If gbdb is not '
                             'in /gbdb or C:\\gbdb, specify the path here')
    ggroup = parser.add_mutually_exclusive_group()
    ggroup.add_argument('--genome',
                        help='Use 2bit file foo as reference genome '
                             '(Looks also for {path-to-gbdb}/foo/foo.2bit)')
    # the shortcuts need dest='genome' so they actually set the genome option
    ggroup.add_argument('--hg18', dest='genome', const='hg18',
                        action='store_const', help='Shortcut for --genome hg18')
    ggroup.add_argument('--hg19', dest='genome', const='hg19',
                        action='store_const', help='Shortcut for --genome hg19')
    ggroup.add_argument('--mm9', dest='genome', const='mm9',
                        action='store_const', help='Shortcut for --genome mm9')
    parser.add_argument('--reverse-orientation', action='store_true',
                        help='Check only the reverse orientation')
    parser.add_argument('--both-orientations', action='store_true',
                        help='Check forward and reverse orientations')
    parser.add_argument('--clones', nargs='+',
                        help='list of files that contain clone sequences')
    parser.add_argument('--references', nargs='*',
                        help='list of files that contain reference sequences')
    parser.add_argument('--bed-reference',
                        help='Use the regions listed in the bed file as '
                             'reference sequences')
    parser.add_argument('--only-use-references', nargs='*',
                        help='Use only the regions with the following names')
    parser.set_defaults(**{'genome': 'hg19', 'logging_level': WARNING})
    args = parser.parse_args()
    context = vars(args)
    scripter.LOGGER.setLevel(context['logging_level'])

    clones = load_all_seqs(context['clones'], recursive=context['recursive'])
    ref_seqs = []
    if len(clones) == 0:
        raise Usage('Could not find any clone sequences')
    if context['references'] is None and context['bed_reference'] is None:
        raise Usage('No reference sequences specified')
    else:
        if context['bed_reference'] is not None:
            genome = find_2bit_file(context['genome'], context['path_to_gbdb'])
            print 'Fetching sequences from %s using %s' % \
                  (context['bed_reference'], genome)
            ref_seqs.extend(read_bed_file(context['bed_reference'],
                                          genome=genome))
        if context['references'] is not None:
            ref_seqs.extend(load_all_seqs(context['references'],
                                          recursive=context['recursive']))
    specified_references = context['only_use_references']
    if specified_references is not None:
        good_name = lambda ref: real_name(ref.Name) in specified_references
        ref_seqs = filter(good_name, ref_seqs)
    if len(ref_seqs) == 0:
        raise Usage('Could not find any reference sequences')

    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    debug('multiprocessing enabled')
    p = multiprocessing.Pool(processes=context['num_cpus'])
    debug('Initialized pool of %d workers', context['num_cpus'])

    results = []
    forward = not context['reverse_orientation']
    rc = context['reverse_orientation'] or context['both_orientations'] or False
    for ref in ref_seqs:
        print 'Loaded reference %s' % ref.Name
    for clone in clones:
        p.apply_async(announce_first, (clone,), context)
        for ref in ref_seqs:
            if forward:
                r = p.apply_async(compare_clone_to_ref, (clone, ref), context)
                results.append(r)
            if rc:
                r = p.apply_async(compare_clone_to_ref, (clone.rc(), ref),
                                  context)
                results.append(r)
    p.close()
    p.join()

    result_values = []
    for r in results:
        current_pickle = r.get()
        current_result = loads(current_pickle)
        if current_result is None:
            continue
        else:
            result_values.append(current_result)

    all_matches = []
    for clone_name, group in groupby(result_values, key=itemgetter(0)):
        alns = map(itemgetter(1), list(group))
        is_matched = lambda aln: not aln.is_truncated and not aln.has_gaps
        matches = filter(is_matched, alns)
        if len(matches) > 0:
            all_matches.extend(alns)
            continue
        elif len(alns) == 0:
            print 'No match for %s' % clone_name
            continue
        else:
            print_good_alns(alns)
    print_matched_alns(all_matches)
    return