import itertools
import os
import pickle
import shutil
import uuid

import custom_generators
from custom_generators import parse_fastq, parse_sam


def randomize_names_paired(filename1, filename2, working_dir):
    """
    Write out paired-end sequences with randomized names.

    Args:
        filename1: The first paired file to randomize.
        filename2: The second paired file to randomize.
        working_dir: Put temp files here.
    """
    temp_file1 = os.path.join(working_dir, str(uuid.uuid4()))
    temp_file2 = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename1, 'r') as handle1, \
            open(filename2, 'r') as handle2, \
            open(temp_file1, 'w') as out_handle1, \
            open(temp_file2, 'w') as out_handle2:
        final_generator = itertools.izip(
            custom_generators.parse_fastq(handle1),
            custom_generators.parse_fastq(handle2))
        for record1, record2 in final_generator:
            # Both mates share one random name, tagged /1 and /2.
            name = str(uuid.uuid4())
            record1 = (''.join(['@', name, '/1', '\n']),
                       record1[1], record1[2], record1[3])
            record2 = (''.join(['@', name, '/2', '\n']),
                       record2[1], record2[2], record2[3])
            out_handle1.write(''.join(record1))
            out_handle2.write(''.join(record2))
    shutil.move(temp_file1, filename1)
    shutil.move(temp_file2, filename2)
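
# A minimal usage sketch, assuming a pair of fastq files produced by the
# surrounding pipeline; the filenames and working directory below are
# hypothetical placeholders. Because both mates get the same UUID (with
# /1 and /2 suffixes), downstream tools can still pair the files by name.
def _demo_randomize_names_paired():
    randomize_names_paired('reads_R1.fastq', 'reads_R2.fastq', '/tmp')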
def add_generator(fq_filename, sam_filename, drm_list, all_blockers,
                  remove_rt, paired_end):
    """
    Re-fulfill all prevalences by adding reads. Designed to be chained
    to a delete_generator.

    Args:
        fq_filename: a fastq filename (or a pair of filenames if paired_end).
        sam_filename: a sam filename (or a pair of filenames if paired_end).
        drm_list: a list of DRMs that must have exact prevalences.
        all_blockers: blocking DRM sets, with populated num_deleted.
        remove_rt: are we removing (almost) all RT DRMs?
        paired_end: are the reads paired-end?

    Returns:
        A tuple containing:
            * A generator that adds reads to re-fulfill all prevalences.
            * A list of open file handles (NB because this generator is
              not exhausted by this function, file handles must be closed
              only when the generator is exhausted).
    """
    final_generator = iter([])
    handles_to_close = []
    for blocker in all_blockers:
        if paired_end:
            fq_handle1 = open(fq_filename[0], 'r')
            fq_handle2 = open(fq_filename[1], 'r')
            sam_handle1 = open(sam_filename[0], 'r')
            sam_handle2 = open(sam_filename[1], 'r')
            handles_to_close.extend(
                [fq_handle1, fq_handle2, sam_handle1, sam_handle2])
            in_generator = itertools.izip(
                custom_generators.parse_fastq(fq_handle1),
                custom_generators.parse_sam(sam_handle1),
                custom_generators.parse_fastq(fq_handle2),
                custom_generators.parse_sam(sam_handle2))
        else:
            fq_handle = open(fq_filename, 'r')
            sam_handle = open(sam_filename, 'r')
            handles_to_close.extend([fq_handle, sam_handle])
            in_generator = itertools.izip(
                custom_generators.parse_fastq(fq_handle),
                custom_generators.parse_sam(sam_handle))
        generator_frag = add_generator_internal(in_generator, drm_list,
                                                blocker, remove_rt)
        final_generator = itertools.chain(final_generator, generator_frag)
    return final_generator, handles_to_close
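
# A minimal drain-loop sketch for add_generator, assuming single-end
# input; drm_list and all_blockers are whatever the surrounding pipeline
# supplies. The docstring above requires the returned handles to stay
# open until the generator is exhausted, so try/finally is one safe way
# to honor that contract.
def _demo_drain_add_generator(fq_filename, sam_filename, drm_list,
                              all_blockers):
    gen, handles = add_generator(fq_filename, sam_filename, drm_list,
                                 all_blockers, remove_rt=False,
                                 paired_end=False)
    added = []
    try:
        for item in gen:
            added.append(item)
    finally:
        # Close only after the generator is spent, per the contract above.
        for h in handles:
            h.close()
    return added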
def single_generator(fq_handle, sam_handle, drm_list, reads_per_drm,
                     remove_rt):
    """
    Return a generator over the reads to include and an associated
    dictionary that is filled in as the generator is exhausted.

    Args:
        fq_handle: an open file handle to a fastq file.
        sam_handle: an open file handle to a sam file.
        drm_list: a list of DRMs that must have exact prevalences.
        reads_per_drm: the number of reads to add per DRM.
        remove_rt: are we removing (almost) all RT DRMs?

    Returns:
        a tuple of:
            * an unpopulated dictionary of {DRM: coverage count}
            * the generator which, once exhausted, populates the
              dictionary and yields the reads to include.
    """
    drm_dict = {d: 0 for d in drm_list}
    in_generator = itertools.izip(custom_generators.parse_fastq(fq_handle),
                                  custom_generators.parse_sam(sam_handle))
    out_generator = ({"read": read[0], "sam": read[1]}
                     for read in in_generator
                     if within_required_prevalence(read, drm_dict,
                                                   reads_per_drm, remove_rt))
    return drm_dict, out_generator
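
# A minimal sketch of the intended calling pattern, assuming hypothetical
# filenames: drm_dict only holds meaningful counts once the generator has
# been fully consumed, so exhaust it before reading the dictionary.
def _demo_single_generator(drm_list):
    with open('reads.fastq', 'r') as fq, open('reads.sam', 'r') as sam:
        drm_dict, gen = single_generator(fq, sam, drm_list,
                                         reads_per_drm=100,
                                         remove_rt=False)
        selected = list(gen)  # exhausting the generator fills drm_dict
    return drm_dict, selected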
def _open_all_data_files(sequences, fastq_files, sam_files, diff_files):
    """
    Load all fastq, sam, and diff files into memory, keyed by mutation type.

    Args:
        sequences: sequence records; their ids seed the 'null' diff entry.
        fastq_files: {mutation type: list of fastq filenames}.
        sam_files: {mutation type: sam filename}.
        diff_files: {mutation type: pickled diff filename}.

    Returns:
        (fastq records, sam reads keyed by read id, unpickled diffs),
        each keyed by mutation type.
    """
    open_fq_files = {}
    for mutation_type, fq in fastq_files.iteritems():
        fastq_siblings = []
        for _fq in fq:
            with open(_fq, 'r') as f:
                fastq_siblings.append([i for i in parse_fastq(f)])
        open_fq_files[mutation_type] = fastq_siblings
    open_sam_files = {}
    for mutation_type, fs in sam_files.iteritems():
        with open(fs, 'r') as f:
            # A read id can occur on multiple sam lines, so collect a
            # list of sam reads per id.
            sam_reads = {}
            for sam_read in parse_sam(f):
                read_id = sam_read[0].strip('\n')
                if read_id in sam_reads:
                    sam_reads[read_id].append(sam_read)
                else:
                    sam_reads[read_id] = [sam_read]
            open_sam_files[mutation_type] = sam_reads
    open_diff_files = {'null': {i.id: [] for i in sequences}}
    for mutation_type, fd in diff_files.iteritems():
        with open(fd, 'r') as f:
            open_diff_files[mutation_type] = pickle.load(f)
    return open_fq_files, open_sam_files, open_diff_files
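
# A sketch of the input shape _open_all_data_files expects, inferred from
# the loops above; the 'rt' key and all filenames are hypothetical.
# Note that fastq_files maps each mutation type to a *list* of filenames,
# while sam_files and diff_files map to single filenames.
def _demo_open_all_data_files(sequences):
    fastq_files = {'rt': ['rt_1.fastq', 'rt_2.fastq']}
    sam_files = {'rt': 'rt.sam'}
    diff_files = {'rt': 'rt_diffs.pickle'}
    fq, sam, diffs = _open_all_data_files(sequences, fastq_files,
                                          sam_files, diff_files)
    # diffs always gains a 'null' entry keyed by sequence id.
    return fq, sam, diffs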
def randomize_names(filename, working_dir):
    """
    Write out sequences with randomized names.

    Args:
        filename: The file to randomize.
        working_dir: Put temp files here.
    """
    temp_file = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename, 'r') as handle, open(temp_file, 'w') as out_handle:
        for record in custom_generators.parse_fastq(handle):
            record = (''.join(['@', str(uuid.uuid4()), '\n']),
                      record[1], record[2], record[3])
            out_handle.write(''.join(record))
    shutil.move(temp_file, filename)
def tag_names(filename, sequence_name, working_dir):
    """
    Change all sequence names to a particular name.

    Args:
        filename: The file to rename.
        sequence_name: The name to call the sequence.
        working_dir: Put temp files here.
    """
    temp_file = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename, 'r') as handle, open(temp_file, 'w') as out_handle:
        for record in custom_generators.parse_fastq(handle):
            record = (''.join(['@', sequence_name, '\n']),
                      record[1], record[2], record[3])
            out_handle.write(''.join(record))
    shutil.move(temp_file, filename)
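
# A minimal sketch covering both in-place renamers above, with
# hypothetical filenames: randomize_names gives every read a fresh UUID,
# while tag_names stamps every read with one fixed name.
def _demo_rename_reads():
    randomize_names('reads.fastq', '/tmp')
    tag_names('reads.fastq', 'sample_01', '/tmp')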
def select_only_flag(fastq_file, sam_file, working_dir, flag):
    """
    Select only fastq entries that have a particular SAM flag
    associated with them (usually forward or reverse).

    Args:
        fastq_file: the FASTQ file to take from
        sam_file: the SAM file to take from
        working_dir: the working directory
        flag: the flag to select

    Returns:
        (new fq file, new sam file)
    """
    out_fq_path = os.path.join(working_dir, str(uuid.uuid4()))
    out_sam_path = os.path.join(working_dir, str(uuid.uuid4()))
    with open(fastq_file, 'r') as fq_handle, \
            open(sam_file, 'r') as sam_handle:
        in_generator = itertools.izip(
            custom_generators.parse_fastq(fq_handle),
            custom_generators.parse_sam(sam_handle))
        with open(out_fq_path, 'w') as fq_out, \
                open(out_sam_path, 'w') as sam_out:
            for p in in_generator:
                # Keep only pairs whose SAM flag matches exactly.
                if int(p[1][custom_generators.FLAG]) != flag:
                    continue
                fq_out.write("".join(p[0]))
                sam_out.write("\t".join(p[1]))
    return out_fq_path, out_sam_path
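
# A minimal sketch splitting reads by orientation, assuming single-end
# alignments where SAM flag 0 marks a forward-strand read and flag 16 a
# reverse-strand read; the filenames are hypothetical.
def _demo_split_by_strand(working_dir):
    fwd_fq, fwd_sam = select_only_flag('reads.fastq', 'reads.sam',
                                       working_dir, 0)
    rev_fq, rev_sam = select_only_flag('reads.fastq', 'reads.sam',
                                       working_dir, 16)
    return (fwd_fq, fwd_sam), (rev_fq, rev_sam)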