示例#1
0
def add_generator(fq_filename, sam_filename, drm_list, all_blockers, 
    remove_rt, paired_end):

    """
    Refulfill all prevalences by adding reads.  Designed to be chained to
    a delete_generator.

    One pass over the input files is set up per blocker; the per-blocker
    generators are chained in the order of all_blockers.

    Args:
        fq_filename: a fastq filename, or (when paired_end is true) an
            indexable pair of fastq filenames.
        sam_filename: a sam filename, or (when paired_end is true) an
            indexable pair of sam filenames.
        drm_list: a list of DRMs that must have exact prevalences.
        all_blockers: blocking DRM sets, with populated num_deleted.
        remove_rt: are we removing (almost) all RT DRMs?
        paired_end: if true, read from both files of each pair in lockstep.
    Returns:
        A tuple containing:
            * A generator that adds reads to re-fulfill all prevalences.
            * A list of open file handles (NB because this generator is not
              exhausted by this function, file handles must be closed
              only when generator is exhausted).
    """

    fragments = []
    handles_to_close = []

    def _open_tracked(path):
        # Register every handle as soon as it is opened so that none of
        # them can be lost, in either single- or paired-end mode.
        handle = open(path, 'r')
        handles_to_close.append(handle)
        return handle

    try:
        for blocker in all_blockers:
            if paired_end:
                fq_handle1 = _open_tracked(fq_filename[0])
                fq_handle2 = _open_tracked(fq_filename[1])
                sam_handle1 = _open_tracked(sam_filename[0])
                sam_handle2 = _open_tracked(sam_filename[1])
                in_generator = itertools.izip(
                    custom_generators.parse_fastq(fq_handle1),
                    custom_generators.parse_sam(sam_handle1),
                    custom_generators.parse_fastq(fq_handle2),
                    custom_generators.parse_sam(sam_handle2)
                )
            else:
                fq_handle = _open_tracked(fq_filename)
                sam_handle = _open_tracked(sam_filename)
                in_generator = itertools.izip(
                    custom_generators.parse_fastq(fq_handle),
                    custom_generators.parse_sam(sam_handle)
                )
            fragments.append(add_generator_internal(
                in_generator, drm_list, blocker, remove_rt))
    except Exception:
        # The caller never receives the handle list if setup fails, so
        # release whatever was opened before propagating the error.
        for handle in handles_to_close:
            handle.close()
        raise

    final_generator = itertools.chain.from_iterable(fragments)
    return final_generator, handles_to_close
示例#2
0
def single_generator(fq_handle, sam_handle, drm_list, reads_per_drm,
                     remove_rt):
    """
    Build a lazy filter over paired fastq/sam records together with the
    per-DRM counter dictionary that the filter is given.

    Args:
        fq_handle: an open file handle to a fastq file.
        sam_handle: an open file handle to a sam file.
        drm_list: a list of DRMs that must have exact prevalences.
        reads_per_drm: the number of reads to add per DRM.
        remove_rt: are we removing (almost) all RT DRMs?
    Returns:
        a tuple of:
            * a dictionary mapping every DRM in drm_list to 0; it is handed
              to within_required_prevalence for each record (presumably
              updated there as the generator is consumed).
            * a generator yielding {"read": ..., "sam": ...} for each
              record accepted by within_required_prevalence.
    """

    coverage = dict.fromkeys(drm_list, 0)

    fastq_records = custom_generators.parse_fastq(fq_handle)
    sam_records = custom_generators.parse_sam(sam_handle)
    paired_records = itertools.izip(fastq_records, sam_records)

    def _accepted_reads():
        # Records are tested one at a time, only when the caller iterates.
        for record in paired_records:
            if not within_required_prevalence(
                    record, coverage, reads_per_drm, remove_rt):
                continue
            fastq_entry, sam_entry = record
            yield {"read": fastq_entry, "sam": sam_entry}

    return coverage, _accepted_reads()
示例#3
0
def _open_all_data_files(sequences, fastq_files, sam_files, diff_files):
    """
    Load the fastq, sam and diff data for every mutation type into memory.

    Args:
        sequences: iterable of objects with an ``id`` attribute.
        fastq_files: dict of {mutation type: iterable of fastq filenames}.
        sam_files: dict of {mutation type: sam filename}.
        diff_files: dict of {mutation type: pickled diff filename}.
    Returns:
        a tuple of:
            * {mutation type: [list of parsed fastq entries, one list per
              file]}
            * {mutation type: {read id: [sam reads with that id]}}
            * {mutation type: unpickled diff object}, plus a 'null' entry
              mapping every sequence id to an empty list.
    """
    fastq_data = {}
    for mutation_type, fastq_paths in fastq_files.iteritems():
        per_file_entries = []
        for fastq_path in fastq_paths:
            with open(fastq_path, 'r') as handle:
                per_file_entries.append(list(parse_fastq(handle)))
        fastq_data[mutation_type] = per_file_entries

    sam_data = {}
    for mutation_type, sam_path in sam_files.iteritems():
        reads_by_id = {}
        with open(sam_path, 'r') as handle:
            for sam_read in parse_sam(handle):
                # Group reads under their id with newlines stripped.
                key = sam_read[0].strip('\n')
                reads_by_id.setdefault(key, []).append(sam_read)
        sam_data[mutation_type] = reads_by_id

    diff_data = {'null': dict((seq.id, []) for seq in sequences)}
    for mutation_type, diff_path in diff_files.iteritems():
        # NOTE(review): pickle files are normally opened in binary mode;
        # confirm how these files are written before changing 'r'.
        # pickle.load must only ever be used on trusted files.
        with open(diff_path, 'r') as handle:
            diff_data[mutation_type] = pickle.load(handle)

    return fastq_data, sam_data, diff_data
示例#4
0
def correct_sam_file(original_sam_file, working_dir):
    """
    Replace the '=' and 'X' SAM CIGAR operations with 'M', because pysam
    doesn't understand '=' and 'X'.

    The corrected content is written to a uniquely named temporary file in
    working_dir, which is then moved over the original file.

    Args:
        original_sam_file: Path of the file to change (rewritten in place).
        working_dir: Put temp files here.
    """

    cigar_index = custom_generators.CIGAR
    corrected_sam_filename = os.path.join(working_dir, str(uuid.uuid4()))

    # Both files are managed by `with` so they are closed even if parsing
    # or writing fails part-way through.
    with open(corrected_sam_filename, 'w') as corrected_sam_file:
        copy_sam_header(original_sam_file, corrected_sam_file)

        with open(original_sam_file, 'r') as incorrect_sam_file:
            for read in custom_generators.parse_sam(incorrect_sam_file):
                cigar = read[cigar_index]
                if cigar is not None and cigar != "":
                    read[cigar_index] = \
                        cigar.replace("=", "M").replace("X", "M")

                corrected_sam_file.write('\t'.join(read))

    # Only replace the original once the corrected file is fully written
    # and closed.
    shutil.move(corrected_sam_filename, original_sam_file)
示例#5
0
def unaligned_sam_file(original_sam_filename, working_dir):
    """
    Remove the alignment from a SAM format file.

    Every read gets an empty CIGAR field and the "unmapped" flag.  The
    result is written to a uniquely named temporary file in working_dir,
    which is then moved over the original file.

    Args:
        original_sam_filename: Path of the aligned SAM file (rewritten in
            place).
        working_dir: Put temp files here.

    """

    corrected_sam_filename = os.path.join(working_dir, str(uuid.uuid4()))

    # Both files are managed by `with` so they are closed even if parsing
    # or writing fails part-way through.
    with open(corrected_sam_filename, 'w') as corrected_sam_file:
        copy_sam_header(original_sam_filename, corrected_sam_file)

        with open(original_sam_filename, 'r') as aligned_sam_file:
            for read in custom_generators.parse_sam(aligned_sam_file):
                read[custom_generators.CIGAR] = ''
                # 4 is the SAM flag signal for "unmapped"
                read[custom_generators.FLAG] = '4'
                corrected_sam_file.write('\t'.join(read))

    # Only replace the original once the new file is fully written and
    # closed.
    shutil.move(corrected_sam_filename, original_sam_filename)
def correct_sam_file(original_sam_file, working_dir):
    """
    Replace the '=' and 'X' SAM CIGAR operations with 'M', because pysam
    doesn't understand '=' and 'X'.

    The corrected content is written to a uniquely named temporary file in
    working_dir, which is then moved over the original file.

    Args:
        original_sam_file: Path of the file to change (rewritten in place).
        working_dir: Put temp files here.
    """

    cigar_index = custom_generators.CIGAR
    corrected_sam_filename = os.path.join(working_dir, str(uuid.uuid4()))

    # Context managers guarantee both handles are closed, including the
    # input handle, which previously was never closed explicitly.
    with open(corrected_sam_filename, 'w') as corrected_sam_file:
        copy_sam_header(original_sam_file, corrected_sam_file)

        with open(original_sam_file, 'r') as incorrect_sam_file:
            for read in custom_generators.parse_sam(incorrect_sam_file):
                cigar = read[cigar_index]
                if cigar is not None and cigar != "":
                    read[cigar_index] = \
                        cigar.replace("=", "M").replace("X", "M")

                corrected_sam_file.write('\t'.join(read))

    # Swap the corrected file in only after it has been flushed and closed.
    shutil.move(corrected_sam_filename, original_sam_file)
def unaligned_sam_file(original_sam_filename, working_dir):
    """
    Remove the alignment from a SAM format file.

    Every read gets an empty CIGAR field and the "unmapped" flag.  The
    result is written to a uniquely named temporary file in working_dir,
    which is then moved over the original file.

    Args:
        original_sam_filename: Path of the aligned SAM file (rewritten in
            place).
        working_dir: Put temp files here.

    """

    corrected_sam_filename = os.path.join(working_dir, str(uuid.uuid4()))

    # Context managers guarantee both handles are closed, including the
    # input handle, which previously was never closed explicitly.
    with open(corrected_sam_filename, 'w') as corrected_sam_file:
        copy_sam_header(original_sam_filename, corrected_sam_file)

        with open(original_sam_filename, 'r') as aligned_sam_file:
            for read in custom_generators.parse_sam(aligned_sam_file):
                read[custom_generators.CIGAR] = ''
                # 4 is the SAM flag signal for "unmapped"
                read[custom_generators.FLAG] = '4'
                corrected_sam_file.write('\t'.join(read))

    # Swap the new file in only after it has been flushed and closed.
    shutil.move(corrected_sam_filename, original_sam_filename)
示例#8
0
def single_generator(fq_handle, sam_handle, drm_list,
    reads_per_drm, remove_rt):

    """
    Build a lazy filter over paired fastq/sam records together with the
    per-DRM counter dictionary that the filter is given.

    Args:
        fq_handle: an open file handle to a fastq file.
        sam_handle: an open file handle to a sam file.
        drm_list: a list of DRMs that must have exact prevalences.
        reads_per_drm: the number of reads to add per DRM.
        remove_rt: are we removing (almost) all RT DRMs?
    Returns:
        a tuple of:
            * a dictionary mapping every DRM in drm_list to 0; it is handed
              to within_required_prevalence for each record (presumably
              updated there as the generator is consumed).
            * a generator yielding {"read": ..., "sam": ...} for each
              record accepted by within_required_prevalence.
    """

    coverage = dict.fromkeys(drm_list, 0)

    fastq_records = custom_generators.parse_fastq(fq_handle)
    sam_records = custom_generators.parse_sam(sam_handle)
    paired_records = itertools.izip(fastq_records, sam_records)

    def _accepted_reads():
        # Records are tested one at a time, only when the caller iterates.
        for record in paired_records:
            keep = within_required_prevalence(
                record, coverage, reads_per_drm, remove_rt
            )
            if keep:
                fastq_entry, sam_entry = record
                yield {"read": fastq_entry, "sam": sam_entry}

    return coverage, _accepted_reads()
示例#9
0
def add_generator(fq_filename, sam_filename, drm_list, all_blockers, remove_rt,
                  paired_end):
    """
    Refulfill all prevalences by adding reads.  Designed to be chained to
    a delete_generator.

    One pass over the input files is set up per blocker; the per-blocker
    generators are chained in the order of all_blockers.

    Args:
        fq_filename: a fastq filename, or (when paired_end is true) an
            indexable pair of fastq filenames.
        sam_filename: a sam filename, or (when paired_end is true) an
            indexable pair of sam filenames.
        drm_list: a list of DRMs that must have exact prevalences.
        all_blockers: blocking DRM sets, with populated num_deleted.
        remove_rt: are we removing (almost) all RT DRMs?
        paired_end: if true, read from both files of each pair in lockstep.
    Returns:
        A tuple containing:
            * A generator that adds reads to re-fulfill all prevalences.
            * A list of open file handles (NB because this generator is not
              exhausted by this function, file handles must be closed
              only when generator is exhausted).
    """

    fragments = []
    handles_to_close = []

    def _open_tracked(path):
        # Register every handle as soon as it is opened so that none of
        # them can be lost, in either single- or paired-end mode.
        handle = open(path, 'r')
        handles_to_close.append(handle)
        return handle

    try:
        for blocker in all_blockers:
            if paired_end:
                fq_handle1 = _open_tracked(fq_filename[0])
                fq_handle2 = _open_tracked(fq_filename[1])
                sam_handle1 = _open_tracked(sam_filename[0])
                sam_handle2 = _open_tracked(sam_filename[1])
                in_generator = itertools.izip(
                    custom_generators.parse_fastq(fq_handle1),
                    custom_generators.parse_sam(sam_handle1),
                    custom_generators.parse_fastq(fq_handle2),
                    custom_generators.parse_sam(sam_handle2))
            else:
                fq_handle = _open_tracked(fq_filename)
                sam_handle = _open_tracked(sam_filename)
                in_generator = itertools.izip(
                    custom_generators.parse_fastq(fq_handle),
                    custom_generators.parse_sam(sam_handle))
            fragments.append(add_generator_internal(in_generator, drm_list,
                                                    blocker, remove_rt))
    except Exception:
        # The caller never receives the handle list if setup fails, so
        # release whatever was opened before propagating the error.
        for handle in handles_to_close:
            handle.close()
        raise

    final_generator = itertools.chain.from_iterable(fragments)
    return final_generator, handles_to_close
示例#10
0
def select_only_flag(
    fastq_file, sam_file, working_dir, flag):
    """
    Select only fastq entries that have a particular SAM flag
    associated with them (usually forward or reverse).

    The fastq and SAM files are read in lockstep; each pair whose SAM flag
    equals `flag` is written to two new, uniquely named files in
    working_dir.

    Args:
        fastq_file: the FASTQ file to take from
        sam_file: the SAM file to take from
        working_dir: the working directory
        flag: the flag to select (compared against int() of the SAM flag)

    Returns: (new fq file, new sam file)

    """

    out_fq_path = os.path.join(working_dir, str(uuid.uuid4()))
    out_sam_path = os.path.join(working_dir, str(uuid.uuid4()))

    with open(fastq_file, 'r') as fq_handle, \
         open(sam_file, 'r') as sam_handle:

        # Iterate the zipped stream directly.  A tee() whose second branch
        # is never consumed would buffer every record in memory.
        in_generator = itertools.izip(
            custom_generators.parse_fastq(fq_handle),
            custom_generators.parse_sam(sam_handle)
        )

        with open(out_fq_path, 'w') as fq_out, \
             open(out_sam_path, 'w') as sam_out:

            for fq_entry, sam_entry in in_generator:
                if int(sam_entry[custom_generators.FLAG]) != flag:
                    continue
                fq_out.write("".join(fq_entry))
                sam_out.write("\t".join(sam_entry))

    return out_fq_path, out_sam_path
示例#11
0
def select_only_flag(fastq_file, sam_file, working_dir, flag):
    """
    Select only fastq entries that have a particular SAM flag
    associated with them (usually forward or reverse).

    The fastq and SAM files are read in lockstep; each pair whose SAM flag
    equals `flag` is written to two new, uniquely named files in
    working_dir.

    Args:
        fastq_file: the FASTQ file to take from
        sam_file: the SAM file to take from
        working_dir: the working directory
        flag: the flag to select (compared against int() of the SAM flag)

    Returns: (new fq file, new sam file)

    """

    out_fq_path = os.path.join(working_dir, str(uuid.uuid4()))
    out_sam_path = os.path.join(working_dir, str(uuid.uuid4()))

    with open(fastq_file, 'r') as fq_handle, \
         open(sam_file, 'r') as sam_handle:

        # Iterate the zipped stream directly.  A tee() whose second branch
        # is never consumed would buffer every record in memory.
        in_generator = itertools.izip(custom_generators.parse_fastq(fq_handle),
                                      custom_generators.parse_sam(sam_handle))

        with open(out_fq_path, 'w') as fq_out, \
             open(out_sam_path, 'w') as sam_out:

            for fq_entry, sam_entry in in_generator:
                if int(sam_entry[custom_generators.FLAG]) != flag:
                    continue
                fq_out.write("".join(fq_entry))
                sam_out.write("\t".join(sam_entry))

    return out_fq_path, out_sam_path