import itertools
import os
import pickle
import shutil
import uuid

import custom_generators
from custom_generators import parse_fastq, parse_sam


def randomize_names_paired(filename1, filename2, working_dir):
    """
    Write out paired-end sequences with randomized names.

    Args:
        filename1: The first paired file to randomize.
        filename2: The second paired file to randomize.
        working_dir: Put temp files here.
    """
    temp_file1 = os.path.join(working_dir, str(uuid.uuid4()))
    temp_file2 = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename1, 'r') as handle1, \
            open(filename2, 'r') as handle2, \
            open(temp_file1, 'w') as out_handle1, \
            open(temp_file2, 'w') as out_handle2:
        final_generator = itertools.izip(
            custom_generators.parse_fastq(handle1),
            custom_generators.parse_fastq(handle2))
        for record1, record2 in final_generator:
            # Both mates share one random name, tagged /1 and /2.
            name = str(uuid.uuid4())
            record1 = (''.join(['@', name, '/1', '\n']),
                       record1[1], record1[2], record1[3])
            record2 = (''.join(['@', name, '/2', '\n']),
                       record2[1], record2[2], record2[3])
            out_handle1.write(''.join(record1))
            out_handle2.write(''.join(record2))
    shutil.move(temp_file1, filename1)
    shutil.move(temp_file2, filename2)
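
# A minimal usage sketch, assuming a pair of fastq files produced by the
# surrounding pipeline; the filenames and working directory below are
# hypothetical placeholders. Because both mates get the same UUID (with
# /1 and /2 suffixes), downstream tools can still pair the files by name.
def _demo_randomize_names_paired():
    randomize_names_paired('reads_R1.fastq', 'reads_R2.fastq', '/tmp')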
def add_generator(fq_filename, sam_filename, drm_list, all_blockers,
                  remove_rt, paired_end):
    """
    Re-fulfill all prevalences by adding reads. Designed to be chained
    to a delete_generator.

    Args:
        fq_filename: a fastq filename (or a pair of filenames if paired_end).
        sam_filename: a sam filename (or a pair of filenames if paired_end).
        drm_list: a list of DRMs that must have exact prevalences.
        all_blockers: blocking DRM sets, with populated num_deleted.
        remove_rt: are we removing (almost) all RT DRMs?
        paired_end: are the reads paired-end?

    Returns:
        A tuple containing:
            * A generator that adds reads to re-fulfill all prevalences.
            * A list of open file handles (NB because this generator is
              not exhausted by this function, file handles must be closed
              only when the generator is exhausted).
    """
    final_generator = iter([])
    handles_to_close = []
    for blocker in all_blockers:
        if paired_end:
            fq_handle1 = open(fq_filename[0], 'r')
            fq_handle2 = open(fq_filename[1], 'r')
            sam_handle1 = open(sam_filename[0], 'r')
            sam_handle2 = open(sam_filename[1], 'r')
            handles_to_close.extend(
                [fq_handle1, fq_handle2, sam_handle1, sam_handle2])
            in_generator = itertools.izip(
                custom_generators.parse_fastq(fq_handle1),
                custom_generators.parse_sam(sam_handle1),
                custom_generators.parse_fastq(fq_handle2),
                custom_generators.parse_sam(sam_handle2))
        else:
            fq_handle = open(fq_filename, 'r')
            sam_handle = open(sam_filename, 'r')
            handles_to_close.extend([fq_handle, sam_handle])
            in_generator = itertools.izip(
                custom_generators.parse_fastq(fq_handle),
                custom_generators.parse_sam(sam_handle))
        generator_frag = add_generator_internal(in_generator, drm_list,
                                                blocker, remove_rt)
        final_generator = itertools.chain(final_generator, generator_frag)
    return final_generator, handles_to_close
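
# A minimal drain-loop sketch for add_generator, assuming single-end
# input; drm_list and all_blockers are whatever the surrounding pipeline
# supplies. The docstring above requires the returned handles to stay
# open until the generator is exhausted, so try/finally is one safe way
# to honor that contract.
def _demo_drain_add_generator(fq_filename, sam_filename, drm_list,
                              all_blockers):
    gen, handles = add_generator(fq_filename, sam_filename, drm_list,
                                 all_blockers, remove_rt=False,
                                 paired_end=False)
    added = []
    try:
        for item in gen:
            added.append(item)
    finally:
        # Close only after the generator is spent, per the contract above.
        for h in handles:
            h.close()
    return added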
def single_generator(fq_handle, sam_handle, drm_list, reads_per_drm,
                     remove_rt):
    """
    Return a generator over the reads to include and an associated
    dictionary that is filled in as the generator is exhausted.

    Args:
        fq_handle: an open file handle to a fastq file.
        sam_handle: an open file handle to a sam file.
        drm_list: a list of DRMs that must have exact prevalences.
        reads_per_drm: the number of reads to add per DRM.
        remove_rt: are we removing (almost) all RT DRMs?

    Returns:
        a tuple of:
            * an unpopulated dictionary of {DRM: coverage count}
            * the generator which, once exhausted, populates the
              dictionary and yields the reads to include.
    """
    drm_dict = {d: 0 for d in drm_list}
    in_generator = itertools.izip(custom_generators.parse_fastq(fq_handle),
                                  custom_generators.parse_sam(sam_handle))
    out_generator = ({"read": read[0], "sam": read[1]}
                     for read in in_generator
                     if within_required_prevalence(read, drm_dict,
                                                   reads_per_drm, remove_rt))
    return drm_dict, out_generator
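
# A minimal sketch of the intended calling pattern, assuming hypothetical
# filenames: drm_dict only holds meaningful counts once the generator has
# been fully consumed, so exhaust it before reading the dictionary.
def _demo_single_generator(drm_list):
    with open('reads.fastq', 'r') as fq, open('reads.sam', 'r') as sam:
        drm_dict, gen = single_generator(fq, sam, drm_list,
                                         reads_per_drm=100,
                                         remove_rt=False)
        selected = list(gen)  # exhausting the generator fills drm_dict
    return drm_dict, selected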
def _open_all_data_files(sequences, fastq_files, sam_files, diff_files):
    """
    Load all fastq, sam, and diff files into memory, keyed by mutation type.

    Args:
        sequences: sequence records; their ids seed the 'null' diff entry.
        fastq_files: {mutation type: list of fastq filenames}.
        sam_files: {mutation type: sam filename}.
        diff_files: {mutation type: pickled diff filename}.

    Returns:
        (fastq records, sam reads keyed by read id, unpickled diffs),
        each keyed by mutation type.
    """
    open_fq_files = {}
    for mutation_type, fq in fastq_files.iteritems():
        fastq_siblings = []
        for _fq in fq:
            with open(_fq, 'r') as f:
                fastq_siblings.append([i for i in parse_fastq(f)])
        open_fq_files[mutation_type] = fastq_siblings
    open_sam_files = {}
    for mutation_type, fs in sam_files.iteritems():
        with open(fs, 'r') as f:
            # A read id can occur on multiple sam lines, so collect a
            # list of sam reads per id.
            sam_reads = {}
            for sam_read in parse_sam(f):
                read_id = sam_read[0].strip('\n')
                if read_id in sam_reads:
                    sam_reads[read_id].append(sam_read)
                else:
                    sam_reads[read_id] = [sam_read]
            open_sam_files[mutation_type] = sam_reads
    open_diff_files = {'null': {i.id: [] for i in sequences}}
    for mutation_type, fd in diff_files.iteritems():
        with open(fd, 'r') as f:
            open_diff_files[mutation_type] = pickle.load(f)
    return open_fq_files, open_sam_files, open_diff_files
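
# A sketch of the input shape _open_all_data_files expects, inferred from
# the loops above; the 'rt' key and all filenames are hypothetical.
# Note that fastq_files maps each mutation type to a *list* of filenames,
# while sam_files and diff_files map to single filenames.
def _demo_open_all_data_files(sequences):
    fastq_files = {'rt': ['rt_1.fastq', 'rt_2.fastq']}
    sam_files = {'rt': 'rt.sam'}
    diff_files = {'rt': 'rt_diffs.pickle'}
    fq, sam, diffs = _open_all_data_files(sequences, fastq_files,
                                          sam_files, diff_files)
    # diffs always gains a 'null' entry keyed by sequence id.
    return fq, sam, diffs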
def randomize_names(filename, working_dir):
    """
    Write out sequences with randomized names.

    Args:
        filename: The file to randomize.
        working_dir: Put temp files here.
    """
    temp_file = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename, 'r') as handle, open(temp_file, 'w') as out_handle:
        for record in custom_generators.parse_fastq(handle):
            record = (''.join(['@', str(uuid.uuid4()), '\n']),
                      record[1], record[2], record[3])
            out_handle.write(''.join(record))
    shutil.move(temp_file, filename)
def tag_names(filename, sequence_name, working_dir):
    """
    Change all sequence names to a particular name.

    Args:
        filename: The file to rename.
        sequence_name: The name to call the sequence.
        working_dir: Put temp files here.
    """
    temp_file = os.path.join(working_dir, str(uuid.uuid4()))
    with open(filename, 'r') as handle, open(temp_file, 'w') as out_handle:
        for record in custom_generators.parse_fastq(handle):
            record = (''.join(['@', sequence_name, '\n']),
                      record[1], record[2], record[3])
            out_handle.write(''.join(record))
    shutil.move(temp_file, filename)
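
# A minimal sketch covering both in-place renamers above, with
# hypothetical filenames: randomize_names gives every read a fresh UUID,
# while tag_names stamps every read with one fixed name.
def _demo_rename_reads():
    randomize_names('reads.fastq', '/tmp')
    tag_names('reads.fastq', 'sample_01', '/tmp')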
def select_only_flag(fastq_file, sam_file, working_dir, flag):
    """
    Select only fastq entries that have a particular SAM flag
    associated with them (usually forward or reverse).

    Args:
        fastq_file: the FASTQ file to take from
        sam_file: the SAM file to take from
        working_dir: the working directory
        flag: the flag to select

    Returns:
        (new fq file, new sam file)
    """
    out_fq_path = os.path.join(working_dir, str(uuid.uuid4()))
    out_sam_path = os.path.join(working_dir, str(uuid.uuid4()))
    with open(fastq_file, 'r') as fq_handle, \
            open(sam_file, 'r') as sam_handle:
        in_generator = itertools.izip(
            custom_generators.parse_fastq(fq_handle),
            custom_generators.parse_sam(sam_handle))
        with open(out_fq_path, 'w') as fq_out, \
                open(out_sam_path, 'w') as sam_out:
            for p in in_generator:
                # Keep only pairs whose SAM flag matches exactly.
                if int(p[1][custom_generators.FLAG]) != flag:
                    continue
                fq_out.write("".join(p[0]))
                sam_out.write("\t".join(p[1]))
    return out_fq_path, out_sam_path
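
# A minimal sketch splitting reads by orientation, assuming single-end
# alignments where SAM flag 0 marks a forward-strand read and flag 16 a
# reverse-strand read; the filenames are hypothetical.
def _demo_split_by_strand(working_dir):
    fwd_fq, fwd_sam = select_only_flag('reads.fastq', 'reads.sam',
                                       working_dir, 0)
    rev_fq, rev_sam = select_only_flag('reads.fastq', 'reads.sam',
                                       working_dir, 16)
    return (fwd_fq, fwd_sam), (rev_fq, rev_sam)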