Пример #1
0
def buildIndex(genome_fp, ref_genome_fp, genome_seq=None, ref_genome_seq=None,
               fill_gaps=True, max_gap_width=300, smooth_edges=True,
               smoothing_radius=20):
    ''' Build and return a lookup table mapping positions in the genome at
    `genome_fp` onto positions in the genome at `ref_genome_fp`.

    The two files are aligned with `runMauve`, the resulting XMFA file is
    parsed with `parseXMFA`, and every alignment column in which both
    sequences carry the same character is recorded in the table. Entries
    that never receive a mapping keep the value -1.

    Args:
        genome_fp: path to the genome whose indices are being mapped.
        ref_genome_fp: path to the reference genome.
        genome_seq: optional sequence for `genome_fp`; only consulted when
            `fill_gaps` is True, and read from the file if falsy.
        ref_genome_seq: optional sequence for `ref_genome_fp`; same
            handling as `genome_seq`.
        fill_gaps: if True, run `indexutils.fixZeroIdx` and then
            `indexutils.fillGaps` (with `max_gap_width`) on the table.
        max_gap_width: forwarded to `indexutils.fillGaps`.
        smooth_edges: if True, run `indexutils.smoothEdges` (with
            `smoothing_radius`) on the table.
        smoothing_radius: forwarded to `indexutils.smoothEdges`.

    Returns:
        Array of length `getGenomeLength(genome_fp)` with dtype
        `IDX_ARRAY_DTYPE`.
    '''
    # Mauve is handed absolute paths
    genome_fp = os.path.abspath(genome_fp)
    ref_genome_fp = os.path.abspath(ref_genome_fp)

    # The alignment output only needs to live long enough to be parsed, so
    # it is written inside a temporary directory
    with TempDirs(1) as (scratch_dir,):
        xmfa_fp = os.path.join(scratch_dir, 'mauveout.xmfa')
        runMauve([genome_fp, ref_genome_fp], flags={'--output': xmfa_fp})
        sub_alignment_groups = parseXMFA(xmfa_fp)
    genome_length = getGenomeLength(genome_fp)

    # One slot per genome position; -1 means "no mapping found"
    lut = np.empty(genome_length, dtype=IDX_ARRAY_DTYPE)
    lut[:] = -1

    for group in sub_alignment_groups:
        # Sub-alignments 1 and 2 are presumably the genome and the reference
        # (the order the files were passed to Mauve) -- TODO confirm
        query_sa = indexutils.lookupSubAlignment(1, group)
        target_sa = indexutils.lookupSubAlignment(2, group)
        if query_sa is None or target_sa is None:
            # Both sequences must be present in the group to map anything
            continue

        # Start indices are shifted down by one to move to 0-based indexing.
        # NOTE(review): the counters below are advanced *before* the table
        # is written, so the first non-gap column of a group is stored at
        # `start_idx` rather than `start_idx - 1` -- confirm this is the
        # intended convention (fixZeroIdx / the bounds check may rely on it)
        query_pos = query_sa.start_idx - 1
        target_pos = target_sa.start_idx - 1
        target_columns = target_sa.seq

        for column, query_base in enumerate(query_sa.seq):
            target_base = target_columns[column]
            # A '-' is an alignment gap and does not consume a position
            if query_base != '-':
                query_pos += 1
            if target_base != '-':
                target_pos += 1
            # Only identical columns that fall inside the table are recorded
            if query_base != target_base or query_pos >= genome_length:
                continue
            lut[query_pos] = target_pos

    if fill_gaps:
        # Sequences are loaded lazily: only when gap filling needs them and
        # the caller did not supply them
        if not genome_seq:
            genome_seq = getSeqFromFile(genome_fp)
        if not ref_genome_seq:
            ref_genome_seq = getSeqFromFile(ref_genome_fp)
        indexutils.fixZeroIdx(lut, genome_seq, ref_genome_seq)
        indexutils.fillGaps(lut, max_gap_width=max_gap_width)

    if smooth_edges:
        indexutils.smoothEdges(lut, smoothing_radius=smoothing_radius)

    return lut
Пример #2
0
def getGenomeLength(genome_fp):
    ''' Return the length of the sequence that `getSeqFromFile` reads from
    `genome_fp` (expected to be a fasta file holding a single header /
    sequence).
    '''
    sequence = getSeqFromFile(genome_fp)
    return len(sequence)