def buildIndex(genome_fp, ref_genome_fp, genome_seq=None, ref_genome_seq=None,
               fill_gaps=True, max_gap_width=300, smooth_edges=True,
               smoothing_radius=20):
    ''' Builds and returns an index mapping from the genome in `genome_fp`
    to the genome in `ref_genome_fp`.

    The two genome files are aligned with Mauve and the resulting XMFA
    sub-alignments are walked column by column to populate a lookup table
    (LUT) indexed by genome position.

    genome_fp        -- path to the genome to map from
    ref_genome_fp    -- path to the reference genome to map to
    genome_seq       -- optional sequence for `genome_fp`; only used when
                        `fill_gaps` is True. If None, it is read from
                        `genome_fp` with `getSeqFromFile`.
    ref_genome_seq   -- optional sequence for `ref_genome_fp`; only used when
                        `fill_gaps` is True. If None, it is read from
                        `ref_genome_fp` with `getSeqFromFile`.
    fill_gaps        -- if True, gaps up to `max_gap_width` will be filled if
                        flanking indices can be found with commensurate map
                        spacing (see `indexutils.fillGaps` docstring)
    max_gap_width    -- forwarded to `indexutils.fillGaps`
    smooth_edges     -- if True, areas of high mismatch frequency with
                        gap-penalty-related problems will be smoothed
    smoothing_radius -- forwarded to `indexutils.smoothEdges`

    Returns a numpy array of dtype `IDX_ARRAY_DTYPE` with one entry per base
    of the genome in `genome_fp`; un-mapped positions hold -1.
    '''
    # Generate absolute filepaths
    genome_fp = os.path.abspath(genome_fp)
    ref_genome_fp = os.path.abspath(ref_genome_fp)

    # Run mauve and parse output (all output files are cleaned up after run /
    # parsing, so the XMFA file must be parsed inside the `with` block)
    with TempDirs(1) as (temp_dir,):
        output_fp = os.path.join(temp_dir, 'mauveout.xmfa')
        runMauve([genome_fp, ref_genome_fp], flags={'--output': output_fp})
        sub_alignment_groups = parseXMFA(output_fp)

    genome_length = getGenomeLength(genome_fp)

    # LUT to map indices from the engineered genome to the reference genome.
    # Initialize all values to -1 (un-mapped indices will be -1)
    idx_lut = np.empty(genome_length, dtype=IDX_ARRAY_DTYPE)
    idx_lut[:] = -1

    # Parse sub-alignments into index mapping
    for sub_alignment_group in sub_alignment_groups:
        # Sequence numbers 1 and 2 presumably follow the order in which the
        # files were handed to runMauve (genome first, reference second)
        genome_sa = indexutils.lookupSubAlignment(1, sub_alignment_group)
        ref_genome_sa = indexutils.lookupSubAlignment(2, sub_alignment_group)
        # Only groups containing both genomes contribute to the mapping
        if genome_sa is None or ref_genome_sa is None:
            continue

        ref_aligned_seq = ref_genome_sa.seq
        # Note that we have to convert to 0-based indexing
        genome_idx = genome_sa.start_idx - 1
        ref_genome_idx = ref_genome_sa.start_idx - 1
        # NOTE(review): both counters are advanced *before* the LUT is
        # written, so the first aligned column is recorded at `start_idx`
        # rather than `start_idx - 1`, and the `genome_idx < genome_length`
        # guard below is what prevents an out-of-bounds write at the end of
        # the genome. Presumably `indexutils.fixZeroIdx` compensates for
        # position 0 never being written here -- confirm before changing.
        for column, base in enumerate(genome_sa.seq):
            ref_genome_base = ref_aligned_seq[column]
            # A '-' is an alignment gap and does not consume a base
            if base != '-':
                genome_idx += 1
            if ref_genome_base != '-':
                ref_genome_idx += 1
            # Only identical columns are recorded in the mapping
            if base == ref_genome_base and genome_idx < genome_length:
                idx_lut[genome_idx] = ref_genome_idx

    if fill_gaps:
        # Explicit None checks: truthiness is ambiguous for array-like
        # sequences and would discard a deliberately supplied falsy value
        if genome_seq is None:
            genome_seq = getSeqFromFile(genome_fp)
        if ref_genome_seq is None:
            ref_genome_seq = getSeqFromFile(ref_genome_fp)
        indexutils.fixZeroIdx(idx_lut, genome_seq, ref_genome_seq)
        indexutils.fillGaps(idx_lut, max_gap_width=max_gap_width)

    if smooth_edges:
        indexutils.smoothEdges(idx_lut, smoothing_radius=smoothing_radius)

    return idx_lut


def getGenomeLength(genome_fp):
    ''' Return the number of bases in the genome stored at `genome_fp`.

    The file is expected to be a fasta file containing a single header /
    sequence; the sequence is loaded with `getSeqFromFile` and measured.
    '''
    genome_sequence = getSeqFromFile(genome_fp)
    return len(genome_sequence)