def trim_reads(self):
    ''' Trim a random length barcode from the beginning by searching for
    the expected starting sequence.
    '''
    ti = self.target_info

    if ti.sequencing_direction == '+':
        start = ti.sequencing_start.start
        prefix = ti.target_sequence[start:start + 6]
    else:
        end = ti.sequencing_start.end
        prefix = utilities.reverse_complement(ti.target_sequence[end - 5:end + 1])

    prefix = prefix.upper()

    trimmed_fn = self.fns_by_read_type['fastq']['trimmed']
    with gzip.open(trimmed_fn, 'wt', compresslevel=1) as trimmed_fh:
        for read in self.progress(self.reads, desc='Trimming reads'):
            try:
                start = read.seq.index(prefix, 0, 20)
            except ValueError:
                start = 0

            end = adapters.trim_by_local_alignment(adapters.truseq_R2_rc, read.seq)
            trimmed_fh.write(str(read[start:end]))
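# A minimal, self-contained sketch of the prefix-search step above on a toy
# read (the helper and sequences here are illustrative, not part of the
# package API): look for the expected prefix within the first 20 nt and trim
# everything before it, falling back to the read start if it is absent.
def _toy_trim(seq, prefix):
    try:
        start = seq.index(prefix, 0, 20)
    except ValueError:
        start = 0
    return seq[start:]

assert _toy_trim('NNNACGTACGTTTT', 'ACGTAC') == 'ACGTACGTTTT'
assert _toy_trim('TTTTTTTTTT', 'ACGTAC') == 'TTTTTTTTTT'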
def at_least_n_Bs(pool, n, B):
    outcomes = []

    b_rc = utilities.reverse_complement(B)

    for c, s, d in pool.outcome_counts().index.values:
        if c == 'mismatches':
            outcome = knock_knock.outcome.MismatchOutcome.from_string(d)
            if Counter(outcome.snvs.basecalls)[b_rc] >= n:
                outcomes.append((c, s, d))

    return outcomes
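# A toy check of the counting rule used in at_least_n_Bs (illustrative values
# only): per-SNV basecalls are tallied with Counter and compared against the
# reverse complement of the base of interest, since basecalls are recorded on
# the sequenced strand.
from collections import Counter

basecalls = ['C', 'C', 'T']   # stand-in for outcome.snvs.basecalls
b_rc = 'C'                    # utilities.reverse_complement('G')
assert Counter(basecalls)[b_rc] >= 2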
def convert_insertion(ins, source_target_info, dest_target_info):
    ''' Insertions are defined by starts_afters and seqs. When switching
    between anchor/+ and sgRNA/sgRNA-strand coordinates, starts_afters may
    become starts_befores, and seqs may be reverse complemented.
    '''
    before_after_pairs = [(s, s + 1) for s in ins.starts_afters]

    sgRNA_coords = [
        (convert_to_sgRNA_coords(source_target_info, b), convert_to_sgRNA_coords(source_target_info, a))
        for a, b in before_after_pairs
    ]
    anchor_coords = [
        (convert_to_anchor_coords(dest_target_info, b), convert_to_anchor_coords(dest_target_info, a))
        for a, b in sgRNA_coords
    ]
    anchor_coords = [sorted(pair) for pair in anchor_coords]

    starts_afters = sorted([s for s, e in anchor_coords])

    if source_target_info.sgRNA_feature.strand != dest_target_info.sgRNA_feature.strand:
        seqs = [utilities.reverse_complement(seq) for seq in ins.seqs][::-1]
    else:
        seqs = ins.seqs

    return knock_knock.target_info.DegenerateInsertion(starts_afters, seqs)
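# Toy illustration of the strand handling in convert_insertion: when the
# source and destination sgRNA strands differ, each inserted seq is
# reverse-complemented and the list order is reversed. `_rc` is a standalone
# stand-in for utilities.reverse_complement.
def _rc(seq, _comp=str.maketrans('ACGT', 'TGCA')):
    return seq.translate(_comp)[::-1]

seqs = ['AAC', 'GT']
flipped = [_rc(s) for s in seqs][::-1]
assert flipped == ['AC', 'GTT']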
def build_guide_index(guides_fn, index_dir):
    ''' index entries are in same orientation as R2 '''
    index_dir = Path(index_dir)
    index_dir.mkdir(exist_ok=True)

    fasta_fn = index_dir / 'expected_R2s.fasta'

    guides_df = pd.read_csv(guides_fn, sep='\t', index_col=0)

    before_ps = 'AGTACCAAGTTGATAACGGACTAGCCTTATTTAAACTTGCTATGCTGTTTCCAGCTTAGCTCTTAAAC'
    # Note: Cs here are from untemplated addition and are not deterministically 3.
    after_ps = 'CCCATATAAGAAA'

    with fasta_fn.open('w') as fh:
        for name, protospacer in guides_df['protospacer'].items():
            expected_R2 = before_ps + utilities.reverse_complement(protospacer) + after_ps
            fh.write(str(fasta.Record(name, expected_R2)))

    pysam.faidx(str(fasta_fn))

    mapping_tools.build_STAR_index([fasta_fn], index_dir)

    bustools_dir = index_dir / 'bustools_annotations'
    bustools_dir.mkdir(exist_ok=True)

    matrix_fn = bustools_dir / 'matrix.ec'
    with matrix_fn.open('w') as fh:
        for i, name in enumerate(guides_df.index):
            fh.write(f'{i}\t{i}\n')

    transcript_to_gene_fn = bustools_dir / 'transcripts_to_genes.txt'
    with transcript_to_gene_fn.open('w') as fh:
        for i, name in enumerate(guides_df.index):
            fh.write(f'{name}\t{name}\t{name}\n')

    transcripts_fn = bustools_dir / 'transcripts.txt'
    with transcripts_fn.open('w') as fh:
        for i, name in enumerate(guides_df.index):
            fh.write(f'{name}\n')
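# Toy rendering of one index entry as constructed in build_guide_index
# (hypothetical protospacer; before_ps/after_ps copied from the function):
# the expected R2 is the fixed upstream sequence, the reverse complement of
# the protospacer, then the fixed downstream sequence.
def _rc(seq, _comp=str.maketrans('ACGT', 'TGCA')):
    return seq.translate(_comp)[::-1]

before_ps = 'AGTACCAAGTTGATAACGGACTAGCCTTATTTAAACTTGCTATGCTGTTTCCAGCTTAGCTCTTAAAC'
after_ps = 'CCCATATAAGAAA'
expected_R2 = before_ps + _rc('GACGCATAAAGATGAGACGC') + after_ps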
def get_resolvers(base_dir, group):
    sample_sheet = load_sample_sheet(base_dir, group)

    expected_seqs = {}
    resolvers = {}

    I7_indices = {name: details['I7_index'] for name, details in sample_sheet['pool_details'].items()}
    I5_indices = {name: details['I5_index'] for name, details in sample_sheet['pool_details'].items()}

    expected_I7_indices = set()
    for seqs in I7_indices.values():
        if not isinstance(seqs, list):
            seqs = [seqs]
        expected_I7_indices.update(seqs)

    expected_I5_indices = set()
    for seqs in I5_indices.values():
        if not isinstance(seqs, list):
            seqs = [seqs]
        expected_I5_indices.update(seqs)

    expected_seqs['I7'] = expected_I7_indices
    expected_seqs['I5'] = expected_I5_indices

    resolvers['I7'] = utilities.get_one_mismatch_resolver(I7_indices).get
    resolvers['I5'] = utilities.get_one_mismatch_resolver(I5_indices).get

    variable_guide_library = repair_seq.guide_library.GuideLibrary(base_dir, sample_sheet['variable_guide_library'])

    ti_prefix = sample_sheet['target_info_prefix']

    guide_seqs = {}
    guide_seqs['variable_guide'] = defaultdict(set)

    if 'fixed_guide_library' in sample_sheet:
        has_fixed_barcode = True
        fixed_guide_library = repair_seq.guide_library.GuideLibrary(base_dir, sample_sheet['fixed_guide_library'])
        guide_seqs['fixed_guide_barcode'] = defaultdict(set)
        guide_pairs = list(itertools.product(fixed_guide_library.guides, variable_guide_library.guides))
    else:
        has_fixed_barcode = False
        guide_pairs = [('none', vg) for vg in variable_guide_library.guides]

    for fg, vg in guide_pairs:
        if fg == 'none':
            ti_name = f'{ti_prefix}_{variable_guide_library.name}_{vg}'
        else:
            ti_name = f'{ti_prefix}-{fg}-{vg}'

        ti = knock_knock.target_info.TargetInfo(base_dir, ti_name)

        R1_primer = ti.features[ti.target, sample_sheet['R1_primer']]
        R2_primer = ti.features[ti.target, sample_sheet['R2_primer']]

        target_seq = ti.reference_sequences[ti.target]

        expected_R1 = target_seq[R1_primer.start:R1_primer.start + sample_sheet['R1_read_length']]
        guide_seqs['variable_guide'][vg].add(expected_R1)

        if fg != 'none':
            fixed_guide_barcode = ti.features[ti.target, 'fixed_guide_barcode']
            expected_R2 = utilities.reverse_complement(target_seq[fixed_guide_barcode.start:R2_primer.end + 1])
            guide_seqs['fixed_guide_barcode'][fg].add(expected_R2)

    for which in ['fixed_guide_barcode', 'variable_guide']:
        if which in guide_seqs:
            dictionary = guide_seqs[which]
            for g in sorted(dictionary):
                seqs = dictionary[g]
                if len(seqs) != 1:
                    raise ValueError(which, g, seqs)
                else:
                    seq = seqs.pop()
                    dictionary[g] = seq

            # convert from defaultdict to dict
            guide_seqs[which] = dict(dictionary)

    if has_fixed_barcode:
        fixed_lengths = {len(s) for s in guide_seqs['fixed_guide_barcode'].values()}
        if len(fixed_lengths) != 1:
            raise ValueError(fixed_lengths)

        fixed_length = fixed_lengths.pop()
        guide_barcode_slice = idx[:fixed_length]

        # If a guide barcode is present, remove it from R2 before passing along
        # to simplify analysis of common sequences in pool.
        after_guide_barcode_slice = idx[fixed_length:]

        resolvers['fixed_guide_barcode'] = utilities.get_one_mismatch_resolver(guide_seqs['fixed_guide_barcode']).get
        expected_seqs['fixed_guide_barcode'] = set(guide_seqs['fixed_guide_barcode'].values())
    else:
        # If there weren't multiple fixed guide pools present, keep everything
        # to allow possibility of outcomes that don't include the intended NotI site.
        def fixed_guide_barcode_resolver(*args):
            return {'none'}

        resolvers['fixed_guide_barcode'] = fixed_guide_barcode_resolver
        expected_seqs['fixed_guide_barcode'] = set()

        guide_barcode_slice = slice(None)
        after_guide_barcode_slice = idx[:]

    resolvers['variable_guide'] = utilities.get_one_mismatch_resolver(guide_seqs['variable_guide']).get
    expected_seqs['variable_guide'] = set(guide_seqs['variable_guide'].values())

    return resolvers, expected_seqs, guide_barcode_slice, after_guide_barcode_slice
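# Toy illustration of the one-mismatch index resolution returned by
# get_resolvers (a simplified stand-in for utilities.get_one_mismatch_resolver,
# whose dict-of-sets return shape is assumed here): every sequence within
# Hamming distance 1 of an expected index maps to the sample(s) it could
# belong to.
from collections import defaultdict

def _toy_one_mismatch_resolver(indices):
    resolver = defaultdict(set)
    for name, seq in indices.items():
        for i in range(len(seq)):
            for b in 'ACGTN':
                resolver[seq[:i] + b + seq[i + 1:]].add(name)
    return resolver

_resolver = _toy_one_mismatch_resolver({'sample_1': 'ATTACTCG', 'sample_2': 'TCCGGAGA'})
assert _resolver.get('ATTACTCG', set()) == {'sample_1'}
assert _resolver.get('ATTACTCA', set()) == {'sample_1'}   # one mismatch, still resolvable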
def evaluate_candidate(al):
    results = {
        'location': f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
    }

    full_window_around = 5000

    full_around = region_fetcher(al.reference_name,
                                 al.reference_start - full_window_around,
                                 al.reference_end + full_window_around,
                                ).upper()

    if sam.get_strand(al) == '+':
        ps_seq = protospacer_seq
        ps_strand = 1
    else:
        ps_seq = utilities.reverse_complement(protospacer_seq)
        ps_strand = -1

    ps_start = full_around.index(ps_seq)

    protospacer_locations = [(protospacer_name, ps_seq, ps_start, ps_strand)]

    for other_protospacer_name, other_protospacer_seq in other_protospacers:
        # Initial G may not match genome.
        if other_protospacer_seq.startswith('G'):
            other_protospacer_seq = other_protospacer_seq[1:]

        if other_protospacer_seq in full_around:
            ps_seq = other_protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(other_protospacer_seq)
            if ps_seq not in full_around:
                results['failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                return results
            ps_strand = -1

        ps_start = full_around.index(ps_seq)
        protospacer_locations.append((other_protospacer_name, ps_seq, ps_start, ps_strand))

    if 'effector' in info:
        effector_type = info['effector']
    else:
        if donor_type == 'pegRNA':
            effector_type = 'SpCas9H840A'
        else:
            effector_type = 'SpCas9'

    effector = target_info.effectors[effector_type]

    for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
        PAM_pattern = effector.PAM_pattern

        if (ps_strand == 1 and effector.PAM_side == 3) or (ps_strand == -1 and effector.PAM_side == 5):
            PAM_offset = len(ps_seq)
            PAM_transform = utilities.identity
        else:
            PAM_offset = -len(PAM_pattern)
            PAM_transform = utilities.reverse_complement

        PAM_start = ps_start + PAM_offset
        PAM = PAM_transform(full_around[PAM_start:PAM_start + len(PAM_pattern)])

        pattern, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

        if 0 not in matches and not offtargets:
            # Note: this could incorrectly fail if there are multiple exact matches
            # for an other_protospacer in full_around.
            results['failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
            return results

    if primers[0] in full_around:
        leftmost_primer = primers[0]
        rightmost_primer = utilities.reverse_complement(primers[1])
        if rightmost_primer not in full_around:
            results['failed'] = f'primer {primers[1]} not present near protospacer'
            return results

        leftmost_primer_name = 'forward_primer'
        rightmost_primer_name = 'reverse_primer'
    else:
        leftmost_primer = primers[1]
        rightmost_primer = utilities.reverse_complement(primers[0])

        if leftmost_primer not in full_around:
            results['failed'] = f'primer {primers[1]} not present near protospacer'
            return results

        if rightmost_primer not in full_around:
            results['failed'] = f'primer {primers[0]} not present near protospacer'
            return results

        leftmost_primer_name = 'reverse_primer'
        rightmost_primer_name = 'forward_primer'

    leftmost_start = full_around.index(leftmost_primer)
    rightmost_start = full_around.index(rightmost_primer)

    if leftmost_start >= rightmost_start:
        results['failed'] = 'primers don\'t flank protospacer'
        return results

    # Now that primers have been located, redefine the target sequence to include
    # a fixed window on either side of the primers.
    final_window_around = 500

    offset = leftmost_start - final_window_around

    final_start = leftmost_start - final_window_around
    final_end = rightmost_start + len(rightmost_primer) + final_window_around

    target_seq = full_around[final_start:final_end]

    leftmost_location = FeatureLocation(leftmost_start - offset,
                                        leftmost_start - offset + len(leftmost_primer),
                                        strand=1,
                                       )
    rightmost_location = FeatureLocation(rightmost_start - offset,
                                         rightmost_start - offset + len(rightmost_primer),
                                         strand=-1,
                                        )

    colors = {
        'HA_1': '#c7b0e3',
        'HA_RT': '#c7b0e3',
        'HA_2': '#85dae9',
        'HA_PBS': '#85dae9',
        'forward_primer': '#75C6A9',
        'reverse_primer': '#9eafd2',
        'sgRNA': '#c6c9d1',
        'donor_specific': '#b1ff67',
        'PCR_adapter_1': '#F8D3A9',
        'PCR_adapter_2': '#D59687',
        'protospacer': '#ff9ccd',
        'scaffold': '#b7e6d7',
    }

    target_features = [
        SeqFeature(location=leftmost_location,
                   id=leftmost_primer_name,
                   type='misc_feature',
                   qualifiers={
                       'label': leftmost_primer_name,
                       'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                   },
                  ),
        SeqFeature(location=rightmost_location,
                   id=rightmost_primer_name,
                   type='misc_feature',
                   qualifiers={
                       'label': rightmost_primer_name,
                       'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                   },
                  ),
    ]

    if leftmost_primer_name == 'forward_primer':
        start = leftmost_start - offset
        start_location = FeatureLocation(start, start + 5, strand=1)
    else:
        start = rightmost_start - offset + len(rightmost_primer) - 5
        start_location = FeatureLocation(start, start + 5, strand=-1)

    target_features.extend([
        SeqFeature(location=start_location,
                   id='sequencing_start',
                   type='misc_feature',
                   qualifiers={'label': 'sequencing_start'},
                  ),
        SeqFeature(location=start_location,
                   id='anchor',
                   type='misc_feature',
                   qualifiers={'label': 'anchor'},
                  ),
    ])

    sgRNA_features = []
    for sgRNA_i, (ps_name, ps_seq, ps_start, ps_strand) in enumerate(protospacer_locations):
        sgRNA_feature = SeqFeature(location=FeatureLocation(ps_start - offset,
                                                            ps_start - offset + len(ps_seq),
                                                            strand=ps_strand,
                                                           ),
                                   id=f'sgRNA_{ps_name}',
                                   type=f'sgRNA_{effector.name}',
                                   qualifiers={
                                       'label': f'sgRNA_{ps_name}',
                                       'ApEinfo_fwdcolor': colors['sgRNA'],
                                   },
                                  )
        target_features.append(sgRNA_feature)
        sgRNA_features.append(sgRNA_feature)

    results['gb_Records'] = {}

    if has_donor:
        if not defer_HA_identification:
            # If multiple sgRNAs are given, the edited one must be listed first.
            sgRNA_feature = sgRNA_features[0]

            cut_after_offset = [offset for offset in effector.cut_after_offset if offset is not None][0]

            if sgRNA_feature.strand == 1:
                # sgRNA_feature.end is the first nt of the PAM
                cut_after = sgRNA_feature.location.end + cut_after_offset
            else:
                # sgRNA_feature.start - 1 is the first nt of the PAM
                cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

            if donor_type == 'pegRNA':
                HA_info = identify_pegRNA_homology_arms(donor_seq, target_seq, cut_after, protospacer_seq, colors)
            else:
                HA_info = identify_homology_arms(donor_seq, donor_type, target_seq, cut_after, colors)

            if 'failed' in HA_info:
                results['failed'] = HA_info['failed']
                return results

            donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
            donor_features = HA_info['donor_features']
            target_features.extend(HA_info['target_features'])
        else:
            donor_Seq = Seq(donor_seq)
            donor_features = []

        donor_Record = SeqRecord(donor_Seq,
                                 name=donor_name,
                                 features=donor_features,
                                 annotations={'molecule_type': 'DNA'},
                                )
        results['gb_Records']['donor'] = donor_Record

    target_Seq = Seq(target_seq)
    target_Record = SeqRecord(target_Seq,
                              name=target_name,
                              features=target_features,
                              annotations={'molecule_type': 'DNA'},
                             )
    results['gb_Records']['target'] = target_Record

    if has_nh_donor:
        nh_donor_Seq = Seq(nh_donor_seq)
        nh_donor_Record = SeqRecord(nh_donor_Seq,
                                    name=nh_donor_name,
                                    annotations={'molecule_type': 'DNA'},
                                   )
        results['gb_Records']['nh_donor'] = nh_donor_Record

    return results
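# Toy check of the PAM test in evaluate_candidate: Bio.SeqUtils.nt_search
# expands ambiguity codes in the pattern and returns the expanded pattern
# followed by all match positions, so `0 in matches` means the candidate PAM
# matches at its first position. (nt_search is deprecated in newer Biopython
# releases, but this matches the usage above.)
import Bio.SeqUtils

pattern, *matches = Bio.SeqUtils.nt_search('AGG', 'NGG')
assert 0 in matches

pattern, *matches = Bio.SeqUtils.nt_search('ATT', 'NGG')
assert 0 not in matches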
def identify_homology_arms(donor_seq, donor_type, target_seq, cut_after, colors, required_match_length=15):
    header = pysam.AlignmentHeader.from_references(['donor', 'target'], [len(donor_seq), len(target_seq)])
    mapper = sw.SeedAndExtender(donor_seq.encode(), 8, header, 'donor')

    target_bytes = target_seq.encode()

    alignments = {
        'before_cut': [],
        'after_cut': [],
    }

    seed_starts = {
        'before_cut': range(cut_after - required_match_length, 0, -1),
        'after_cut': range(cut_after, len(target_seq) - required_match_length),
    }

    for side in ['before_cut', 'after_cut']:
        for seed_start in seed_starts[side]:
            alignments[side] = mapper.seed_and_extend(target_bytes, seed_start, seed_start + required_match_length, 'target')
            if alignments[side]:
                break
        else:
            results = {'failed': f'cannot locate homology arm on {side}'}
            return results

    possible_HA_boundaries = []

    for before_al in alignments['before_cut']:
        for after_al in alignments['after_cut']:
            if sam.get_strand(before_al) == sam.get_strand(after_al):
                strand = sam.get_strand(before_al)
                if strand == '+':
                    if before_al.reference_end < after_al.reference_start:
                        possible_HA_boundaries.append((donor_seq, before_al.reference_start, after_al.reference_end))
                elif strand == '-':
                    if before_al.reference_start > after_al.reference_end:
                        flipped_seq = utilities.reverse_complement(donor_seq)
                        start = len(donor_seq) - 1 - (before_al.reference_end - 1)
                        end = len(donor_seq) - 1 - after_al.reference_start + 1
                        possible_HA_boundaries.append((flipped_seq, start, end))

    possible_HAs = []

    for possibly_flipped_donor_seq, HA_start, HA_end in possible_HA_boundaries:
        donor_window = possibly_flipped_donor_seq[HA_start:HA_end]

        donor_prefix = donor_window[:required_match_length]
        donor_suffix = donor_window[-required_match_length:]

        # Try to be resilient against multiple occurrences of HA substrings in
        # the target by prioritizing matches closest to the cut site.
        target_HA_start = target_seq.rfind(donor_prefix, 0, cut_after + required_match_length)
        target_HA_end = target_seq.find(donor_suffix, cut_after - required_match_length) + len(donor_suffix)

        if target_HA_start == -1 or target_HA_end == -1 or target_HA_start >= target_HA_end:
            results = {'failed': 'cannot locate homology arms in target'}
            return results

        relevant_target_seq = target_seq[target_HA_start:target_HA_end]

        total_HA_length = target_HA_end - target_HA_start

        mismatches_before_deletion = np.cumsum([t != d for t, d in zip(relevant_target_seq, donor_window)])

        flipped_target = relevant_target_seq[::-1]
        flipped_donor = donor_window[::-1]
        mismatches_after_deletion = np.cumsum([0] + [t != d for t, d in zip(flipped_target, flipped_donor)][:-1])[::-1]

        total_mismatches = mismatches_before_deletion + mismatches_after_deletion

        last_index_in_HA_1 = int(np.argmin(total_mismatches))
        min_mismatches = total_mismatches[last_index_in_HA_1]

        lengths = {}
        lengths['HA_1'] = last_index_in_HA_1 + 1
        lengths['HA_2'] = total_HA_length - lengths['HA_1']
        lengths['donor_specific'] = len(donor_seq) - total_HA_length

        info = {
            'min_mismatches': min_mismatches,
            'possibly_flipped_donor_seq': possibly_flipped_donor_seq,
            'donor_HA_start': HA_start,
            'donor_HA_end': HA_end,
            'target_HA_start': target_HA_start,
            'target_HA_end': target_HA_end,
            'lengths': lengths,
        }

        possible_HAs.append(info)

    def priority(info):
        return info['min_mismatches'], -min(info['lengths']['HA_1'], info['lengths']['HA_2'])

    if not possible_HAs:
        # Return immediately on failure; the boundary bookkeeping below
        # assumes a successful result.
        return {'failed': 'cannot locate homology arms'}

    results = min(possible_HAs, key=priority)

    lengths = results['lengths']

    donor_starts = {
        'HA_1': results['donor_HA_start'],
        'donor_specific': results['donor_HA_start'] + lengths['HA_1'],
        'HA_2': results['donor_HA_end'] - lengths['HA_2'],
    }
    donor_ends = {
        'HA_1': donor_starts['HA_1'] + lengths['HA_1'],
        'donor_specific': donor_starts['HA_2'],
        'HA_2': donor_starts['HA_2'] + lengths['HA_2'],
    }

    if donor_type == 'PCR':
        if donor_starts['HA_1'] != 0:
            donor_starts['PCR_adapter_1'] = 0
            donor_ends['PCR_adapter_1'] = donor_starts['HA_1']

        if donor_ends['HA_2'] != len(donor_seq):
            donor_starts['PCR_adapter_2'] = donor_ends['HA_2']
            donor_ends['PCR_adapter_2'] = len(donor_seq)

    target_starts = {
        'HA_1': results['target_HA_start'],
        'HA_2': results['target_HA_end'] - lengths['HA_2'],
    }
    target_ends = {key: target_starts[key] + lengths[key] for key in target_starts}

    donor_strand = 1
    target_strand = 1

    donor_features = [
        SeqFeature(location=FeatureLocation(donor_starts[feature_name],
                                            donor_ends[feature_name],
                                            strand=donor_strand,
                                           ),
                   id=feature_name,
                   type='misc_feature',
                   qualifiers={
                       'label': feature_name,
                       'ApEinfo_fwdcolor': colors[feature_name],
                   },
                  )
        for feature_name in donor_starts
    ]

    target_features = [
        SeqFeature(location=FeatureLocation(target_starts[feature_name],
                                            target_ends[feature_name],
                                            strand=target_strand,
                                           ),
                   id=feature_name,
                   type='misc_feature',
                   qualifiers={
                       'label': feature_name,
                       'ApEinfo_fwdcolor': colors[feature_name],
                   },
                  )
        for feature_name in target_starts
    ]

    HA_info = {
        'possibly_flipped_donor_seq': results['possibly_flipped_donor_seq'],
        'donor_features': donor_features,
        'target_features': target_features,
    }

    return HA_info
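# Toy version of the homology-arm split above (illustrative sequences only):
# `before` counts mismatches when target and donor are aligned from the left,
# `after` when aligned from the right; their sum at index i is the total
# mismatch count if HA_1 ends at i, and argmin picks the split.
import numpy as np

target = 'AAAATTTT'
donor = 'AAAAGGGTTTT'   # same arms with a 3-nt donor-specific insert
before = np.cumsum([t != d for t, d in zip(target, donor)])
after = np.cumsum([0] + [t != d for t, d in zip(target[::-1], donor[::-1])][:-1])[::-1]
last_index_in_HA_1 = int(np.argmin(before + after))
assert last_index_in_HA_1 == 3   # HA_1 = 'AAAA', HA_2 = 'TTTT'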
def evaluate_candidate(al):
    results = {
        'location': f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        'ref_name': al.reference_name,
        'cut_afters': [],
    }

    full_window_around = 5000

    full_around = region_fetcher(al.reference_name,
                                 al.reference_start - full_window_around,
                                 al.reference_end + full_window_around,
                                ).upper()

    if sam.get_strand(al) == '+':
        ps_seq = protospacer
        ps_strand = 1
    else:
        ps_seq = utilities.reverse_complement(protospacer)
        ps_strand = -1

    ps_start = full_around.index(ps_seq)

    protospacer_locations = [(ps_seq, ps_start, ps_strand)]

    for other_protospacer in other_protospacers:
        if other_protospacer in full_around:
            ps_seq = other_protospacer
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(other_protospacer)
            if ps_seq not in full_around:
                results['failed'] = f'protospacer {other_protospacer} not present near protospacer {protospacer}'
                return results
            ps_strand = -1

        ps_start = full_around.index(ps_seq)
        protospacer_locations.append((ps_seq, ps_start, ps_strand))

    for ps_seq, ps_start, ps_strand in protospacer_locations:
        if ps_strand == 1:
            PAM_offset = len(protospacer)
            PAM_transform = utilities.identity
            cut_after = al.reference_start - full_window_around + ps_start + PAM_offset - 3
        else:
            PAM_offset = -3
            PAM_transform = utilities.reverse_complement
            cut_after = al.reference_start - full_window_around + ps_start + 2

        results['cut_afters'].append(cut_after)

        PAM_start = ps_start + PAM_offset
        PAM = PAM_transform(full_around[PAM_start:PAM_start + 3])

        pattern, *matches = Bio.SeqUtils.nt_search(PAM, 'NGG')

        if 0 not in matches:
            # Note: this could incorrectly fail if there are multiple exact matches
            # for an other_protospacer in full_around.
            results['failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
            return results

    min_start = min(ps_start for ps_seq, ps_start, ps_strand in protospacer_locations)
    max_start = max(ps_start for ps_seq, ps_start, ps_strand in protospacer_locations)

    results['min_cut_after'] = min(results['cut_afters'])
    results['max_cut_after'] = max(results['cut_afters'])

    final_window_around = 500
    final_start = min_start - final_window_around
    final_end = max_start + final_window_around

    target_seq = full_around[final_start:final_end]
    results['target_seq'] = target_seq

    return results
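# Toy check of the + strand cut-site arithmetic in evaluate_candidate
# (illustrative numbers): SpCas9 cuts 3 nt upstream of the PAM, so with
# PAM_offset = len(protospacer) the cut falls after protospacer position 17
# in 0-based coordinates.
protospacer_length = 20
ps_start = 100   # hypothetical protospacer start in the reference
cut_after = ps_start + protospacer_length - 3
assert cut_after == 117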
def build_doubles_guide_specific_target(original_target,
                                        fixed_guide_library,
                                        variable_guide_library,
                                        fixed_guide,
                                        variable_guide,
                                        tasks_queue=None,
                                       ):
    warnings.simplefilter('ignore')

    new_name = f'{original_target.name}-{fixed_guide}-{variable_guide}'

    new_dir = original_target.dir.parent / new_name
    new_dir.mkdir(exist_ok=True)

    original_genbank_name = 'doubles_vector'

    gb_fn = original_target.dir / f'{original_genbank_name}.gb'
    gb = Bio.SeqIO.read(str(gb_fn), 'genbank')

    fixed_ps = original_target.features[original_target.name, 'fixed_protospacer']
    fixed_ps_seq = fixed_guide_library.guides_df.loc[fixed_guide, 'protospacer']
    gb.seq = gb.seq[:fixed_ps.start] + fixed_ps_seq + gb.seq[fixed_ps.end + 1:]

    variable_ps = original_target.features[original_target.name, 'variable_protospacer']
    variable_ps_seq = variable_guide_library.guides_df.loc[variable_guide, 'protospacer']
    gb.seq = gb.seq[:variable_ps.start] + variable_ps_seq + gb.seq[variable_ps.end + 1:]

    guide_bc_start = original_target.features[original_target.name, 'fixed_guide_barcode'].start
    guide_bc_end = original_target.features[original_target.name, 'fixed_guide_barcode'].end

    # guide barcode sequence in library df is on the reverse strand
    fixed_bc_seq_rc = fixed_guide_library.guides_df.loc[fixed_guide, 'guide_barcode']
    fixed_bc_seq = utilities.reverse_complement(fixed_bc_seq_rc)
    gb.seq = gb.seq[:guide_bc_start] + fixed_bc_seq + gb.seq[guide_bc_end + 1:]

    new_gb_fn = new_dir / f'{original_genbank_name}.gb'
    if new_gb_fn.exists():
        new_gb_fn.unlink()
    Bio.SeqIO.write(gb, str(new_gb_fn), 'genbank')

    fns_to_copy = [f'{source}.gb' for source in original_target.sources if source != original_genbank_name]
    fns_to_copy.append('manifest.yaml')

    relative_original_dir = Path(os.path.relpath(original_target.dir, new_dir))

    for fn in fns_to_copy:
        new_fn = new_dir / fn
        old_fn = relative_original_dir / fn

        if new_fn.exists() or new_fn.is_symlink():
            new_fn.unlink()

        new_fn.symlink_to(old_fn)

    new_ti = target_info.TargetInfo(original_target.base_dir, new_name)

    new_ti.make_references()
    new_ti.identify_degenerate_indels()

    if tasks_queue is not None:
        tasks_queue.put((fixed_guide, variable_guide))
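# Toy version of the slice-and-replace pattern used above to swap guide
# sequences into the vector: feature coordinates store inclusive ends, hence
# the `end + 1`. Illustrative sequence and coordinates.
seq = 'AAAACCCCGGGG'
start, end = 4, 7   # a feature spanning the four Cs (inclusive ends)
new_seq = seq[:start] + 'TTTT' + seq[end + 1:]
assert new_seq == 'AAAATTTTGGGG'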