Exemplo n.º 1
0
    def fix_sequence(self):
        """
        Resample codons until the coding sequence passes all site checks.

        Every pass over ``self.nt_seq`` collects the codons that overlap
        either (a) the first occurrence of an excluded subsequence (type IIs
        site, assembly restriction sites, primers, 5-nt homopolymers, and the
        reverse complement of each of those) or (b) a match longer than 10 nt,
        as reported by ``utils.lcs``, between a primer and either strand of
        the sequence. If any codon was collected, one of them is picked at
        random and resampled. If none was collected but ``GC(self.nt_seq)``
        falls outside [35, 65], a random codon is resampled instead. The loop
        stops once both checks pass.

        Returns
        -------
        None.
        """
        forbidden = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f, self.gsp_r,
            self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC', 'TTTTT'
        ]
        # also forbid the same motifs on the opposite strand
        forbidden.extend([utils.rev_comp(motif) for motif in forbidden])

        primers = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]

        gc_low, gc_high = 35, 65

        while True:
            flagged = set()

            # codons touched by the first hit of each forbidden motif
            for motif in forbidden:
                hit = self.nt_seq.find(motif)
                if hit != -1:
                    covered = np.arange(hit, hit + len(motif))
                    flagged.update(utils.get_codons(covered))

            # primers are single stranded, but the templates are not (after
            # one cycle, at least), so both strands are compared
            for primer in primers:
                for template in (self.nt_seq, utils.rev_comp(self.nt_seq)):
                    match_len, bad_nt_pos, _ = utils.lcs(template, primer)
                    if match_len > 10:
                        flagged.update(utils.get_codons(bad_nt_pos))

            if flagged:
                self.sample_new_codon(self.rng.choice(list(flagged)))
                continue

            if gc_low <= GC(self.nt_seq) <= gc_high:
                break

            # no bad sites but a bad GC content: change a random codon
            self.sample_new_codon(
                self.rng.choice(np.arange(len(self.codons))))

        return
Exemplo n.º 2
0
    def gen():
        """
        Endlessly yield batches of three synthetic 'extra' training examples.

        Each batch is a tuple ``(mirs, images, labels, stypes)`` of numpy
        arrays with one entry per example, in this order:

        1. a miRNA drawn from TRAIN_MIRS_KDS paired with a target from
           ``utils.get_target_no_match``; label 2.0
        2. a target built around the reverse complement of guide positions
           [1:7] of a drawn miRNA, paired with a miRNA sequence from
           ``utils.get_mir_no_match``; label 2.0
        3. a random miRNA with a target holding the reverse complement of its
           positions [1:8] followed by 'A', with 2-nt random flanks on each
           side; label -5 plus a per-nucleotide flank adjustment
        """
        # adjustment added to the label for every flanking nucleotide
        flank_vals = {
            'A': -0.34923908,
            'T': -0.24840472,
            'C': 0.12640774,
            'G': 0.47123606
        }

        while True:
            # example 1: training miRNA against a non-matching target
            first_mir = np.random.choice(TRAIN_MIRS_KDS)
            first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
            first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
            first_image = np.outer(utils.one_hot_encode(first_mirseq),
                                   utils.one_hot_encode(first_target))

            # example 2: target carrying a training miRNA's site, against a
            # non-matching miRNA sequence
            second_mir = np.random.choice(TRAIN_MIRS_KDS)
            left_pad = utils.generate_random_seq(3)
            site = utils.rev_comp(MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
            right_pad = utils.generate_random_seq(3)
            second_target = left_pad + site + right_pad
            second_mirseq = utils.get_mir_no_match(second_target,
                                                   options.MIRLEN)
            second_image = np.outer(utils.one_hot_encode(second_mirseq),
                                    utils.one_hot_encode(second_target))

            # example 3: random miRNA with its own 8mer-style site and flanks
            third_mirseq = utils.generate_random_seq(options.MIRLEN)
            up_flank = utils.generate_random_seq(2)
            down_flank = utils.generate_random_seq(2)
            third_target = (up_flank + utils.rev_comp(third_mirseq[1:8]) +
                            'A' + down_flank)
            third_image = np.outer(utils.one_hot_encode(third_mirseq),
                                   utils.one_hot_encode(third_target))

            flanks = up_flank + down_flank
            third_label = -5
            for base, weight in flank_vals.items():
                third_label += weight * flanks.count(base)

            mirs = np.array([
                first_mir.encode('utf-8'),
                second_mir.encode('utf-8'), b'random'
            ])
            images = np.stack([first_image, second_image, third_image])
            labels = np.array([[2.0], [2.0], [third_label]])
            stypes = np.array([b'extra', b'extra', b'extra'])

            yield mirs, images, labels, stypes
Exemplo n.º 3
0
    def gen():
        """
        Endlessly yield batches of three synthetic 'no site' examples.

        Each batch is a tuple ``(mirs, images, labels, stypes)`` of numpy
        arrays. The three examples are, in order: a random miRNA against a
        target from ``utils.get_target_no_match``; a miRNA drawn from
        TRAIN_MIRS_KDS against such a target; and a target built around the
        reverse complement of guide positions [1:7] of a drawn miRNA, against
        a miRNA sequence from ``utils.get_mir_no_match``. All labels are 0.0.
        """

        def interaction(mir_sequence, target_sequence):
            # outer product of the two one-hot encodings
            return np.outer(utils.one_hot_encode(mir_sequence),
                            utils.one_hot_encode(target_sequence))

        while True:
            # fully random pair
            rand_mirseq = utils.generate_random_seq(options.MIRLEN)
            rand_target = utils.get_target_no_match(rand_mirseq, SEQLEN)
            rand_image = interaction(rand_mirseq, rand_target)

            # training miRNA, non-matching target
            first_mir = np.random.choice(TRAIN_MIRS_KDS)
            first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
            first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
            first_image = interaction(first_mirseq, first_target)

            # target with a training miRNA's site, non-matching miRNA sequence
            second_mir = np.random.choice(TRAIN_MIRS_KDS)
            left_pad = utils.generate_random_seq(3)
            site = utils.rev_comp(MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
            right_pad = utils.generate_random_seq(3)
            second_target = left_pad + site + right_pad
            second_mirseq = utils.get_mir_no_match(second_target,
                                                   options.MIRLEN)
            second_image = interaction(second_mirseq, second_target)

            names = np.array([
                b'random',
                first_mir.encode('utf-8'),
                second_mir.encode('utf-8')
            ])
            images = np.stack([rand_image, first_image, second_image])
            labels = np.array([[0.0], [0.0], [0.0]])
            stypes = np.array([b'no site', b'no site', b'no site'])

            yield names, images, labels, stypes
Exemplo n.º 4
0
def generate_repeats(sizes, atomic):
    """Generate repeat motifs for the requested motif lengths.

    ``sizes`` is sorted in place; its smallest and largest values bound the
    motif lengths. Each motif over A/C/G/T of a requested length is expanded
    to the largest length with ``expand_repeat``. A motif is skipped when its
    expansion was already produced by an earlier motif, or, when ``atomic``
    is true, when its expansion equals that of a shorter motif whose length
    is a factor (per ``num_factors``) that was not itself requested.

    For each kept motif, every cycle from ``get_cycles`` of the motif
    (strand '+') and, unless the motif equals its own reverse complement, of
    its reverse complement (strand '-') is recorded once.

    Returns a list of tab-separated strings:
    ``cycle, motif, len(cycle), strand``.
    """
    bases = ['A', 'C', 'G', 'T']
    sizes.sort()
    min_size, max_size = sizes[0], sizes[-1]

    records = []
    seen_expansions = set()
    seen_cycles = set()

    # expansions of every motif whose length is below the maximum and was not
    # requested; only populated in atomic mode
    unrequested = dict()
    for length in range(1, max_size):
        if length in sizes:
            continue
        unrequested[length] = set()
        if atomic:
            for letters in product(bases, repeat=length):
                unrequested[length].add(
                    expand_repeat(''.join(letters), max_size))

    def record_cycles(source, motif, strand):
        # register every cycle of `source`, reporting it against `motif`
        for cycle in get_cycles(source):
            seen_expansions.add(expand_repeat(cycle, max_size))
            if cycle in seen_cycles:
                continue
            seen_cycles.add(cycle)
            if len(cycle) >= min_size:
                records.append('\t'.join(
                    [cycle, motif, str(len(cycle)), strand]))

    for size in sizes:
        factors = num_factors(size)
        for letters in product(bases, repeat=size):
            motif = ''.join(letters)
            motif_rc = rev_comp(motif)
            expansion = expand_repeat(motif, max_size)

            reducible = False
            if atomic:
                for factor in factors:
                    if factor not in sizes and expansion in unrequested[
                            factor]:
                        reducible = True

            if expansion in seen_expansions or (atomic and reducible):
                continue

            record_cycles(motif, motif, '+')
            if motif_rc != motif:
                record_cycles(motif_rc, motif, '-')

    return records
Exemplo n.º 5
0
def write_12mers(mirname, mirseq, outfile):
    """
    Write one TFRecord example per 12mer for a miRNA.

    The 8-nt site is the reverse complement of ``mirseq[1:8]`` followed by
    'A'. ``generate_12mers`` must return exactly 262144 sequences for that
    site, otherwise ValueError is raised. Every record carries the miRNA
    name, the one-hot miRNA and 12mer, a placeholder ``log_kd`` of -0.0, the
    site type from ``utils.get_centered_stype``, and a ``keep_prob`` of
    0.001 for 'no site' sequences and 1.0 for all others.
    """
    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    all_12mers = generate_12mers(site8)

    if len(all_12mers) != 262144:
        raise ValueError("all_12mers should be 262144 in length")

    with tf.python_io.TFRecordWriter(outfile) as tfwriter:
        for siteseq in all_12mers:
            aligned_stype = utils.get_centered_stype(site8, siteseq)
            keep_prob = 0.001 if aligned_stype == 'no site' else 1.0

            features = tf.train.Features(
                feature={
                    'mir':
                    tf_utils._bytes_feature(mirname.encode('utf-8')),
                    'mir_1hot':
                    tf_utils._float_feature(utils.one_hot_encode(mirseq)),
                    'seq_1hot':
                    tf_utils._float_feature(utils.one_hot_encode(siteseq)),
                    'log_kd':
                    tf_utils._float_feature([-0.0]),
                    'keep_prob':
                    tf_utils._float_feature([keep_prob]),
                    'stype':
                    tf_utils._bytes_feature(aligned_stype.encode('utf-8')),
                })

            serialized = tf.train.Example(
                features=features).SerializeToString()
            tfwriter.write(serialized)
Exemplo n.º 6
0
def calculate_threep_score(mirseq, utr, site_start, upstream_limit):
    """
    Calculate the three-prime pairing score

    The miRNA from position 9 onward is compared against the reverse
    complement of the UTR window that ends two nucleotides past
    ``site_start`` and reaches at most ``upstream_limit`` nucleotides
    upstream of it. Runs of consecutive identical positions along a diagonal
    of the comparison table are scored: each position is worth 0.5, or 1.0
    when its 0-based index in the miRNA 3' region is 4-7. The first position
    of a run is additionally penalised by 0.5 for every unit of diagonal
    offset beyond 2. Only runs of at least two positions are candidates for
    the final score, since a run's first position is stored but not listed.

    Parameters
    ----------
    mirseq: string, miRNA sequence
    utr: string, utr sequence
    site_start: int, start of 12mer site
    upstream_limit: int, how far upstream to look for 3p pairing

    Output
    ------
    float: 3' pairing score (0 when site_start <= 0 or no run qualifies)
    """
    if site_start <= 0:
        return 0

    mir_tail = mirseq[8:]
    window_start = max(0, site_start - upstream_limit)
    target_rc = utils.rev_comp(utr[window_start:site_start + 2])

    # table[i + 1, j + 1] holds the score of the run ending at (i, j);
    # NaN means no run ends there
    table = np.full((len(target_rc) + 1, len(mir_tail) + 1), np.nan)
    candidates = [0]

    for i, target_nt in enumerate(target_rc):
        for j, mir_nt in enumerate(mir_tail):
            if target_nt != mir_nt:
                continue  # cell keeps its NaN

            pair_value = 1.0 if 4 <= j <= 7 else 0.5
            previous = table[i, j]

            if np.isnan(previous):
                # run starts here: penalise large offsets, do not list yet
                start_penalty = max(0, (abs(i - j) - 2) * 0.5)
                table[i + 1, j + 1] = pair_value - start_penalty
            else:
                extended = pair_value + previous
                table[i + 1, j + 1] = extended
                candidates.append(extended)

    return np.nanmax(candidates)
Exemplo n.º 7
0
def generate_repeats(min_size, max_size, atomic):
    """Generate repeat motifs for lengths up to ``max_size``.

    Motifs over A/C/G/T are enumerated from length ``min_size`` when
    ``atomic`` is true, otherwise from length 1, up to ``max_size``. A motif
    is skipped when its ``expand_repeat`` expansion to ``max_size`` was
    already produced by an earlier motif. For each kept motif, every cycle
    from ``get_cycles`` of the motif (strand '+') and, unless the motif is
    its own reverse complement, of its reverse complement (strand '-') is
    recorded once; only cycles of at least ``min_size`` are reported.

    Returns a list of tab-separated strings:
    ``cycle, motif, len(cycle), strand``.
    """
    bases = ['A', 'C', 'G', 'T']
    first_size = min_size if atomic else 1

    records = []
    seen_expansions = set()
    seen_cycles = set()

    def record_cycles(source, motif, strand):
        # register every cycle of `source`, reporting it against `motif`
        for cycle in get_cycles(source):
            seen_expansions.add(expand_repeat(cycle, max_size))
            if cycle in seen_cycles:
                continue
            seen_cycles.add(cycle)
            if len(cycle) >= min_size:
                records.append('\t'.join(
                    [cycle, motif, str(len(cycle)), strand]))

    for size in range(first_size, max_size + 1):
        for letters in product(bases, repeat=size):
            motif = ''.join(letters)
            motif_rc = rev_comp(motif)
            if expand_repeat(motif, max_size) in seen_expansions:
                continue

            record_cycles(motif, motif, '+')
            if motif_rc != motif:
                record_cycles(motif_rc, motif, '-')

    return records
Exemplo n.º 8
0
    parser.add_option("--overlap_dist", dest="OVERLAP_DIST", help="minimum distance between neighboring sites", type=int)
    parser.add_option("--upstream_limit", dest="UPSTREAM_LIMIT", help="how far upstream to look for 3p pairing", type=int)
    parser.add_option("--only_canon", dest="ONLY_CANON", help="only use canonical sites", default=False, action='store_true')

    (options, args) = parser.parse_args()

    TRANSCRIPTS = pd.read_csv(options.TRANSCRIPTS, sep='\t', index_col='transcript')
    mirseqs = pd.read_csv(options.MIR_SEQS, sep='\t', index_col='mir')
    if '_pass' in options.MIR:
        MIRSEQ = mirseqs.loc[options.MIR.replace('_pass', '')]['pass_seq']
        FAMILY = mirseqs.loc[options.MIR.replace('_pass', '')]['pass_family']
    else:
        MIRSEQ = mirseqs.loc[options.MIR]['guide_seq']
        FAMILY = mirseqs.loc[options.MIR]['guide_family']

    SITE8 = utils.rev_comp(MIRSEQ[1:8]) + 'A'
    print(options.MIR, SITE8)

    # if KD file provided, find sites based on KD file
    if options.KDS is not None:
        KDS = pd.read_csv(options.KDS, sep='\t')
        if options.ONLY_CANON:
            KDS = KDS[KDS['aligned_stype'] != 'no site']
        KDS = KDS[KDS['best_stype'] == KDS['aligned_stype']]

        temp = KDS[KDS['mir'] == FAMILY]
        if len(temp) == 0:
            raise ValueError('{} not in kd files'.format(FAMILY))
        mir_kd_dict = {x: y for (x, y) in zip(temp['12mer'], temp['log_kd']) if (y < options.KD_CUTOFF)}

        # find all the sites and KDs
Exemplo n.º 9
0
def calculate_12mer_kds(mirname, mirseq, mirlen, load_model, outfile):
    """
    For a given miRNA sequence, use a saved ConvNet to generate predictions

    The miRNA must be at least 12 nt long and consist only of A, C, G and T.
    ``generate_12mers`` must return exactly 438784 sequences for the miRNA's
    8-nt site (reverse complement of ``mirseq[1:8]`` plus 'A'). They are
    scored in 64 batches of 6856 by the model restored from ``load_model``,
    and one tab-separated row per sequence is written to ``outfile`` with
    the negated model output as ``log_kd``.

    Raises ValueError when the miRNA or the number of generated sequences
    does not meet the requirements above.
    """
    if len(mirseq) < 12:
        raise ValueError("miRNA must be at least 12 nt long")

    if set(mirseq) - set('ACGT'):
        raise ValueError("miRNA must only contain A, C, T, G")

    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    mirseq_one_hot = utils.one_hot_encode(mirseq[:mirlen])

    # generate all candidate sequences; exactly 438,784 are expected
    kmers = generate_12mers(site8)

    if len(kmers) != 438784:
        raise ValueError("kmers should be 438784 in length")

    # load trained model
    tf.reset_default_graph()
    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        saver = tf.train.import_meta_graph(load_model + '.meta')
        print('Restoring from {}'.format(load_model))
        saver.restore(sess, load_model)

        graph = tf.get_default_graph()
        _dropout_rate = graph.get_tensor_by_name('dropout_rate:0')
        _phase_train = graph.get_tensor_by_name('phase_train:0')
        _combined_x = graph.get_tensor_by_name('combined_x:0')
        _prediction = graph.get_tensor_by_name('final_layer/pred_ka:0')

        # 64 * 6856 == 438784, so the batches cover every sequence exactly
        num_batches = 64
        batch_size = 6856

        with open(outfile, 'w') as outfile_writer:
            outfile_writer.write('12mer\tlog_kd\tmir\tmirseq\taligned_stype\tbest_stype\n')
            for batch in range(num_batches):
                lower = batch * batch_size
                upper = (batch + 1) * batch_size
                print("Processing {}/{}...".format(upper, 438784))
                seqs = kmers[lower:upper]

                input_data = np.stack([
                    np.outer(mirseq_one_hot, utils.one_hot_encode(seq))
                    for seq in seqs
                ])

                feed_dict = {
                    _dropout_rate: 0.0,
                    _phase_train: False,
                    _combined_x: input_data
                }

                # the model output is negated before being written as log_kd
                pred_kds = -1 * sess.run(_prediction, feed_dict=feed_dict).flatten()
                aligned_stypes = [utils.get_centered_stype(site8, seq) for seq in seqs]
                best_stypes = [utils.get_best_stype(site8, seq) for seq in seqs]

                rows = zip(seqs, pred_kds, aligned_stypes, best_stypes)
                for seq, kd, aligned_stype, best_stype in rows:
                    outfile_writer.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(seq, kd, mirname, mirseq, aligned_stype, best_stype))
Exemplo n.º 10
0
                      action='store_true')

    (options, args) = parser.parse_args()

    ### READ miRNA DATA and filter for ones to keep ###
    MIRNAS = pd.read_csv(options.MIR_SEQS, sep='\t')
    MIRNAS = MIRNAS[MIRNAS['use_tpms']]
    ALL_GUIDES = sorted(list(MIRNAS['mir'].values))

    MIR_DICT = {}
    for row in MIRNAS.iterrows():
        guide_seq = row[1]['guide_seq']
        pass_seq = row[1]['pass_seq']
        MIR_DICT[row[1]['mir']] = {
            'mirseq': guide_seq,
            'site8': utils.rev_comp(guide_seq[1:8]) + 'A',
            'one_hot': utils.one_hot_encode(guide_seq[:options.MIRLEN])
        }

        MIR_DICT[row[1]['mir'] + '*'] = {
            'mirseq': pass_seq,
            'site8': utils.rev_comp(pass_seq[1:8]) + 'A',
            'one_hot': utils.one_hot_encode(pass_seq[:options.MIRLEN])
        }

    ### READ EXPRESSION DATA ###
    TPM = pd.read_csv(options.TPM_FILE, sep='\t', index_col=0).sort_index()
    for mir in ALL_GUIDES:
        if mir not in TPM.columns:
            raise ValueError(
                '{} given in mirseqs file but not in TPM file.'.format(mir))
Exemplo n.º 11
0
                      dest="OUTFILE",
                      help="file for writing outputs, use MIR as placeholder")
    parser.add_option("--passenger",
                      dest="PASSENGER",
                      help="also calculate for passenger strand",
                      default=False,
                      action='store_true')

    (options, args) = parser.parse_args()

    mirseqs = pd.read_csv(options.MIRSEQS, sep='\t', index_col='mir')

    for row in mirseqs.iterrows():
        # get names and 8mer sites for the guide and passenger strands, if applicable
        mirnames = [row[0]]
        site8s = [utils.rev_comp(row[1]['guide_seq'][1:8]) + 'A']
        if options.PASSENGER:
            mirnames += [row[0] + '_pass']
            site8s += [utils.rev_comp(row[1]['pass_seq'][1:8]) + 'A']

        # read in RNAplfold results
        for mirname, site8 in zip(mirnames, site8s):
            SA_bg = []
            for ix in range(options.NBINS):
                seqs = pd.read_csv(os.path.join(
                    options.INFILE_SEQS.replace('MIR', mirname).replace(
                        'IX', str(ix))),
                                   sep='\t')
                temp = pd.read_csv(os.path.join(
                    options.INFILE_BG.replace('MIR',
                                              mirname).replace('IX', str(ix))),
Exemplo n.º 12
0
    def split_kmers(self,
                    min_overlap=16,
                    max_overlap=35,
                    min_tm=59,
                    max_tm=64,
                    min_gc=40,
                    max_gc=60,
                    min_ddg=-3,
                    min_dimer=-9):
        """
        Split the coding sequence into overlapping oligos.

        The construct asm_f + asmf_re + nt_seq + stop + asmr_re +
        rev_comp(asm_r) is cut into fragments that overlap their neighbours,
        then each fragment is wrapped with gsp_f, the type IIs sites, random
        padding and rev_comp(gsp_r).

        Parameters
        ----------
        (int) min_overlap -- minimum bp in overlap sequences. Default 16
        (int) max_overlap -- maximum bp in overlap sequences. Default 35
        (float) min_tm -- minimum melting temp in overlap sequences. Default 59
        (float) max_tm -- maximum melting temp in overlap sequences. Default 64
        (float) min_gc -- minimum %GC content in overlap sequences. Default 40
        (float) max_gc -- maximum %GC content in overlap sequences. Default 60
        (float) min_ddg=-3 -- minimum hairpin/secondary structure deltaG from ViennaRNA RNAfold (DNA parameters)
                              If None, do not check this
        (float) min_dimer=-9 -- minimum self-association deltaG from ViennaRNA RNAduplex (DNA parameters)
                              If None, do not check this

        Returns
        -------
        None. Oligos stored in oligos field of Gene object. The overlaps,
        overlap_gc and overlap_tm fields are filled alongside.

        Raises
        ------
        Exception -- when no fragment length within the allowed range yields
                     valid overlaps or a valid assembly.
        AssertionError -- when a final sanity check on the assembled gene or
                          on a complete oligo fails.
        """

        # calculate expected number of oligos necessary so that length can be equal-ish
        tot_assembled = len(self.aa_seq)*3+len(self.asm_f)+len(self.asmf_re) + \
                        len(self.asmr_re)+len(self.asm_r) # total sequence that needs to be split into fragments
        available_nt = self.oligo_size - len(self.gsp_f) - len(
            self.gsp_r) - 2 * len(
                self.typeIIs)  # non-constant oligo region size
        expected_overlap_nt = (
            np.ceil(tot_assembled / available_nt) - 1
        ) * max_overlap  # expected number of bp doubly represented due to overlaps
        expected_oligos = np.ceil(
            (tot_assembled + expected_overlap_nt) /
            available_nt)  # expected number of oligos including overlap region
        target_length = int(
            (tot_assembled + expected_overlap_nt) // expected_oligos
        )  # length to target per fragment to get roughly equal lengths

        # The basic gist of how this works is you start with the full sequence, which you have already determined the nucleotides
        # for (this is the main difference from Bill's code). You take as much of that sequence as you can to fill the current
        # oligo, cut it back until you get a GC on the 3' end, then work backwards until you get a good overlap. If you can't get
        # a good overlap (usually due to GC content), start over with a different max length. Max lengths go from n, n-1, n+1, n-2, n+2, ...
        # If the max length becomes greater than the allowed length (or shorter than max_overlap+2, but usually the former happens first),
        # can't assemble the sequence. Try with a different random seed to produce a different sequence.
        # Note: the former constraint can be relaxed to try some shorter lengths too, but seems to work OK for now

        # the full construct to be fragmented; nothing below modifies its parts
        full_construct = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
            self.asm_r)

        all_oligos = False  # keep track of progress
        curr_max = target_length  # max number of bp allowed in single oligo
        curr_oligo = full_construct
        next_oligo = ""
        while not all_oligos:

            if len(curr_oligo) > curr_max:
                # just take what will fit on the current oligo, save the
                # rest for later
                next_oligo = curr_oligo[curr_max:]
                curr_oligo = curr_oligo[:curr_max]

            if len(next_oligo) == 0:  # don't need to find an overlap bc done

                self.oligos.append(curr_oligo)

                # check the assembly: stitch consecutive oligos together at
                # the last 10 nt of the previous oligo
                gene = ""
                badoverlap = False
                for i, piece in enumerate(self.oligos):
                    if i == 0:
                        gene += piece
                    else:
                        common = piece.find(self.oligos[i - 1][-10:])
                        if common < 0:
                            badoverlap = True
                            break
                        gene += piece[common + 10:]

                # check for additional overlap sites between different oligos
                for i, olap in enumerate(self.overlaps):
                    for j, oligo in enumerate(self.oligos):
                        # overlap i corresponds to oligo i and oligo i+1
                        if j == i:  # overlap at the end
                            true_occur = oligo.find(olap)
                            trimmed_seq = oligo[:true_occur]
                        elif j == i + 1:  #overlap at the beginning
                            true_occur = oligo.find(olap)
                            trimmed_seq = oligo[true_occur + len(olap):]
                        else:
                            trimmed_seq = oligo

                        match_len_fwd, _, _ = utils.lcs(trimmed_seq, olap)
                        match_len_rev, _, _ = utils.lcs(
                            trimmed_seq, utils.rev_comp(olap))
                        if match_len_fwd > 10 or match_len_rev > 10:
                            print("Bad overlap due to possible mispriming.")
                            print("Oligo %d overlap %d match %d bp" %
                                  (j, i, max(match_len_fwd, match_len_rev)))
                            badoverlap = True
                            # only leaves the inner loop; the flag carries
                            # the result
                            break

                if badoverlap or gene != full_construct:

                    # step the max length outward around the target:
                    # n, n-1, n+1, n-2, n+2, ...
                    if curr_max < target_length:
                        curr_max = 2 * target_length - curr_max
                    else:
                        curr_max = 2 * target_length - curr_max - 1

                    if curr_max > available_nt or curr_max < max_overlap + 2:
                        raise Exception(
                            "Couldn't find oligos with given framework, failed at assembly"
                        )

                    curr_oligo = full_construct
                    self.oligos = []
                    self.overlaps = []
                    self.overlap_gc = []
                    self.overlap_tm = []
                else:
                    assert gene.count(self.asm_f) == 1, "Incorrect number AsmF"
                    assert gene.count(
                        self.asmf_re) == 1, "Incorrect number AsmF RE"

                    assert gene.count(utils.rev_comp(
                        self.asm_r)) == 1, "Incorrect number AsmR"
                    assert gene.count(
                        self.asmr_re) == 1, "Incorrect number AsmR RE"

                    all_oligos = True

                # Either we are done, or we restart from the full construct.
                # On restart we must go back to the top of the loop so the
                # construct is cut to the new curr_max; falling through would
                # run the overlap search on the uncut sequence and could
                # store an oligo longer than available_nt.
                continue

            # trim back to g or c
            while curr_oligo[-1] not in 'GC':
                next_oligo = curr_oligo[-1] + next_oligo
                curr_oligo = curr_oligo[:-1]

            # find the overlap
            overlap_pos = len(curr_oligo) - min_overlap + 1
            curr_tm = 0
            curr_gc = 0
            curr_ss_ddg = 10
            curr_dimer = 10
            while (curr_oligo[overlap_pos] not in 'GC'
                   or not min_tm <= curr_tm <= max_tm
                   or not min_gc <= curr_gc <= max_gc
                   or (min_ddg is not None and not curr_ss_ddg > min_ddg)
                   or (min_dimer is not None and not curr_dimer > min_dimer)):

                overlap_pos -= 1  # initial case accounted for in math above

                if overlap_pos < len(
                        curr_oligo
                ) - max_overlap or curr_tm > max_tm:  #Tm is never going to decrease
                    break
                    # no good overlap... try different max length and
                    # restart the loop

                # don't bother with expensive calcs if the loop is just going to fail anyway
                if curr_oligo[overlap_pos] not in 'GC':
                    continue

                temp_overlap = Seq(curr_oligo[overlap_pos:])
                # tm calculation with salt correction for KOD reaction
                curr_tm = mt.Tm_NN(temp_overlap, Mg=1.5, dNTPs=0.8)
                curr_gc = GC(temp_overlap)

                # ViennaRNA external software
                if min_ddg is not None:
                    curr_ss_ddg = utils.pred_ss_ddg(str(
                        temp_overlap))  # this calculation slows it down a LOT
                if min_dimer is not None:
                    curr_dimer = utils.pred_dimer(str(temp_overlap),
                                                  str(temp_overlap))

            # same predicate as the loop condition: if it still holds, the
            # search above gave up rather than finding a valid overlap
            if (curr_oligo[overlap_pos] not in 'GC'
                    or not min_tm <= curr_tm <= max_tm
                    or not min_gc <= curr_gc <= max_gc
                    or (min_ddg is not None and not curr_ss_ddg > min_ddg)
                    or (min_dimer is not None and not curr_dimer > min_dimer)):
                # this means the above loop broke, so try diff max length
                # and restart the loop
                if curr_max < target_length:
                    curr_max = 2 * target_length - curr_max
                else:
                    curr_max = 2 * target_length - curr_max - 1

                if curr_max > available_nt or curr_max < max_overlap + 2:
                    raise Exception(
                        "Couldn't find oligos with given framework, failed at melting temp"
                    )

                curr_oligo = full_construct
                self.oligos = []
                self.overlaps = []
                self.overlap_gc = []
                self.overlap_tm = []

                continue

            # otherwise process the overlap!
            self.oligos.append(curr_oligo)
            self.overlaps.append(curr_oligo[overlap_pos:])
            self.overlap_gc.append(curr_gc)
            self.overlap_tm.append(curr_tm)

            curr_oligo = self.overlaps[-1] + next_oligo  # add in the overlap
            next_oligo = ""

        # add the gsps, type IIs, any buffer residues to everything

        # sites and primers that must not appear in the padding; these do not
        # depend on the individual oligo, so build them once
        to_exclude = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f,
            self.gsp_r, self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC',
            'TTTTT'
        ]
        to_exclude.extend(
            [utils.rev_comp(subseq) for subseq in to_exclude])

        primers = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]

        full_oligos = []
        for i, oligo in enumerate(self.oligos):

            # add buffer between 3' GSP site, TypeIIs site to bring oligo
            # up to full size
            padding_size = available_nt - len(oligo)
            padding = self.rng.choice(['A', 'C', 'G', 'T'], padding_size)
            good_padding = False

            # only the padding positions may be changed; the boundary
            # context on either side is fixed
            left_boundary = oligo[-4:] + utils.rev_comp(self.typeIIs)
            fixable_pos = set(
                range(len(left_boundary),
                      len(left_boundary) + padding_size))

            # make sure you're not getting any restriction/priming sites in the buffer bp that will
            # mess things up
            while not good_padding:
                subseq = left_boundary + "".join(padding) + utils.rev_comp(
                    self.gsp_r)[:5]

                bad_pos = set()
                for site in to_exclude:
                    # include boundaries
                    substr = subseq.find(site)
                    if substr >= 0:
                        bad_pos.update(
                            fixable_pos.intersection(
                                range(substr, substr + len(site))))

                for primer in primers:
                    match_len, bad_nt_pos, _ = utils.lcs(subseq, primer)
                    if match_len > 10:
                        bad_pos.update(fixable_pos.intersection(bad_nt_pos))

                    # primers are single stranded, but the templates are not (after one cycle, at least)
                    match_len, bad_nt_pos, _ = utils.lcs(
                        utils.rev_comp(subseq), primer)
                    if match_len > 10:
                        bad_pos.update(fixable_pos.intersection(bad_nt_pos))

                if len(bad_pos) == 0:
                    good_padding = True
                else:
                    to_fix = self.rng.choice(list(bad_pos))
                    padding[to_fix - len(left_boundary)] = self.rng.choice(
                        ['A', 'C', 'G', 'T'])

            padding = "".join(padding)

            complete_oligo = self.gsp_f + self.typeIIs + oligo + utils.rev_comp(self.typeIIs) + \
                             padding + utils.rev_comp(self.gsp_r)

            # already checked the full assembly, so now make sure nothing was accidentally introduced
            # at boundaries
            assert complete_oligo.count(
                self.gsp_f) == 1, "GSP F not found in %d -th oligo" % i
            assert complete_oligo.count(utils.rev_comp(
                self.gsp_r)) == 1, "GSP_R not found in %d -th oligo" % i
            assert complete_oligo.count(utils.rev_comp(
                self.gsp_f)) == 0, "GSP_F RC found in %d -th oligo" % i
            assert complete_oligo.count(
                self.gsp_r) == 0, "GSP_R RC found in %d -th oligo" % i

            if self.typeIIs == utils.rev_comp(self.typeIIs):
                assert complete_oligo.count(
                    self.typeIIs
                ) == 2, "Extra Type IIS sites in %d -th oligo" % i
            else:
                assert complete_oligo.count(
                    self.typeIIs
                ) == 1, "Extra Type IIS sites in %d -th oligo" % i
                assert complete_oligo.count(
                    utils.rev_comp(self.typeIIs)
                ) == 1, "Extra Type IIS sites in %d -th oligo" % i

            assert complete_oligo.count(utils.rev_comp(
                self.asm_f)) == 0, "AsmF RC in in %d -th oligo" % i
            assert complete_oligo.count(
                self.asm_r) == 0, "AsmR in %d -th oligo" % i
            if self.asmf_re != utils.rev_comp(self.asmf_re):
                assert complete_oligo.count(utils.rev_comp(
                    self.asmf_re)) == 0, "AsmF RE RC in %d -th oligo" % i
            if self.asmr_re != utils.rev_comp(self.asmr_re):
                assert complete_oligo.count(utils.rev_comp(
                    self.asmr_re)) == 0, "AsmR RE RC in %d -th oligo" % i

            full_oligos.append(complete_oligo)

        self.oligos = full_oligos

        return