示例#1
0
def generate_chromosome(seq,
                        markov_list,
                        coord_adjust,
                        rpt_gen,
                        mask=False,
                        max_interval=None,
                        num_repeats=None,
                        max_length=None,
                        limiting_chr=None,
                        sim_type=0):
    """
    Generate a synthetic sequence with real repeats.

    Arguments:
    * seq: The template sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module);
    *              passed unchanged to markov_gen.generate_sequence.
    * coord_adjust: Size of the prefix that has been cut off the template sequence (requiring
    *               that the .fa.out coordinates be adjusted).  Also used as the coordinate at
    *               which processing starts.
    * rpt_gen: An iterable yielding 9-tuples
    *          (chr, start, finish, strand, family, rpt_class, rpt_id, ancestor_seq, modern_seq).
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum inter-repeat length.  None or -1 means len(seq).
    * num_repeats: If set, stop after this many repeats have been emitted (the tail is then
    *              additionally capped at 1000 bases).
    * max_length: If set, caps the template length used to size the tail.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * sim_type: Selects the source of each repeat's bases:
    *           0 -- the template slice seq[start:finish];
    *           1 -- ancestor_seq with '-' characters removed;
    *           2 -- per column where the ancestor is not '-': the modern base, or the
    *                ancestor base when the modern one is '-';
    *           anything else (3) -- per column where the modern is not '-': the ancestor
    *                base, or the modern base when the ancestor one is '-'.
    *           (Modes 2 and 3 pair the two sequences with zip, i.e. they presumably expect
    *           ancestor_seq and modern_seq to be aligned -- confirm against the caller.)

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the
    matching .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    current_coord = coord_adjust

    # None (the default) and -1 both mean "cap at the template length"; leaving None in
    # place would break the min()/max() arithmetic below.
    if max_interval is None or max_interval == -1:
        max_interval = len(seq)

    s = []  # Hold the sequence (in chunks)
    fa_out = []  # Hold the new .fa.out file contents (one record per repeat)

    rpt_count = 0
    length = min(len(seq), max_length) if max_length else len(seq)
    valid_bases = {'A', 'C', 'T', 'G'}

    for (chrom, start, finish, strand, family, rpt_class, rpt_id,
         ancestor_seq, modern_seq) in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:
            continue
        if start < current_coord:
            # Starts before the end of the last emitted repeat (or inside the cut prefix).
            continue

        rpt_count += 1

        # Inter-repeat filler, truncated to max_interval.  Whatever is truncated shifts
        # every later coordinate left, so it is accumulated into coord_adjust.
        gap = start - current_coord
        inter_seq_len = min(gap, max_interval)
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        assert len(inter_seq) == inter_seq_len
        s.append(inter_seq)
        coord_adjust += max(0, gap - max_interval)

        # Exactly one branch must run: the original used a separate `if` for mode 0, so
        # its result was always overwritten by the final `else` branch.
        if sim_type == 0:
            rpt_seq = seq[start:finish]
        elif sim_type == 1:
            rpt_seq = ancestor_seq.replace("-", "")
        elif sim_type == 2:
            rpt_seq = "".join([
                m if m != '-' else a
                for a, m in zip(ancestor_seq, modern_seq) if a != '-'
            ])
        else:  # sim_type == 3
            rpt_seq = "".join([
                a if a != '-' else m
                for a, m in zip(ancestor_seq, modern_seq) if m != '-'
            ])

        # Anything that is not A/C/G/T (case-insensitive) becomes 'N'.
        rpt_seq = "".join([
            base if base.upper() in valid_bases else 'N'
            for base in rpt_seq
        ])

        s.append(rpt_seq.lower() if mask else rpt_seq.upper())
        new_finish = start + len(rpt_seq)

        fa_out.append([
            chrom, start + 1 - coord_adjust, new_finish - coord_adjust,
            strand, family, rpt_class, rpt_id
        ])
        # The emitted repeat may differ in length from the template interval; fold the
        # difference into the running offset for the repeats that follow.
        coord_adjust += finish - new_finish

        if num_repeats and rpt_count == num_repeats:
            break

        current_coord = finish

    if num_repeats:
        max_interval = min(1000, max_interval)

    tail_length = min(max_interval, length - current_coord)
    if tail_length > 0:
        s.append(markov_gen.generate_sequence(markov_list, tail_length))

    sim_seq = "".join(s)
    sim_seq_len = len(sim_seq)
    fa_out_str = fa_out_header
    for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out:
        fa_out_str += fa_out_template.format(chr=chrom,
                                             start=start,
                                             finish=finish,
                                             left=sim_seq_len - finish,
                                             strand=strand,
                                             family=family,
                                             rpt_class=rpt_class,
                                             rpt_id=rpt_id)

    return sim_seq, fa_out_str
def generate_chromosome(seq, markov_list, chr_start, chr_finish, rpt_gen, mask = False, max_interval = -1, min_interval = 0,num_repeats = None, max_length = None, limiting_chr = None, rep_base_hash = None):
    """
    Generate a synthetic sequence with real repeats.

    Arguments:
    * seq: The template sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module);
    *              passed unchanged to markov_gen.generate_sequence.
    * chr_start/chr_finish: Define the coordinates of our actual template sequence.  Repeats that
    *               start before chr_start or finish after chr_finish are ignored, which allows
    *               us to cut off a prefix and/or suffix.
    * rpt_gen: An iterable yielding 7-tuples (chr, start, finish, strand, family, rpt_class, rpt_id).
    *          Repeats are assumed to be sorted by start.
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum allowed length of a sequence between repeats.  If two repeats are
    *             further apart than this, cut the length.  -1 means len(seq).
    * min_interval: Minimum allowed length of a sequence between repeats.  If two repeats are closer
    *             than this, extend the length.
    * num_repeats: If set, stop once this many repeats have been emitted.
    * max_length: Accepted but not used by this implementation.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * rep_base_hash: If set (and non-empty), a mapping from family to the sequence to insert
    *             in place of the template slice seq[start:finish].

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    last_end = chr_start
    if max_interval == -1:
        max_interval = len(seq)

    chunks = []           # Pieces of the simulated sequence (joined once at the end)
    sim_len = 0           # Total length of everything in chunks so far
    fa_out = []           # Hold the new .fa.out file contents (one record per repeat)

    rpt_count = 0         # Count of repeats (so we can quit when we reach num_repeats, if applicable)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:    # Skip if we are on the wrong chromosome
            continue

        if start >= chr_finish:     # Quit if we have gone past the allowed range (repeats are assumed to be sorted by start)
            break

        if start < chr_start or finish > chr_finish:   # Skip if we are outside the allowed range
            continue

        if start < last_end:      # Skip if this repeat overlapped the last one
            continue

        rpt_count += 1

        # Add the next inter-TE sequence, clamped to [min_interval, max_interval]
        inter_seq_len = max(min_interval, min(start - last_end, max_interval))
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        chunks.append(inter_seq)
        sim_len += len(inter_seq)

        # Add the next repeat sequence
        if rep_base_hash:
            rpt_seq = rep_base_hash[family]
        else:
            rpt_seq = seq[start:finish]

        # Coordinates are positions in the simulated sequence, 1-based inclusive (biologist notation)
        fa_out.append([chrom, sim_len + 1, sim_len + len(rpt_seq), strand, family, rpt_class, rpt_id])
        chunks.append(rpt_seq.lower() if mask else rpt_seq.upper())
        sim_len += len(rpt_seq)

        # NOTE(review): on this break last_end still refers to the previous repeat, so the
        # tail below is measured from there.  Kept as in the original; confirm it is intended.
        if rpt_count == num_repeats:
            break

        last_end = max(last_end, finish)

    # Add final sequence on.  The original generated inter_seq_len bases here, which reused
    # the last inter-repeat length (and raised NameError when no repeat had been accepted).
    final_seq_len = max(min_interval, min(chr_finish - last_end, max_interval))
    chunks.append(markov_gen.generate_sequence(markov_list, final_seq_len))

    sim_seq = "".join(chunks)
    sim_seq_len = len(sim_seq)
    fa_out_str = fa_out_header
    for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out:
        fa_out_str += fa_out_template.format(chr=chrom, start=start, finish=finish, left = sim_seq_len - finish, strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)

    return sim_seq, fa_out_str
def generate_chromosome(seq, markov_list, coord_adjust, rpt_gen, mask = False, max_interval = None, num_repeats = None, max_length = None, limiting_chr = None):
    """
    Generate a synthetic sequence with real repeats.

    Arguments:
    * seq: The template sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module);
    *              passed unchanged to markov_gen.generate_sequence.
    * coord_adjust: Size of the prefix that has been cut off the template sequence (requiring
    *               that the .fa.out coordinates be adjusted).  Also used as the coordinate at
    *               which processing starts.
    * rpt_gen: An iterable yielding 7-tuples (chr, start, finish, strand, family, rpt_class, rpt_id).
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum inter-repeat length.  None or -1 means len(seq).
    * num_repeats: If set, stop after this many repeats have been emitted (the tail is then
    *              additionally capped at 1000 bases).
    * max_length: If set, caps the template length used to size the tail.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    current_coord = coord_adjust

    # None (the default) and -1 both mean "cap at the template length"; leaving None in
    # place would break the min()/max() arithmetic below.
    if max_interval is None or max_interval == -1:
        max_interval = len(seq)

    s = []                # Hold the sequence (in chunks)
    fa_out = []           # Hold the new .fa.out file contents (one record per repeat)

    rpt_count = 0
    length = min(len(seq), max_length) if max_length else len(seq)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:
            continue

        if start < current_coord:
            # Starts before the end of the last emitted repeat (or inside the cut prefix).
            continue

        rpt_count += 1

        # Inter-repeat filler, truncated to max_interval.  Whatever is truncated shifts
        # every later coordinate left, so it is accumulated into coord_adjust.
        gap = start - current_coord
        inter_seq_len = min(gap, max_interval)
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        assert len(inter_seq) == inter_seq_len
        s.append(inter_seq)
        coord_adjust += max(0, gap - max_interval)

        # The repeat itself is copied from the template
        rpt_seq = seq[start:finish]
        s.append(rpt_seq.lower() if mask else rpt_seq.upper())

        # start+1: .fa.out start coordinates are written 1-based
        fa_out.append([chrom, start+1-coord_adjust, finish-coord_adjust, strand, family, rpt_class, rpt_id])

        if num_repeats and rpt_count == num_repeats:
            break

        current_coord = finish

    if num_repeats:
        max_interval = min(1000, max_interval)

    tail_length = min(max_interval, length-current_coord)
    if tail_length > 0:
        s.append(markov_gen.generate_sequence(markov_list, tail_length))

    sim_seq = "".join(s)
    sim_seq_len = len(sim_seq)
    fa_out_str = fa_out_header
    for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out:
        fa_out_str += fa_out_template.format(chr=chrom, start=start, finish=finish, left = sim_seq_len - finish, strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)

    return sim_seq, fa_out_str
def generate_chromosome(seq,
                        markov_list,
                        chr_start,
                        chr_finish,
                        rpt_gen,
                        mask=False,
                        max_interval=-1,
                        min_interval=0,
                        num_repeats=None,
                        max_length=None,
                        limiting_chr=None,
                        rep_base_hash=None):
    """
    Generate a synthetic sequence with real repeats.

    Arguments:
    * seq: The template sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module);
    *              passed unchanged to markov_gen.generate_sequence.
    * chr_start/chr_finish: Define the coordinates of our actual template sequence.  Repeats that
    *               start before chr_start or finish after chr_finish are ignored, which allows
    *               us to cut off a prefix and/or suffix.
    * rpt_gen: An iterable yielding 7-tuples (chr, start, finish, strand, family, rpt_class, rpt_id).
    *          Repeats are assumed to be sorted by start.
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum allowed length of a sequence between repeats.  If two repeats are
    *             further apart than this, cut the length.  -1 means len(seq).
    * min_interval: Minimum allowed length of a sequence between repeats.  If two repeats are closer
    *             than this, extend the length.
    * num_repeats: If set, stop once this many repeats have been emitted.
    * max_length: Accepted but not used by this implementation.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * rep_base_hash: If set (and non-empty), a mapping from family to the sequence to insert
    *             in place of the template slice seq[start:finish].

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    last_end = chr_start
    if max_interval == -1:
        max_interval = len(seq)

    pieces = []  # Pieces of the simulated sequence (joined once at the end)
    built_len = 0  # Total length of everything in pieces so far
    fa_out = []  # Hold the new .fa.out file contents (one record per repeat)

    rpt_count = 0  # Count of repeats (so we can quit when we reach num_repeats, if applicable)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:  # Skip if we are on the wrong chromosome
            continue

        if start >= chr_finish:  # Quit if we have gone past the allowed range (repeats are assumed to be sorted by start)
            break

        if start < chr_start or finish > chr_finish:  # Skip if we are outside the allowed range
            continue

        if start < last_end:  # Skip if this repeat overlapped the last one
            continue

        rpt_count += 1

        # Add the next inter-TE sequence, clamped to [min_interval, max_interval]
        inter_seq_len = max(min_interval, min(start - last_end, max_interval))
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        pieces.append(inter_seq)
        built_len += len(inter_seq)

        # Add the next repeat sequence
        if rep_base_hash:
            rpt_seq = rep_base_hash[family]
        else:
            rpt_seq = seq[start:finish]

        # Coordinates are positions in the simulated sequence, 1-based inclusive (biologist notation)
        fa_out.append([
            chrom,
            built_len + 1,
            built_len + len(rpt_seq), strand, family, rpt_class, rpt_id
        ])
        pieces.append(rpt_seq.lower() if mask else rpt_seq.upper())
        built_len += len(rpt_seq)

        # NOTE(review): on this break last_end still refers to the previous repeat, so the
        # tail below is measured from there.  Kept as in the original; confirm it is intended.
        if rpt_count == num_repeats:
            break

        last_end = max(last_end, finish)

    # Add final sequence on.  The original generated inter_seq_len bases here, which reused
    # the last inter-repeat length (and raised NameError when no repeat had been accepted).
    final_seq_len = max(min_interval, min(chr_finish - last_end, max_interval))
    pieces.append(markov_gen.generate_sequence(markov_list, final_seq_len))

    sim_seq = "".join(pieces)
    sim_seq_len = len(sim_seq)
    fa_out_str = fa_out_header
    for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out:
        fa_out_str += fa_out_template.format(chr=chrom,
                                             start=start,
                                             finish=finish,
                                             left=sim_seq_len - finish,
                                             strand=strand,
                                             family=family,
                                             rpt_class=rpt_class,
                                             rpt_id=rpt_id)

    return sim_seq, fa_out_str