Пример #1
0
def parse_cdhit_output_100p(unclustered_fasta, cdhit_output):
    """Map every record id in ``unclustered_fasta`` to a record id in ``cdhit_output``.

    Records are matched on a strand-independent key: the sorted pair of the
    sequence string and ``misc.revcomp`` of it, so a sequence and its reverse
    complement share the same key.

    Args:
        unclustered_fasta: FASTA source (passed to ``SeqIO.parse``) holding the
            records to be assigned.
        cdhit_output: FASTA source (passed to ``SeqIO.parse``) holding the
            cluster records.

    Returns:
        dict mapping each id from ``unclustered_fasta`` to the id of the
        ``cdhit_output`` record with the same key.

    Raises:
        KeyError: if a record in ``unclustered_fasta`` has no record with a
            matching key in ``cdhit_output``.
    """
    # Index the cluster records; when several share a key the last one wins.
    representative_by_key = {}
    for cluster_record in SeqIO.parse(cdhit_output, 'fasta'):
        forward = str(cluster_record.seq)
        strand_key = tuple(sorted((forward, misc.revcomp(forward))))
        representative_by_key[strand_key] = cluster_record

    cluster_id_by_record_id = {}
    for record in SeqIO.parse(unclustered_fasta, 'fasta'):
        forward = str(record.seq)
        strand_key = tuple(sorted((forward, misc.revcomp(forward))))
        cluster_id_by_record_id[record.id] = representative_by_key[strand_key].id

    return cluster_id_by_record_id
Пример #2
0
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Extract the reference sequence spanned by each read pair.

    For every ``(read1, read2)`` pair the slice
    ``read1.reference_start:read2.reference_end`` of the contig
    ``read1.reference_name`` is taken from ``genome_dict``.

    If ``read1.query_name`` contains exactly two underscores, its
    second-to-last underscore-separated field is parsed as an integer context
    width. That many bases are trimmed from both ends of the sequence and the
    coordinates in the reported name are moved inward by the same amount.

    If the last underscore-separated field of ``read1.query_name`` is ``'2'``
    the sequence is reverse-complemented with ``misc.revcomp``.

    Args:
        pairs: iterable of ``(read1, read2)`` alignment records.
        genome_dict: mapping from contig name to its sequence.
        add_softclipped_bases: when true, the results of
            ``sctools.left_softclipped_sequence_strict(read1)`` and
            ``sctools.right_softclipped_sequence_strict(read2)`` are attached
            to the left and right of the reference slice before trimming.

    Returns:
        list of ``(name, length, contig_edge, sequence)`` tuples, where
        ``name`` is ``"<contig>:<start>-<end>"`` and ``contig_edge`` is True
        when a strict soft clip of read1 starts before position 0 or a strict
        soft clip of read2 reaches past the end of read2's contig.
    """
    inferred_sequences = []
    for read1, read2 in pairs:
        contig = read1.reference_name
        start = read1.reference_start
        end = read2.reference_end

        # Only query names with exactly two underscores carry a context width.
        if read1.query_name.count('_') == 2:
            context_width = int(read1.query_name.split('_')[-2])
        else:
            context_width = 0

        name = contig + ':' + str(start + context_width) + '-' + str(
            end - context_width)

        inferred_sequence = genome_dict[contig][start:end]

        if add_softclipped_bases:
            inferred_sequence = (
                sctools.left_softclipped_sequence_strict(read1) +
                inferred_sequence +
                sctools.right_softclipped_sequence_strict(read2))

        # Skip trimming for a zero width: seq[0:-0] is seq[0:0], which would
        # silently discard the whole sequence.
        if context_width:
            inferred_sequence = inferred_sequence[context_width:-context_width]

        if read1.query_name.split('_')[-1] == '2':
            inferred_sequence = misc.revcomp(inferred_sequence)

        clipped_past_start = (
            sctools.is_left_softclipped_strict(read1) and
            sctools.left_softclipped_position(read1) < 0)
        contig_edge = bool(clipped_past_start or (
            sctools.is_right_softclipped_strict(read2) and
            sctools.right_softclipped_position(read2) >=
            len(genome_dict[read2.reference_name])))

        inferred_sequences.append(
            (name, len(inferred_sequence), contig_edge, inferred_sequence))

    return inferred_sequences
Пример #3
0
    def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
        """Build the sequence spanned by a read pair and flag contig edges.

        The reference slice
        ``forward_read.reference_start:reverse_read.reference_end`` of the
        contig ``forward_read.reference_name`` is taken from
        ``self.genome_dict`` and joined into a string. The results of
        ``sctools.left_softclipped_sequence_strict(forward_read)`` and
        ``sctools.right_softclipped_sequence_strict(reverse_read)`` are
        attached on either side, then ``self.context_width`` bases are
        trimmed from both ends.

        Args:
            forward_read: alignment record supplying the contig name and the
                start coordinate.
            reverse_read: alignment record supplying the end coordinate.
            is_reverse: when true, the result is reverse-complemented with
                ``misc.revcomp``.

        Returns:
            ``(inferred_sequence, contig_edge)``. ``contig_edge`` is True when
            a strict left soft clip of ``forward_read`` starts before position
            0, or a strict right soft clip of ``reverse_read`` reaches past
            the end of the contig.
        """
        contig = forward_read.reference_name
        start = forward_read.reference_start
        end = reverse_read.reference_end

        inferred_sequence = ''.join(self.genome_dict[contig][start:end])

        inferred_sequence = sctools.left_softclipped_sequence_strict(forward_read) + \
                            inferred_sequence + \
                            sctools.right_softclipped_sequence_strict(reverse_read)

        # Skip trimming for a zero width: seq[0:-0] is seq[0:0], which would
        # silently discard the whole sequence.
        if self.context_width:
            inferred_sequence = inferred_sequence[self.context_width:-self.context_width]

        if is_reverse:
            inferred_sequence = misc.revcomp(inferred_sequence)

        contig_edge = False
        if sctools.is_left_softclipped_strict(forward_read) and \
                        sctools.left_softclipped_position(forward_read) < 0:
            contig_edge = True
        elif sctools.is_right_softclipped_strict(reverse_read) and \
                        sctools.right_softclipped_position(reverse_read) >= len(self.genome_dict[contig]):
            contig_edge = True

        return inferred_sequence, contig_edge
Пример #4
0
    def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
        """Return the sequence spanned by a read pair, soft clips included.

        The reference slice
        ``forward_read.reference_start:reverse_read.reference_end`` of the
        contig ``forward_read.reference_name`` is read from
        ``self.genome_dict`` and joined into a string. It is flanked by the
        results of ``sctools.left_softclipped_sequence_strict(forward_read)``
        and ``sctools.right_softclipped_sequence_strict(reverse_read)``.

        Args:
            forward_read: alignment record supplying the contig name and the
                start coordinate.
            reverse_read: alignment record supplying the end coordinate.
            is_reverse: when true, the assembled sequence is passed through
                ``misc.revcomp`` before being returned.

        Returns:
            The assembled sequence.
        """
        contig = forward_read.reference_name
        start = forward_read.reference_start
        end = reverse_read.reference_end

        reference_part = ''.join(self.genome_dict[contig][start:end])
        left_clip = sctools.left_softclipped_sequence_strict(forward_read)
        right_clip = sctools.right_softclipped_sequence_strict(reverse_read)
        assembled = left_clip + reference_part + right_clip

        return misc.revcomp(assembled) if is_reverse else assembled
Пример #5
0
def add_sequence_to_secondary_alignment(sam_file_in, sam_file_out):
    """Copy a SAM file, filling in query sequences on records that lack one.

    Records are processed in file order. A record that has a query sequence
    is remembered together with its strand flag. A record without one
    receives the most recently remembered sequence, reverse-complemented with
    ``misc.revcomp`` when its ``is_reverse`` flag differs from that of the
    remembered record.

    Args:
        sam_file_in: path of the alignment file to read.
        sam_file_out: path of the alignment file to write; the header is
            taken from the input file via ``template``.
    """
    # Context managers close both alignment files even if an error occurs.
    # The earlier plain open() of the output path was dropped: its handle was
    # rebound immediately and never closed.
    with pysam.AlignmentFile(sam_file_in, 'r') as infile, \
            pysam.AlignmentFile(sam_file_out, "w", template=infile) as outfile:

        current_seq = None
        current_seq_reverse = None
        for read in infile:
            if read.query_sequence is not None:
                current_seq = read.query_sequence
                current_seq_reverse = read.is_reverse
            else:
                # NOTE(review): if a sequence-less record precedes every
                # record that carries a sequence, current_seq is still None
                # here -- confirm inputs never start that way.
                if current_seq_reverse == read.is_reverse:
                    read.query_sequence = current_seq
                else:
                    read.query_sequence = misc.revcomp(current_seq)

            outfile.write(read)
Пример #6
0
def get_seq_lengths(clusters, seqs, header=None):
    """Summarise the lengths of the unique sequences in each cluster.

    ``clusters`` and ``seqs`` are walked in parallel. Within a cluster,
    sequences are de-duplicated first by exact string and then by the sorted
    pair ``(seq, misc.revcomp(seq))``, so a sequence and its reverse
    complement count once.

    Args:
        clusters: iterable of cluster labels, aligned with ``seqs``.
        seqs: iterable of sequence strings.
        header: column names for the result. The first entry names the
            cluster column; each remaining entry must be one of
            ``'unique_seqs'``, ``'num_unique_seqs'``, ``'mean_length'``,
            ``'min_length'`` or ``'max_length'``. Defaults to
            ``['cluster', 'num_unique_seqs', 'mean_length', 'min_length',
            'max_length']``.

    Returns:
        pandas.DataFrame with one row per cluster, in order of first
        appearance, and the columns given by ``header``.

    Raises:
        KeyError: if ``header[1:]`` contains an unknown statistic name and at
            least one cluster is present.
    """
    # A None sentinel avoids sharing one mutable list between calls.
    if header is None:
        header = [
            'cluster', 'num_unique_seqs', 'mean_length',
            'min_length', 'max_length'
        ]

    # De-duplicate exact strings first so revcomp runs once per distinct one.
    seqs_by_cluster = defaultdict(set)
    for cluster, seq in zip(clusters, seqs):
        seqs_by_cluster[cluster].add(seq)

    rows = []
    for cluster, cluster_seqs in seqs_by_cluster.items():
        unique_seqs = {
            tuple(sorted([seq, misc.revcomp(seq)])) for seq in cluster_seqs
        }
        # Both members of a pair have the same length; measure the first.
        lengths = [len(pair[0]) for pair in unique_seqs]
        stats = {
            'unique_seqs': unique_seqs,
            'num_unique_seqs': len(lengths),
            'mean_length': np.mean(lengths),
            'min_length': np.min(lengths),
            'max_length': np.max(lengths),
        }
        rows.append(tuple([cluster] + [stats[lab] for lab in header[1:]]))

    return pd.DataFrame(rows, columns=header)
Пример #7
0
def cluster_100p(infile, outfile):
    """Write one FASTA record per strand-independent unique sequence.

    Records from ``infile`` are collapsed in two steps: first by exact
    sequence string, then by the sorted pair of the sequence and
    ``misc.revcomp`` of it. In both steps the first record encountered is the
    one that is kept.

    Args:
        infile: FASTA source passed to ``SeqIO.parse``.
        outfile: path of the FASTA file to write.
    """
    # Step 1: first record seen for each exact sequence string.
    first_by_sequence = {}
    for record in SeqIO.parse(infile, 'fasta'):
        first_by_sequence.setdefault(str(record.seq), record)

    # Step 2: first record seen for each forward/reverse-complement pair.
    first_by_strand_pair = {}
    for sequence, record in first_by_sequence.items():
        strand_key = tuple(sorted((sequence, misc.revcomp(sequence))))
        first_by_strand_pair.setdefault(strand_key, record)

    with open(outfile, "w") as handle:
        SeqIO.write(first_by_strand_pair.values(), handle, "fasta")
Пример #8
0
def write_termini_to_unpaired_fasta(pairs, fasta_prefix):
    """Write the 5' and 3' terminus sequences of each pair to one FASTA file.

    For every pair two records are created: ``<pair_id>_1`` holding
    ``seq_5p`` as given, and ``<pair_id>_2`` holding ``revcomp(seq_3p)``.
    All 5' records are written to ``<fasta_prefix>.fasta`` first, then all 3'
    records are appended to the same file.

    Args:
        pairs: iterable of mappings with the keys ``'pair_id'``, ``'seq_5p'``
            and ``'seq_3p'``.
        fasta_prefix: output path without the ``.fasta`` extension.
    """
    records_5p = []
    records_3p = []
    for pair in pairs:
        id_5p = pair['pair_id'] + '_1'
        id_3p = pair['pair_id'] + '_2'
        sequence_5p = pair['seq_5p']
        sequence_3p = pair['seq_3p']

        records_5p.append(
            SeqRecord(Seq(sequence_5p, IUPAC.IUPACAmbiguousDNA),
                      id=str(id_5p), description=str(id_5p) + '_5p'))
        records_3p.append(
            SeqRecord(Seq(revcomp(sequence_3p), IUPAC.IUPACAmbiguousDNA),
                      id=str(id_3p), description=str(id_3p) + '_3p'))

    # Truncate and write the 5' records, then reopen to append the 3' ones.
    for mode, records in (('w', records_5p), ('a', records_3p)):
        with open(fasta_prefix + '.fasta', mode) as output_handle:
            SeqIO.write(records, output_handle, 'fasta')
Пример #9
0
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Extract the reference sequence spanned by each read pair.

    For every ``(read1, read2)`` pair the slice
    ``read1.reference_start:read2.reference_end`` of the contig
    ``read1.reference_name`` is taken from ``genome_dict``. When
    ``read1.is_read2`` is true the sequence is reverse-complemented with
    ``misc.revcomp``.

    Args:
        pairs: iterable of ``(read1, read2)`` alignment records.
        genome_dict: mapping from contig name to its sequence.
        add_softclipped_bases: when true, the results of
            ``sctools.left_softclipped_sequence_strict(read1)`` and
            ``sctools.right_softclipped_sequence_strict(read2)`` are attached
            to the left and right of the reference slice.

    Returns:
        list of ``(name, length, sequence)`` tuples, where ``name`` is
        ``"<contig>:<start>-<end>"``.
    """
    results = []
    for read1, read2 in pairs:
        contig = read1.reference_name
        label = contig + ':' + str(read1.reference_start) + '-' + str(
            read2.reference_end)

        sequence = genome_dict[contig][
            read1.reference_start:read2.reference_end]

        if add_softclipped_bases:
            left_clip = sctools.left_softclipped_sequence_strict(read1)
            right_clip = sctools.right_softclipped_sequence_strict(read2)
            sequence = left_clip + sequence + right_clip

        if read1.is_read2:
            sequence = misc.revcomp(sequence)

        results.append((label, len(sequence), sequence))

    return results