Exemplo n.º 1
0
def seqdist(s1, s2, mismatchpen=-.5, gapopenpen=-.25, gapextendpen=-.05):
    """Globally align two sequences and return the first alignment's score.

    The sequences are aligned with ``Bio.pairwise2.align.globalms`` using a
    match score of 1 and the penalties given below. The first reported
    alignment is pretty-printed to stdout and its element at index 2 (the
    score in Bio.pairwise2 result tuples) is returned.

    Parameters
    ----------
    s1, s2 : sequences accepted by ``align.globalms``
        The two sequences to compare.
    mismatchpen : float, optional
        Score passed as the mismatch argument.
    gapopenpen : float, optional
        Score passed as the gap-open argument.
    gapextendpen : float, optional
        Score passed as the gap-extend argument.

    Returns
    -------
    The value stored at index 2 of the first alignment.

    Raises
    ------
    StopIteration
        If ``globalms`` reports no alignment at all.
    """
    from Bio.pairwise2 import align, format_alignment

    # NOTE: stripping of common gaps (removecommongaps) is disabled here.
    match_score = 1
    candidates = align.globalms(
        s1, s2, match_score, mismatchpen, gapopenpen, gapextendpen)

    # Only the first reported alignment is considered.
    best = next(iter(candidates))
    print(format_alignment(*best))
    return best[2]
Exemplo n.º 2
0
def string_distance(seq1,
                    seq2,
                    len_seq1,
                    len_seq2,
                    dist_mat,
                    dist_mat_max,
                    tol=3,
                    length_constraint=True):
    """Calculate a normalised distance between two input sequences.

    The sequences are compared position by position. For each pair of
    characters the two directed entries of `dist_mat` are averaged; the sum
    of these averages is divided by ``len_seq1 * dist_mat_max``.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    len_seq1, len_seq2 : int
        Lengths of `seq1` and `seq2`, supplied by the caller.
    dist_mat : pandas.DataFrame
        Matrix which defines the distance between single characters. Both
        ``dist_mat.at[c1, c2]`` and ``dist_mat.at[c2, c1]`` are read.
    dist_mat_max : float
        Per-position normalising value (presumably the largest entry of
        `dist_mat` -- confirm against the caller).
    tol : int, optional, default: 3
        Tolerance in the length of the sequences. Default is 3 (3 nucleotides
        form an amino acid. If seq1 and seq2 represent amino acid sequences,
        use tol = 1).
    length_constraint : boolean, optional, default: True
        Insert the constraint on the difference between the lengths of seq1
        and seq2. If False, `tol` is ignored, no alignment is performed and
        the comparison stops at the end of the shorter sequence.

    Returns
    -------
    distance : float
        A normalised distance between seq1 and seq2. 1.0 is returned
        directly when the lengths differ by more than `tol`. Values are in
        [0, 1] as long as no entry of `dist_mat` exceeds `dist_mat_max`.
    """
    if length_constraint:
        len_diff = abs(len_seq1 - len_seq2)
        if len_diff > tol:
            # Lengths too different: maximal distance, no comparison needed.
            return 1.

        if len_diff > 0:
            # Different lengths within tolerance: globally align the
            # sequences first, then post-process both aligned rows.
            seq1, seq2 = map(extra.junction_re,
                             align.globalms(seq1, seq2, 5, -4, -3, -.1)[0][:2])
            # Normalise by the length of the processed first row.
            len_seq1 = len(seq1)

    norm_by = len_seq1 * dist_mat_max
    # Builtin zip replaces the Python 2-only itertools.izip; strings are
    # iterated character by character, so no list() copies are needed.
    return sum(
        np.mean((float(dist_mat.at[c1, c2]), float(dist_mat.at[c2, c1])))
        for c1, c2 in zip(seq1, seq2)
    ) / norm_by
Exemplo n.º 3
0
def string_distance(seq1, seq2, len_seq1, len_seq2, dist_mat, dist_mat_max,
                    tol=3, length_constraint=True):
    """Calculate a normalised distance between two input sequences.

    Characters at the same position are compared through `dist_mat`; the
    two directed entries for each pair are averaged, the averages are summed
    and the sum is divided by ``len_seq1 * dist_mat_max``.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    len_seq1, len_seq2 : int
        Lengths of `seq1` and `seq2`, supplied by the caller.
    dist_mat : pandas.DataFrame
        Matrix which defines the distance between single characters. Both
        ``dist_mat.at[c1, c2]`` and ``dist_mat.at[c2, c1]`` are read.
    dist_mat_max : float
        Per-position normalising value (presumably the largest entry of
        `dist_mat` -- confirm against the caller).
    tol : int, optional, default: 3
        Tolerance in the length of the sequences. Default is 3 (3 nucleotides
        form an amino acid. If seq1 and seq2 represent amino acid sequences,
        use tol = 1).
    length_constraint : boolean, optional, default: True
        Insert the constraint on the difference between the lengths of seq1
        and seq2. If False, `tol` is ignored, no alignment is performed and
        the comparison stops at the end of the shorter sequence.

    Returns
    -------
    distance : float
        A normalised distance between seq1 and seq2. 1.0 is returned
        directly when the lengths differ by more than `tol`. Values are in
        [0, 1] as long as no entry of `dist_mat` exceeds `dist_mat_max`.
    """
    if length_constraint:
        length_gap = abs(len_seq1 - len_seq2)
        if length_gap > tol:
            # Lengths too different: maximal distance, no comparison needed.
            return 1.

        if length_gap > 0:
            # Different lengths within tolerance: align the sequences and
            # post-process both aligned rows before comparing them.
            first_alignment = align.globalms(seq1, seq2, 5, -4, -3, -.1)[0]
            seq1, seq2 = map(extra.junction_re, first_alignment[:2])
            # Normalise by the length of the processed first row.
            len_seq1 = len(seq1)

    norm_by = len_seq1 * dist_mat_max
    # Builtin zip works on Python 2 and 3 (itertools.izip is Python 2 only).
    pair_distances = (
        np.mean((float(dist_mat.at[c1, c2]), float(dist_mat.at[c2, c1])))
        for c1, c2 in zip(seq1, seq2))
    return sum(pair_distances) / norm_by
Exemplo n.º 4
0
def get_graph(seq1, seq2, x=0, y=0, node_id=0):
    """Align two sequences and wrap the aligned rows in an AlignmentGraph.

    The sequences are globally aligned with ``align.globalms`` (arguments
    0, -1, -1, -0.5). Both rows of the first alignment are printed to stdout
    and then handed, together with `x`, `y` and `node_id`, to
    ``alignment.AlignmentGraph``.

    Returns the constructed ``alignment.AlignmentGraph`` instance.
    """
    candidates = align.globalms(seq1, seq2, 0, -1, -1, -0.5)
    # The first alignment must unpack into exactly five fields; only the two
    # aligned rows are used.
    row_a, row_b, _score, _begin, _end = candidates[0]
    for row in (row_a, row_b):
        print(row)
    return alignment.AlignmentGraph(row_a, row_b, x, y, node_id)
Exemplo n.º 5
0
def myalign(s1, s2):
    """Return the first global alignment of `s1` and `s2`.

    Calls ``align.globalms`` with the score arguments 2, -1, -2, -0.2 and
    ``penalize_end_gaps=False``.
    """
    alignments = align.globalms(
        s1, s2, 2, -1, -2, -0.2, penalize_end_gaps=False)
    return alignments[0]
Exemplo n.º 6
0
def myalign(s1,s2):
  """Return the first global alignment of `s1` and `s2`.

  The score arguments 2, -1, -2, -0.2 are passed positionally to
  ``align.globalms``; ``penalize_end_gaps`` is switched off.
  """
  scoring = (2, -1, -2, -0.2)
  return align.globalms(s1, s2, *scoring, penalize_end_gaps=False)[0]
Exemplo n.º 7
0
        return tikz_matrix(name, body(self._matrix), args) + "\n" + "\n".join(self.lines)


    def to_file(self, filename):
        """Write the TikZ document to `filename` and run pdflatex on it.

        The file receives, in order: ``self.header``, the TikZ matrix named
        "seq" built from ``self._matrix``, every entry of ``self.lines`` (one
        per line) and ``self.footer``. Afterwards ``pdflatex`` is invoked on
        the written file; its exit status is not checked.

        Arguments:
            filename (str): Path of the .tex file to create or overwrite.
        """
        # The context manager closes (and flushes) the handle even if one of
        # the writes raises, and always before pdflatex reads the file.
        with open(filename, "w") as f:
            f.write(self.header + "\n")
            f.write(tikz_matrix("seq", body(self._matrix)) + "\n")
            for line in self.lines:
                f.write(line + "\n")
            f.write(self.footer)
        subprocess.call(["pdflatex", filename])

if __name__ == "__main__":
    # Demo: align two short sequences, print both aligned rows and render
    # the resulting alignment graph to "testing.tex".
    seq_a, seq_b = "ACGGG", "ATGG"
    # Alternative inputs kept for experimentation:
    #   seq_a = "ACGGTGTACAT"
    #   seq_b = "ACCGGTAAGGT"

    best = align.globalms(seq_a, seq_b, 0, -1, -1, -1)[0]
    # The alignment unpacks into five fields; only the rows are needed.
    row_a, row_b, _, _, _ = best
    print(row_a)
    print(row_b)

    graph = AlignmentGraph(row_a, row_b)
    graph.generate_tikz()
    graph.to_file("testing.tex")

#for line in graph.lines:
#
Exemplo n.º 8
0
def evaluate_assembly(assembly, gt_data, stacks_data, gt_stats, args):
    """Analyze an assembly of RAGE vs Stacks data.

    Every Stacks locus associated with a ground truth (RAGE) locus is
    aligned against the ground truth sequence. Loci whose alignment score
    reaches the acceptance threshold are recorded together with the SNPs
    Stacks reported for them; the others are printed as mismatches. The
    per-locus results and a summary ("metadata") are written as YAML to
    ``args.output``. Diagnostics are printed to stdout and stderr.

    Arguments:
        assembly (dict): Mapping of (RAGE locus name, RAGE sequence) to
            (RAGE record, list of associated Stacks records).
        gt_data (list of GTRecords): From a RAGE _gt.yaml file. Not read by
            this function.
        stacks_data (list of TSVRecords): From a Stacks _export.tsv file.
            Only the ``name`` of each record is read here.
        gt_stats: Ground truth statistics; only ``nr_loci_with_muts`` is
            read.
        args (argparse.NameSpace): User parameters; only ``output`` (path of
            the YAML file to write) is read.
    """
    # Alignments scoring below this value are reported as mismatches.
    min_alignment_score = 130

    # NOTE: the "undiscovered gt loci", "evaluated loci" and "successfully
    # aligned loci" counters are maintained but not part of the report.
    nr_undiscovered_gt_loci = 0
    nr_loci_with_undiscovered_mutations = 0
    nr_loci_with_discovered_mutations = 0
    nr_evaluated_loci = 0
    nr_successfully_aligned_loci = 0

    # write mapping to file
    with open(args.output, "w") as outfile:
        outdata = {
            "Loci": {},
        }
        for (gt_name, gt_seq), (gt_locus, stacks_loci) in assembly.items():
            outdata["Loci"][gt_name] = {
                "ground_truth_seq": gt_seq,
                "ground_truth_alleles": gt_locus.alleles,
                "stacks_loci": []
            }

            successfully_detected, successfully_aligned = False, False
            undetected, no_mutations = False, False

            # compute semiglobal alignments of the loci to verify that
            # they actually match
            for stacks_locus in stacks_loci:
                # Decode once; used for both the report and the alignment.
                stacks_seq = stacks_locus.seq.decode()
                stacks_locus_info = {
                    "seq": stacks_seq,
                }
                all_alns = align.globalms(
                    gt_seq,
                    stacks_seq,
                    1,  # match score
                    0,  # mismatch penalty
                    -5,  # gap open penalty
                    -3,  # gap extend penalty
                    penalize_end_gaps=(False, False),
                    one_alignment_only=True,
                )
                # pick the first reported alignment
                # these are either unique or good enough
                aln = all_alns[0]

                if aln[2] >= min_alignment_score:
                    successfully_aligned = True
                    stacks_locus_info["SNPs"] = [
                        {
                            # TODO: this is not 100% accurate. The precise
                            # position can vary with different spacer length
                            # etc. This is a conservative estimate.
                            "orientation": "p7" if entry.pos > 98 else "p5",
                            "pos": entry.pos,
                            "ref": entry.ref,
                            "alt": "".join(entry.alts),
                        } for entry in stacks_locus.data
                    ]
                    if stacks_locus.data:
                        successfully_detected = True
                        # NOTE: Currently this only checks, if the stacks locus
                        # was successfully aligned and also detected a mutation
                        # the kind of the mutation is not checked. Neither, if
                        # the correct mutation (e.g. SNP C>G at position 42 in
                        # the forward read) was detected.
                        #
                        # TODO: Evaluate if right SNPs were found
                        #       (mind that Stacks might not call the allele
                        #       RAGE simulated as root as main allele
                        #         -> consider x>y == y>x)
                        #
                        # WARNING how to manage split up loci?
                        # consider: stacks split up a heterozygous locus
                        # to two loci. How do we count that? Two misses?
                        # Check if the union of all variants covers the
                        # set of simulated variants?
                        #
                        # TODO: Evaluate if the right allele frequencies were
                        #       detected by stacks
                    elif gt_locus.mutations:
                        undetected = True
                    else:
                        no_mutations = True

                else:
                    # A Stacks locus may carry no SNP entries at all (see the
                    # emptiness check in the branch above); fall back to the
                    # locus name instead of failing on data[0].
                    if stacks_locus.data:
                        mismatch_label = stacks_locus.data[0].chrom
                    else:
                        mismatch_label = stacks_locus.name
                    print(f"MISMATCH with {mismatch_label}")
                    print(format_alignment(*aln))
                outdata["Loci"][gt_name]["stacks_loci"].append(
                    stacks_locus_info)

            if not stacks_loci:
                outdata["Loci"][gt_name]["stacks_loci"] = "No matches found"
                nr_undiscovered_gt_loci += 1

            if successfully_detected:
                nr_loci_with_discovered_mutations += 1
            elif undetected:
                nr_loci_with_undiscovered_mutations += 1
                print(
                    f"The variants\n{gt_locus.alleles}\nin RAGE locus:\n{gt_name:<10} {gt_seq} were not detected in stacks loci\n{stacks_loci}",
                    file=sys.stderr)
            elif no_mutations:
                # Nothing to count: the locus has no simulated mutations.
                ...

            nr_evaluated_loci += 1
            if successfully_aligned:
                nr_successfully_aligned_loci += 1

        # find stacks loci that are not in the RAGE loci (singletons and HRLs?)
        # create one entry for each locus assembled by stacks.
        # then find out in the assembly which ones were never assigned to a
        # rage locus:
        #
        # 1. create a dictionary with zero for all stacks locus names
        # 2. iterate through the gt_locus -> stacks-locus mapping
        # 3. increment the counter of each stacks locus, every time it occurs
        #    in the mapping
        stacks_locus_occurence_count = Counter(
            {rec.name: 0
             for rec in stacks_data})
        for _gt_locus, assigned_loci in assembly.values():
            for locus in assigned_loci:
                # count every gt locus that has an assigned stacks locus
                stacks_locus_occurence_count[locus.name] += 1

        # Report stacks loci that have no counterpart in the ground truth
        # (RAGE) loci
        stacks_only_loci = [
            name
            for name, count in stacks_locus_occurence_count.items()
            if count == 0
        ]
        if stacks_only_loci:
            print(
                f"The following {len(stacks_only_loci)} loci were not "
                "simulated by rage, but identified by stacks. These might "
                "include incompletely digested reads, Null Alleles, "
                "Singletons, and HRLs/ Lumberjack stacks.",
                file=sys.stderr)
            print(stacks_only_loci, file=sys.stderr)

        # Check to how many RAGE loci each Stacks loci was assigned to.
        # This should always be 1.
        # A value >1 would suggest that a stacks locus is similar to two or
        # more RAGE loci, which is highly unlikely
        most_assigned_gt_loci = stacks_locus_occurence_count.most_common(5)
        # most_common() is empty when there are no Stacks loci at all; in
        # that case no locus can have been assigned more than once.
        if most_assigned_gt_loci and most_assigned_gt_loci[0][1] > 1:
            print(
                "The 5 most assigned stacks locus id. This should always "
                f"be 1:\n  {most_assigned_gt_loci}\n",
                file=sys.stderr)
        else:
            print(
                "Each stacks locus was assigned to at most one ground "
                "truth (RAGE) loci",
                file=sys.stderr)

        # The ratio is undefined without simulated mutation loci; report NaN
        # instead of aborting with ZeroDivisionError before anything has
        # been written to the output file.
        nr_simulated_mutation_loci = gt_stats.nr_loci_with_muts
        if nr_simulated_mutation_loci:
            snp_discovery_ratio = (nr_loci_with_discovered_mutations
                                   / nr_simulated_mutation_loci)
        else:
            snp_discovery_ratio = float("nan")

        outdata["metadata"] = {
            "Loci with mutations that were successfully discovered":
            nr_loci_with_discovered_mutations,
            "Total simulated mutation loci":
            nr_simulated_mutation_loci,
            "Loci with mutations that were not discovered by stacks":
            nr_loci_with_undiscovered_mutations,
            "SNP discovery ratio":
            snp_discovery_ratio,
        }

        outfile.write(
            dump(outdata,
                 default_flow_style=False,
                 Dumper=Dumper,
                 explicit_start=True,
                 sort_keys=False))
Exemplo n.º 9
0

# Render the alignment graph of the two reference sequences to a .tex file.
ref_seq = "ACGTTACGGTACCGTA"
ref_seq2 = "ACCGTTGCGGGTACCGTA"
get_graph(ref_seq, ref_seq2).to_file("ref_graph.tex")


query_seq = "TTGCGGAC"
smems2 = ["TTGCGG", "AC"]

# Flat list of match strings; immediately superseded by the annotated
# version below.
smems = ["TT"] * 2 + ["G"] + ["CGG"] * 4 + ["AC", "CC"]

# Annotated matches: (string, int, int or list of ints).
# NOTE(review): the numeric fields look like query/reference positions --
# confirm against the consumer of `smems`.
smems = [
    ("TT", 0, 3),
    ("G", 2, [2, 7, 8, 13]),
    ("CGG", 3, 6),
    ("AC", 6, [0, 5, 10, 15]),
]

# Align the query against the first reference and show both aligned rows.
ref_row, query_row = align.globalms(
    ref_seq,
    query_seq,
    0, -1, -1, -1,
    penalize_end_gaps=(True, False),
)[0][:2]
print(ref_row)
print(query_row)
Exemplo n.º 10
0
def report_dupe(s1, s1_name, s2, s2_name):
    """Return a formatted global alignment of `s1` and `s2`.

    The sequences are aligned with ``align.globalms`` (only one alignment is
    requested). The first alignment is rendered with ``format_alignment`` and
    post-processed by ``format_aln`` together with both names and the value
    50 (presumably a line width -- confirm against ``format_aln``). An empty
    string is returned when no alignment is reported.
    """
    # Scoring: +2 for identical characters, -1 for non-identical ones,
    # -2 for opening a gap and -1 for extending it.
    alignments = align.globalms(s1, s2, 2, -1, -2, -1, one_alignment_only=True)
    if not alignments:
        return ''
    rendered = format_alignment(*alignments[0])
    return format_aln(rendered, s1_name, s2_name, 50)