Пример #1
0
def test_transform_multiple():
    """Check the default transform against known coordinates for two 3-base inputs."""
    expectations = (
        (
            "ATG",
            (
                [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
                [0, 0.5, 0, -0.5, -1.0, -0.5, 0.0],
            ),
        ),
        (
            "TTC",
            (
                [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
                [0, -0.5, -1.0, -1.5, -2.0, -2.5, -2.0],
            ),
        ),
    )
    for sequence, expected in expectations:
        assert transform(sequence) == expected
Пример #2
0
def test_end_y_value(s):
    """The last y value of the "yau" transform equals the weighted base counts.

    A and T contribute -/+ sqrt(3)/2 each, C and G contribute +/- 0.5 each.
    """
    final_y = transform(s, method="yau")[1][-1]

    from_a = -(3 ** 0.5) / 2 * s.count("A")
    from_t = (3 ** 0.5) / 2 * s.count("T")
    from_c = 0.5 * s.count("C")
    from_g = -0.5 * s.count("G")

    assert final_y == approx(from_a + from_t + from_c + from_g)
Пример #3
0
def test_endpoint(s):
    """The "gates" walk ends at (n(G) - n(C), n(T) - n(A)).

    "If the function n(Z) denotes the number of occurrences of nucleotide Z
    in a given sequence, the end-point of the sequence lies at coordinate
    position [(n(G) - n(C)), (n(T) - n(A))]" (Gates, J. theor. Biol. 1986)
    """
    coords = transform(s, method="gates")
    last_x = coords[0][-1]
    assert last_x == s.count("G") - s.count("C")
    last_y = coords[1][-1]
    assert last_y == s.count("T") - s.count("A")
Пример #4
0
def test_qi(s):
    """Each y value of the "qi" transform matches the `qi` lookup of its dinucleotide."""
    transformed = transform(s, method="qi")

    for start in range(len(s)):
        dinucleotide = s[start:start + 2]
        try:
            assert transformed[1][start] == qi[dinucleotide]
        except IndexError:
            # positions without a corresponding transformed value are skipped
            continue
Пример #5
0
def transform_route():
    """Return the downsampled transformation of a posted sequence as JSON.

    Reads ``seq``, ``seq_name`` and ``method`` from the request form. The
    sequence is hashed with xxh64 and the transformed coordinates are cached
    as ``<hash>.<method>.parquet.sz`` (under ``data/`` when ``LOCAL`` is set,
    otherwise looked up on and uploaded to S3). A cached result is reused when
    present; otherwise the sequence is transformed and the result is stored.

    Returns:
        The ``jsonify`` response for ``(seq_hash, downsample(rows))``, where
        ``rows`` is the cached/computed data frame converted to a list of rows.
    """
    sequence = request.form["seq"]
    seq_name = request.form["seq_name"]
    method = request.form["method"]

    logging.debug("Hashing seq")
    seq_hash = str(xxhash.xxh64(sequence).intdigest())

    # NOTE(review): `method` comes straight from the request and becomes part
    # of a file path / S3 key before `transform` sees it - consider validating
    # it against the supported method names.
    filename = f"{seq_hash}.{method}.parquet.sz"
    local_path = f"data/{filename}"

    if LOCAL:
        exists = os.path.exists(local_path)
        location = "locally"
    else:
        exists = exists_on_s3(filename)
        location = "on S3"

    if exists:
        # only report a hit when there actually is one
        logging.debug(f"Found {seq_hash} {location}")
        if LOCAL:
            df = pd.read_parquet(local_path)
        else:
            df = query_x_range(filename)
    else:
        logging.debug(
            f"No previous transformation for {seq_name} found. Transforming..."
        )
        transformed = transform(sequence, method=method)

        logging.debug("Saving transformed data for " + seq_name)
        df = pd.DataFrame(dict(x=transformed[0], y=transformed[1]))
        # the file is always written locally first; `upload` is given only the name
        df.to_parquet(local_path)

        if not LOCAL:
            logging.debug(f"Uploading {seq_hash} to S3")
            upload(filename)

    logging.debug(f"Got the overview data for {seq_hash}")

    zone = df.values.tolist()
    return jsonify((seq_hash, downsample(zone)))
Пример #6
0
def test_C():
    """"gates" moves one unit left on x for C, regardless of case."""
    upper = transform("C", method="gates")
    lower = transform("c", method="gates")
    assert upper == lower
    assert lower == ([0, -1], [0, 0])
Пример #7
0
def test_G():
    """"gates" moves one unit right on x for G, regardless of case."""
    upper = transform("G", method="gates")
    lower = transform("g", method="gates")
    assert upper == lower
    assert lower == ([0, 1], [0, 0])
Пример #8
0
def test_bad_seq():
    """The "yau" method raises ValueError for the sequence "INVALID"."""
    bad_sequence = "INVALID"
    with pytest.raises(ValueError):
        transform(bad_sequence, method="yau")
Пример #9
0
def test_end_x_value(s):
    """The last x value of the "yau" transform equals the weighted base counts.

    C and G each contribute sqrt(3)/2; A and T each contribute 0.5.
    """
    final_x = transform(s, method="yau")[0][-1]

    strong_part = (3 ** 0.5) / 2 * (s.count("C") + s.count("G"))
    weak_part = 0.5 * (s.count("A") + s.count("T"))

    assert final_x == approx(strong_part + weak_part)
Пример #10
0
def test_G():
    """"yau" maps G to the step (sqrt(3)/2, -0.5), regardless of case."""
    upper = transform("G", method="yau")
    lower = transform("g", method="yau")
    assert upper == lower
    assert lower == ([0, (3 ** 0.5) / 2], [0, -0.5])
Пример #11
0
def test_basic():
    """Check the "yau-bp" transform of "ATGC" against known coordinates."""
    expected_x = [0, 1, 2, 3, 4]
    expected_y = [0, -1, 0, -0.5, 0]
    assert transform("ATGC", method="yau-bp") == (expected_x, expected_y)
Пример #12
0
def test_end_y_value(s):
    """The last y value of "yau-bp" is n(T) - n(A) + 0.5 * (n(C) - n(G))."""
    final_y = transform(s, method="yau-bp")[1][-1]

    n_t = s.count("T")
    n_a = s.count("A")
    n_c = s.count("C")
    n_g = s.count("G")

    assert final_y == approx(n_t - n_a + 0.5 * (n_c - n_g))
Пример #13
0
def test_invalid_seq():
    """The default transform of "N" advances x but keeps y flat at zero."""
    expected = ([0, 0.5, 1.0], [0, 0, 0])
    assert transform("N") == expected
Пример #14
0
def test_transform_C():
    """The default transform draws C as a dip (down then back up), case-insensitively."""
    upper = transform("C")
    lower = transform("c")
    assert upper == lower
    assert lower == ([0, 0.5, 1.0], [0, -0.5, 0])
Пример #15
0
def test_transform_G():
    """The default transform draws G as a steady rise, case-insensitively."""
    upper = transform("G")
    lower = transform("g")
    assert upper == lower
    assert lower == ([0, 0.5, 1.0], [0, 0.5, 1.0])
Пример #16
0
def test_transform_T():
    """The default transform draws T as a steady fall, case-insensitively."""
    upper = transform("T")
    lower = transform("t")
    assert upper == lower
    assert lower == ([0, 0.5, 1.0], [0, -0.5, -1.0])
Пример #17
0
def test_end_x_value(s):
    """The last x value of "yau-bp" equals the total number of A, T, G and C bases."""
    coords = transform(s, method="yau-bp")
    base_total = sum(s.count(base) for base in "ATGC")
    assert coords[0][-1] == approx(base_total)
Пример #18
0
def test_A():
    """"gates" moves one unit down on y for A, regardless of case."""
    upper = transform("A", method="gates")
    lower = transform("a", method="gates")
    assert upper == lower
    assert lower == ([0, 0], [0, -1])
Пример #19
0
def test_length(s):
    """"yau-bp" yields one point per base plus the starting (0, 0) coordinate."""
    coords = transform(s, method="yau-bp")
    x_count = len(coords[0])
    y_count = len(coords[1])
    assert x_count == y_count
    # the extra 1 is for the starting (0, 0) coord
    assert y_count == len(s) + 1
Пример #20
0
def test_invalid():
    """The "yau-bp" method raises ValueError for the sequence "invalid"."""
    bad_sequence = "invalid"
    with pytest.raises(ValueError):
        transform(bad_sequence, method="yau-bp")
Пример #21
0
def test_length(s):
    """The default transform yields two points per base plus the starting (0, 0) coordinate."""
    coords = transform(s)
    x_count = len(coords[0])
    y_count = len(coords[1])
    assert x_count == y_count
    # the extra 1 is for the starting (0, 0) coord
    assert y_count == 2 * len(s) + 1
Пример #22
0
def test_T():
    """"yau" maps T to the step (0.5, sqrt(3)/2), regardless of case."""
    upper = transform("T", method="yau")
    lower = transform("t", method="yau")
    assert upper == lower
    assert lower == ([0, 0.5], [0, (3 ** 0.5) / 2])
Пример #23
0
def test_invalid_method():
    """An unknown method name raises ValueError even for an empty sequence."""
    unknown_method = "invalid"
    with pytest.raises(ValueError):
        transform("", method=unknown_method)
Пример #24
0
def test_C():
    """"yau" maps C to the step (sqrt(3)/2, 0.5), regardless of case."""
    upper = transform("C", method="yau")
    lower = transform("c", method="yau")
    assert upper == lower
    assert lower == ([0, (3 ** 0.5) / 2], [0, 0.5])
Пример #25
0
def test_transform_A():
    """The default transform draws A as a bump (up then back down), case-insensitively."""
    upper = transform("A")
    lower = transform("a")
    assert upper == lower
    assert lower == ([0, 0.5, 1.0], [0, 0.5, 0])
Пример #26
0
def visualize(fasta, width, palette, color, hide, bar, title, separate, cols, link_x, link_y, output, offline, method, dimensions, skip, mode):
    """Transform the sequences in one or more FASTA files and plot them as lines.

    Args:
        fasta: Collection of FASTA file paths; must not be ``None`` or empty.
        width: Line width passed to each plotted line.
        palette: Key into ``small_palettes`` selecting the colour palette.
        color: When falsy, every sequence is drawn in black.
        hide: When truthy, legend entries are always created and clicking
            one hides the corresponding line.
        bar: Show a progress bar when more than one sequence is processed.
        title: Optional plot title (also used as the HTML document title).
        separate: Draw one figure per sequence (``mode == "seq"``) or per
            file (``mode == "file"``) instead of a single shared figure.
        cols: Number of grid columns when ``separate`` is set; ``0`` selects
            the automatic default (ceil of the square root of the figure count).
        link_x: Share each figure's x range with the previous figure.
        link_y: Share each figure's y range with the previous figure.
        output: If it ends in ``".html"`` the plot is saved there; otherwise
            the plot is shown.
        offline: Save with inline resources.
        method: Transformation method; passed to ``transform`` and used to
            pick axis labels.
        dimensions: ``(width, height)`` of each figure; defaults to (750, 500).
        skip: Suppress warnings and confirmation prompts.
        mode: ``"auto"``, ``"seq"`` (colour/group per sequence) or ``"file"``
            (colour/group per file).

    Raises:
        ValueError: If no FASTA file is given, or if ``separate`` is set and
            ``mode`` is neither ``"seq"`` nor ``"file"`` after resolution.
    """
    # an empty collection of files is as unusable as a missing one
    if not fasta:
        raise ValueError("Must provide FASTA file.")

    # handle selecting the palette
    palette = small_palettes[palette]

    # handle setting the dimensions automatically if not specified
    if not dimensions:
        dimensions = (750, 500)

    if len([record for _f in fasta for record in Fasta(_f)]) > len(palette) and mode != "file":
        if len(fasta) > 1 and mode == "auto":
            if not skip:
                print("Visualizing each file in separate color. To override, provide mode selection.")
            mode = "file"
        else:
            print("Visualizing each sequence in black.")
            color = False
            # "auto" must not leak past this point: the figure bookkeeping
            # below only understands "seq" and "file"
            if mode == "auto":
                mode = "seq"
    elif mode == "auto":
        mode = "seq"

    # get all the sequences
    seqs = []
    color_counter = 0
    warned = False
    for _f in fasta:
        for seq in Fasta(_f, sequence_always_upper=True):
            # the palette is keyed by size: use at least the 3-colour variant
            # and grow it as more colours are needed
            seqs.append(Box(color=palette[color_counter + 1 if color_counter > 2 else 3][color_counter] if color else "black",
                            name=_f if mode == "file" else seq.name,
                            raw_seq=seq))

            # check the length of the seq (only prompt once)
            if len(seq) > 10000 and not skip and not warned:
                click.confirm("You are plotting long sequence ({} bp). This may be very slow. "
                              "Do you want to continue?".format(len(seq)), abort=True)
                warned = True

            if mode == "seq":
                color_counter += 1
        if mode == "file":
            color_counter += 1

    # warn if plotting a large number of seqs
    if len(seqs) > 500 and not skip:
        click.confirm("You are plotting a large number of sequences ({}). This may be very slow. "
                      "Do you want to continue?".format(len(seqs)), abort=True)

    # warn if using a bad method
    if max(len(seq.raw_seq) for seq in seqs) > 25 and method in ["qi", "randic"] and not skip:
        click.confirm("This method is not well suited to a sequence of this length. "
                      "Do you want to continue?", abort=True)

    axis_labels = {
        "squiggle": {"x": "position (BP)",
                     "y": None},
        "gates": {"x": "C-G axis",
                  "y": "A-T axis"},
        "yau": {"x": None,
                "y": None},
        "yau-bp": {"x": "position (BP)",
                   "y": None},
        "randic": {"x": "position (BP)",
                   "y": "nucleotide"},
        "qi": {"x": "position (BP)",
               "y": "dinucleotide"}
    }

    # the number of figures to draw is either the number of sequences or files (or 1)
    if separate:
        if mode == "seq":
            fig_count = len(seqs)
        elif mode == "file":
            fig_count = len(fasta)
        else:
            raise ValueError("Unsupported mode for separate figures: {!r}".format(mode))
    else:
        fig_count = 1

    fig = []
    for i in range(fig_count):

        # link the axes, if requested
        if i > 0 and link_x:
            x_range = fig[i - 1].x_range
        else:
            x_range = None
        if i > 0 and link_y:
            y_range = fig[i - 1].y_range
        else:
            y_range = None

        # the y axes for randic and qi are bases
        if method == "randic":
            y_range = ["A", "T", "G", "C"]
        elif method == "qi":
            y_range = ['AA',
                       'AC',
                       'AG',
                       'AT',
                       'CA',
                       'CC',
                       'CG',
                       'CT',
                       'GA',
                       'GC',
                       'GG',
                       'GT',
                       'TA',
                       'TC',
                       'TG',
                       'TT']

        fig.append(figure(x_axis_label=axis_labels[method]["x"],
                          y_axis_label=axis_labels[method]["y"],
                          title=title,
                          x_range=x_range,
                          y_range=y_range,
                          plot_width=dimensions[0],
                          plot_height=dimensions[1]))

    # show a progress bar if processing multiple files
    if len(seqs) > 1 and bar:
        _seqs = tqdm(seqs, unit=" seqs", leave=False)
    else:
        _seqs = seqs

    for i, seq in enumerate(_seqs):
        # perform the actual transformation
        transformed = transform(str(seq.raw_seq), method=method)

        # figure (no pun intended) which figure to plot the data on
        if separate:
            if mode == "seq":
                _fig = fig[i]
            elif mode == "file":
                # in file mode the Box name is the file path itself
                _fig = fig[fasta.index(seq.name)]

            # add a title to the plot
            _fig.title = annotations.Title()
            if mode == "seq":
                _fig.title.text = seq.name
            elif mode == "file":
                _fig.title.text = click.format_filename(seq.name, shorten=True)
        else:
            _fig = fig[0]
            _fig.title = annotations.Title()

            # if only plotting on one figure, set up the title
            if title:
                _fig.title.text = title
            elif len(seqs) > 1 and len(fasta) == 1:
                _fig.title.text = click.format_filename(fasta[0], shorten=True)
            elif len(seqs) == 1:
                # if just plotting one sequence, title it with the name of the sequence
                _fig.title.text = seq.name

        # randic and qi methods have categorical y axes
        if method == "randic":
            y = list(seq.raw_seq)
        elif method == "qi":
            pairs = [seq.raw_seq[start:start + 2] for start in range(len(seq.raw_seq))]
            # the final slice holds a single base and is dropped
            y = [str(pair) for pair in pairs if len(pair) == 2]
        else:
            y = transformed[1]

        # figure out whether to add a legend
        if (separate or not color or mode == "file" or len(seqs) == 1) and not hide:
            legend = None
        else:
            legend = click.format_filename(seq.name, shorten=True)

        # optimization for comparing large FASTA files without hiding:
        # only the last sequence of each file (colour change) gets a legend entry
        try:
            if mode == "file" and seqs[i + 1].color != seq.color and not separate:
                legend = click.format_filename(seq.name, shorten=True)
        except IndexError:
            # the very last sequence has no successor to compare against
            if mode == "file" and not separate:
                legend = click.format_filename(seq.name, shorten=True)

        # do the actual plotting
        _fig.line(x=transformed[0],
                  y=y,
                  line_width=width,
                  legend=legend,
                  color=seq.color)

        # set up the legend
        _fig.legend.location = "top_left"
        if hide:
            _fig.legend.click_policy = "hide"

    # clean up the tqdm bar (a plain list has no close())
    try:
        _seqs.close()
    except AttributeError:
        pass

    # lay out the figure
    if separate:
        plot = gridplot(fig,
                        ncols=math.ceil(len(fig)**0.5) if cols == 0 else cols,
                        toolbar_options=dict(logo=None)) # note that 0 denotes the automatic default
    else:
        plot = fig[0]

    if output is not None and output.endswith(".html"):
        # use the user's title for the document, falling back to a default
        output_file(output, title=title if title is not None else "Squiggle Visualization")
        save(plot, resources=INLINE if offline else None)
    else:
        show(plot)
Пример #27
0
def test_T():
    """"gates" moves one unit up on y for T, regardless of case."""
    upper = transform("T", method="gates")
    lower = transform("t", method="gates")
    assert upper == lower
    assert lower == ([0, 0], [0, 1])
Пример #28
0
def test_A():
    """"yau" maps A to the step (0.5, -sqrt(3)/2), regardless of case."""
    upper = transform("A", method="yau")
    lower = transform("a", method="yau")
    assert upper == lower
    assert lower == ([0, 0.5], [0, -(3 ** 0.5) / 2])
Пример #29
0
def test_randic(s):
    """Each y value of the "randic" transform matches the `randic` lookup of its base."""
    coords = transform(s, method="randic")
    for position, base in enumerate(s):
        observed = coords[1][position]
        assert observed == randic[base]