Exemplo n.º 1
0
    def generate_matching_sequence(self, sequence, core, width):
        """
        Yield every window of ``width`` bases whose centre matches the core.

        The sequence is scanned one position at a time. A window is reported
        when the bases at its centre equal ``core`` or the reverse complement
        of ``core``. Windows containing an unknown base ('N') are skipped
        because no prediction can be made on them.

        :param sequence: The sequence to search, such as the whole sequence for a chromosome.
                Can be a string or a Bio.Seq
        :param core: The bases for which to search, in the center
        :param width: The desired sub-sequence width, e.g. 36
        :return: Generator of ``(start, core_pos, sequences)`` tuples. ``start``
                is the window offset in ``sequence``, ``core_pos`` is the offset
                of the core in ``sequence`` and ``sequences`` is a tuple of
                strings: the window itself when the core matches, the reverse
                complement of the window when the reverse-complemented core
                matches, or both when the core is palindromic (the caller
                decides which one to use).
        """
        core_rc = bio.revcompstr(core)
        core_len = len(core)
        # A palindromic core equals its own reverse complement, so a hit is
        # ambiguous with respect to the strand.
        palindromic = core == core_rc
        core_start = (width - core_len) // 2
        max_start = len(sequence) - width
        for start in range(max_start + 1):
            window_sequence = sequence[start:start + width]
            # If any of the bases in the window are unknown, we cannot predict on the sequence
            if 'N' in window_sequence:
                continue
            window_core = window_sequence[core_start:core_start + core_len]
            core_pos = core_start + start
            if window_core == core:
                if palindromic:
                    yield start, core_pos, (
                        str(window_sequence),
                        str(bio.revcompstr(window_sequence)),
                    )
                else:
                    # Always hand back plain strings, like the other branches do.
                    yield start, core_pos, (str(window_sequence), )
            elif window_core == core_rc:
                yield start, core_pos, (str(bio.revcompstr(window_sequence)), )
Exemplo n.º 2
0
def fixdup_single_name(nm, df):
    """
    Attach a group label to the rows of ``df`` based on sequence similarity.

    Every distinct wild-type ("wt") sequence is compared against every
    distinct sequence in ``df`` with ``SequenceMatcher``; the four most similar
    sequences of each wild type are labelled ``"<nm>_<i>"``, where ``i`` is the
    index of the matching "o1" wild-type sequence (a sequence and its reverse
    complement share the same index).

    Args:
        nm: prefix used for the group label
        df: data frame with at least the columns "Sequence", "type" and "ori"
    Return:
        ``df`` inner-merged with the (Sequence, group) assignments, i.e. with an
        additional "group" column; rows whose sequence got no group are dropped.
    Raises:
        KeyError: if a wild-type sequence is neither an "o1" wild-type sequence
            nor the reverse complement of one.
    """
    # ``columns=`` is required: a bare mapping would rename index labels.
    wt_seqs = df[df["type"] == "wt"][["Sequence"]].rename(
        columns={"Sequence": "Sequence_x"}).drop_duplicates()
    all_seqs = df[["Sequence"]].rename(
        columns={"Sequence": "Sequence_y"}).drop_duplicates()

    o1_wt_seqs = df[(df["type"] == "wt")
                    & (df["ori"] == "o1")]["Sequence"].drop_duplicates().tolist()
    rcgrp = {}
    for i, seq in enumerate(o1_wt_seqs):
        rcgrp[seq] = i
        rcgrp[bio.revcompstr(seq)] = i

    # get cartesian product between the two via a constant join key
    comb = wt_seqs.assign(key=1).merge(all_seqs.assign(key=1),
                                       on="key").drop(columns="key")

    comb["simscore"] = comb.apply(lambda row: SequenceMatcher(
        None, row["Sequence_x"], row["Sequence_y"]).ratio(),
                                  axis=1)
    comb = comb.sort_values(["Sequence_x", "simscore"],
                            ascending=[True, False])
    # keep the 4 best matches per wild type
    # NOTE(review): presumably wt + m1/m2/m3 of the same probe -- confirm
    selectedcmb = comb.groupby("Sequence_x").head(4).rename(
        columns={"Sequence_y": "Sequence"})
    selectedcmb["group"] = selectedcmb["Sequence_x"].apply(
        lambda x: "%s_%s" % (nm, rcgrp[x]))
    selectedcmb = selectedcmb[["Sequence", "group"]].drop_duplicates()

    return df.merge(selectedcmb, on="Sequence")
Exemplo n.º 3
0
def gen_training(df, pwm, kompas):
    """
    Build the training table, re-annotating "-/-" rows on the opposite strand.

    Site positions are computed with ``get_sites_pos``. Rows whose orientation
    is "-/-" are reverse complemented and passed through ``get_sites_pos``
    again, then combined with the untouched rows.

    Args:
        df: input data frame handed to ``get_sites_pos``
        pwm: forwarded to ``get_sites_pos``
        kompas: forwarded to ``get_sites_pos``
    Return:
        De-duplicated data frame of the rows that were not "-/-" plus the
        re-annotated reverse-complemented rows.
    """
    train = get_sites_pos(df, kompas, pwm)
    # reverse -- to ++
    # Work on an explicit copy so the column assignment below does not write
    # into a slice of ``train`` (avoids pandas' SettingWithCopyWarning).
    flipped = train[train["orientation"] == "-/-"][[
        "Name", "Sequence", "label"
    ]].copy()
    flipped["Sequence"] = flipped["Sequence"].apply(bio.revcompstr)
    flipped = get_sites_pos(flipped, kompas, pwm)
    train = pd.concat([train[train["orientation"] != "-/-"], flipped])
    return train.drop_duplicates()
Exemplo n.º 4
0
def check_val(indf, wt, m1, m2, m3):
    """
    Print a per-orientation comparison of wild-type and mutant intensities.

    Rows of ``indf`` whose "Sequence" equals one of the four given sequences or
    one of their reverse complements are selected. For each orientation the
    wild-type "Alexa488Adjusted" values are compared with ``m1 + m2 - m3``
    using ``st.wilcox(..., "greater")``, and the result is printed.

    Args:
        indf: data frame with the columns "Sequence", "ori", "type", "rep" and
            "Alexa488Adjusted"
        wt: wild-type sequence
        m1, m2, m3: the mutant sequences
    Return:
        None; the wild-type value dictionary and one result per orientation
        ("o1", "o2") are printed.
    """
    forward = [wt, m1, m2, m3]
    targets = forward + [bio.revcompstr(seq) for seq in forward]

    # OR together one equality mask per target sequence
    wanted = indf["Sequence"] == targets[0]
    for target in targets[1:]:
        wanted = wanted | (indf["Sequence"] == target)
    df = indf[wanted].sort_values(["ori", "type", "rep"])

    wt_rows = df[df["type"] == "wt"]
    twodict = wt_rows.groupby('ori')['Alexa488Adjusted'].apply(list).to_dict()
    print(twodict)

    value_cols = ['ori', 'Alexa488Adjusted']
    m1df = df[df["type"] == "m1"][value_cols]
    m2df = df[df["type"] == "m2"][value_cols]
    m3df = df[df["type"] == "m3"][value_cols]

    # After the two merges the m1/m2/m3 values sit in the "_x", "_y" and
    # un-suffixed columns respectively.
    onedf = m1df.merge(m2df, on='ori')
    onedf = onedf.merge(m3df, on='ori')
    m1_plus_m2 = onedf["Alexa488Adjusted_x"] + onedf["Alexa488Adjusted_y"]
    onedf['indiv'] = m1_plus_m2 - onedf["Alexa488Adjusted"]
    onedict = onedf.groupby('ori')['indiv'].apply(list).to_dict()

    for ori in ('o1', 'o2'):
        # NOTE(review): presumably a one-sided Wilcoxon test -- confirm in st
        p = st.wilcox(twodict[ori], onedict[ori], "greater")
        print(ori, p)
Exemplo n.º 5
0
def get_relative_orientation(sequence, predictor, htth=True):
    """
    Get orientation of the 2 sites in a sequence

    The core of each of the first two predicted sites is cut out of the
    sequence and looked up among the cores of the predictor's models (forward
    strand) and their reverse complements (reverse strand). A core found in
    neither list is reported with a printed message.

    Args:
        sequence: the sequence that contains the two sites
        predictor: object providing ``predict_sequence``, ``corewidth`` and
            ``models`` (each model having a ``core``)
        htth: if True, predict HT or TH as HT/TH
    Return:
        'HT/TH' (or "HT" / "TH" when ``htth`` is False) when both cores are on
        the same strand, 'HH' for forward then reverse, 'TT' for reverse then
        forward, and '-1' when at least one core could not be matched.
    """
    sites = predictor.predict_sequence(sequence)
    first, second = sites[0], sites[1]
    site_cores = [
        sequence[first["core_mid"] - predictor.corewidth//2:first["core_mid"] + predictor.corewidth//2],
        sequence[second["core_mid"] - predictor.corewidth//2:second["core_mid"] + predictor.corewidth//2],
    ]
    pos_cores = [model.core for model in predictor.models]
    neg_cores = [bio.revcompstr(core) for core in pos_cores]

    # 1 = forward core, -1 = reverse-complement core, 0 = unknown
    strands = []
    for site_core in site_cores:
        if site_core in pos_cores:
            strands.append(1)
        elif site_core in neg_cores:
            strands.append(-1)
        else:
            strands.append(0)
            print("couldn't find the site %s in %s in the core list" % (site_core,sequence))

    labels = {
        (1, 1): 'HT/TH' if htth else "HT",
        (-1, -1): 'HT/TH' if htth else "TH",
        (1, -1): 'HH',
        (-1, 1): 'TT',
    }
    return labels.get(tuple(strands), '-1')
                     basepath)[["Name",
                                "Sequence"]].rename(columns={
                                    "Sequence": "sequence"
                                }).drop_duplicates()
    df.merge(fullpos, on="sequence")[[
        "Name", "ets_score", "runx_score"
    ]].drop_duplicates().sort_values(by=["Name"]).to_csv("pwm_allseq.csv",
                                                         index=False)

    dft = dft.merge(fullpos, on="sequence")
    dft = dft.drop(columns=["fullseq", "ori"]).drop_duplicates()
    dft = dft[dft["distance"] != 4]
    # flip ets runx position
    for index, row in dft.iterrows():
        if row['ets_pos'] > row['runx_pos']:
            dft.at[index, 'sequence'] = bio.revcompstr(row["sequence"])
            dft.at[index,
                   'ets_start'] = len(row["sequence"]) - row["ets_pos"] - 4
            dft.at[index, 'ets_pos'] = dft.at[index, 'ets_start'] + 1
            dft.at[index, 'ets_ori'] = 1 if row['ets_ori'] == -1 else 0
            dft.at[index,
                   'runx_start'] = len(row["sequence"]) - row["runx_pos"] - 5
            dft.at[index, 'runx_pos'] = dft.at[index, 'runx_start'] + 2
            dft.at[index, 'runx_ori'] = 1 if row['runx_ori'] == -1 else 0
        else:
            dft.at[index, 'ets_ori'] = 0 if row['ets_ori'] == -1 else 1
            dft.at[index, 'runx_ori'] = 0 if row['ets_ori'] == -1 else 1

    orimap = {0: "-", 1: "+"}
    #dft["orientation"] = dft.apply(lambda x: "%s%s" % (str(x["ets_ori"]), str(x["runx_ori"])),axis=1)
    dft["orientation"] = dft.apply(
Exemplo n.º 7
0
def mutate_orientation(seqdf,
                       imads,
                       escore,
                       deep=0,
                       escore_cutoff=0.4,
                       escore_gap=0,
                       idcol="id"):
    """
    Make mutation for orientation

    Flip (reverse complement) the first site, the second site, or both. We
    only use HH, HT, TT orientation (i.e. no TH): a mutant that comes out as
    TH is skipped and HT is reported as "HT/TH". When the two core midpoints
    are closer than imads.sitewidth, each flipped window is trimmed by
    ceil((sitewidth - distance) / 2) bases on both sides.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate; needs a
            "sequence" column and the column named by idcol
        2. imads: imads model to predict the strength of the mutants
        3. escore: passed to DNASequence together with escore_cutoff and
            escore_gap to obtain the sites of a sequence
        4. deep: how far we permit distance to go under imads.sitewidth. The
            minimum distance is set to imads.sitewidth - deep.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: column of seqdf whose value is stored as "seqid"
     Returns:
        A data frame with changed orientations; each wild type is followed by
        its mutants, and wild types without any valid mutant are left out.
    """
    # we need to get orientation information, this already filter if each sequence has 2 sites
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already include the orientation

    def make_record(row, seq, seq_sites, comment, orientation):
        # One output row; key order defines the column order of the result.
        return {
            "seqid": row[idcol],
            "sequence": str(seq),
            "site1_pos": seq_sites[0]["core_mid"],
            "site1_affinity": seq_sites[0]["score"],
            "site2_pos": seq_sites[1]["core_mid"],
            "site2_affinity": seq_sites[1]["score"],
            "distance": seq_sites[1]["core_mid"] - seq_sites[0]["core_mid"],
            "muttype": "orientation",
            "comment": comment,
            "wtlabel": row["label"],
            "orientation": orientation
        }

    mutres = []
    flipsites = [[0], [1], [0, 1]]  # which sites to flip
    nrow = om.shape[0]
    div = max(1, nrow // 100)  # report progress roughly every 1%
    mindist = imads.sitewidth - deep
    for done, (_, row) in enumerate(om.iterrows()):
        if done % div == 0:
            print("Mutating orientation, progress {:.2f}% ({}/{})".format(
                done * 100 / nrow, done, nrow))
        sites, _ = DNASequence(row["sequence"], imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        if len(sites) != 2:
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]
        if curdist < mindist:
            continue
        mutres_cur = [
            make_record(row, row["sequence"], sites, "wt", row["orientation"])
        ]
        # Trim the flipped windows when the sites are closer than sitewidth;
        # this only depends on the wild type, so compute it once per row.
        adjust = 0 if curdist >= imads.sitewidth else int(
            math.ceil(float(imads.sitewidth - curdist) / 2))
        for fs in flipsites:
            newseq = row["sequence"]
            for i in fs:
                start = sites[i]["site_start"] + adjust
                end = sites[i]["site_start"] + sites[i]["site_width"] - adjust
                toflip = bio.revcompstr(row["sequence"][start:end])
                newseq = newseq[:start] + toflip + newseq[end:]
            newsites, _ = DNASequence(newseq, imads, escore, escore_cutoff,
                                      escore_gap).get_sites()
            if len(newsites) != 2:  # we ignore if there are new sites
                continue
            newori = cg.get_relative_orientation(newseq, imads, htth=False)
            if newori == "TH":
                continue  # skip if TH since we use HT
            if newori == "HT":
                newori = "HT/TH"
            mutres_cur.append(
                make_record(row, newseq, newsites, "to_%s" % newori, newori))
        # keep the wild type only when at least one mutant was produced
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
Exemplo n.º 8
0
def mutate_affinity(seqdf,
                    imads,
                    escore,
                    deep=0,
                    escore_cutoff=0.4,
                    escore_gap=0,
                    idcol="id"):
    """
    Make mutation to change the affinity (i.e. strength) prediction.

    First, mutations are made for each core to its other core versions, e.g. if
    the core is GGAA and the alternate is GGAT, we simply change GGAA -> GGAT.
    Then mutate the core flanking regions up to imads.sitewidth, e.g. if the
    core length is 4 and sitewidth is 12 then we can mutate up to (12-4)/2=4bp
    to each side. When the distance between the sites is below sitewidth, a
    barrier of sitewidth - distance is passed to mutate_flanks.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate; needs a
            "sequence" column and the column named by idcol
        2. imads: imads model to predict the strength of the mutants
        3. escore: passed to DNASequence together with escore_cutoff and
            escore_gap to obtain the sites of a sequence
        4. deep: the minimum distance between sequence is set to be
            imads.sitewidth - deep. Default is 0, which means we keep sitewidth
            as minimum distance. When deep is > 0, we make barrier at the other
            site so we don't change its affinity prediction.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: column of seqdf whose value is stored as "seqid"
     Returns:
        A data frame of sequences with SNPs that change its affinity; each wild
        type is followed by its mutants, and wild types without any valid
        mutant are left out.
     Raises:
        ValueError: if deep is negative
    """
    if deep < 0:
        raise ValueError("Minimum deep is 0")

    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already include the orientation

    # first make map for mutating between core
    mdlcores_fw = [m.core for m in imads.models]
    mdlcores_rc = [bio.revcompstr(m) for m in mdlcores_fw]
    fwdict = {src: dst for src, dst in itertools.permutations(mdlcores_fw, 2)}
    rcdict = {src: dst for src, dst in itertools.permutations(mdlcores_rc, 2)}
    coremap = {**fwdict, **rcdict}

    def make_record(row, seq, seq_sites, comment, orientation):
        # One output row; key order defines the column order of the result.
        return {
            "seqid": row[idcol],
            "sequence": seq,
            "site1_pos": seq_sites[0]["core_mid"],
            "site1_affinity": seq_sites[0]["score"],
            "site2_pos": seq_sites[1]["core_mid"],
            "site2_affinity": seq_sites[1]["score"],
            "distance": seq_sites[1]["core_mid"] - seq_sites[0]["core_mid"],
            "muttype": "affinity",
            "comment": comment,
            "wtlabel": row["label"],
            "orientation": orientation
        }

    # prepare the variable
    mindist = imads.sitewidth - deep
    mutres = []
    nrow = om.shape[0]
    div = max(1, nrow // 100)  # report progress roughly every 1%
    for done, (_, row) in enumerate(om.iterrows()):
        if done % div == 0:
            print("Mutating affinity, progress {:.2f}% ({}/{})".format(
                done * 100 / nrow, done, nrow))
        # we use DNASequence object to have overlap with escore
        sites, _ = DNASequence(row["sequence"], imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        if len(sites) != 2 or sites[1]["core_mid"] - sites[0][
                "core_mid"] < mindist:
            continue
        mutres_cur = [
            make_record(row, str(row["sequence"]), sites, "wt",
                        row["orientation"])
        ]
        mids = [s["core_mid"] for s in sites]

        # 1. Mutate the core to the other version
        coremt = mutate_cores(row["sequence"], mids, coremap)

        # 2. Mutate the flanks up to the sitewidth
        barrierlen = imads.sitewidth - row["distance"] if row[
            "distance"] < imads.sitewidth else 0
        flankmt = mutate_flanks(row["sequence"],
                                mids,
                                imads.corewidth,
                                imads.sitewidth,
                                barrier=barrierlen)

        for mutant in coremt + flankmt:
            newsites, _ = DNASequence(mutant["sequence"], imads, escore,
                                      escore_cutoff, escore_gap).get_sites()
            if len(newsites) != 2:
                continue
            newori = cg.get_relative_orientation(mutant["sequence"],
                                                 imads,
                                                 htth=True)
            mutres_cur.append(
                make_record(row, mutant["sequence"], newsites,
                            mutant["comment"], newori))
        # keep the wild type only when at least one mutant was produced
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
Exemplo n.º 9
0
    negdf = pd.read_csv(
        "output/array_design_files/Coop1Ets/Coop1Ets_NegCtrl.txt",
        sep="\t",
        header=None)
    print(negdf)
    import sys
    sys.exit(0)

    negctrl = negdf[0].tolist()
    negids = [x.split("_")[3] for x in negdf[2].tolist()]
    validneg = []
    for neg_idx in range(0, len(negctrl)):
        cur_pos_seq = negctrl[neg_idx]
        for ori in ["o1", "o2"]:
            checkseq = cur_pos_seq if ori == "o1" else bio.revcompstr(
                cur_pos_seq)
            cj = clean_junction(checkseq, primer, imads12, escore)
            if not cj:
                print("Problem in clean junction")
                flag = False
                break
            if cj != checkseq + primer:
                if ori == "o1":
                    cur_pos_seq = str(cj[:len(cur_pos_seq)])
                elif ori == "o2":  # change o1
                    cur_pos_seq = bio.revcompstr(str(cj[:len(cur_pos_seq)]))
        if len(imads12.predict_sequence(cur_pos_seq + primer)) > 0:
            print("Error processing sequence", cur_pos_seq)
        else:
            validneg.append({"sequence": cur_pos_seq, "type": negids[neg_idx]})