def generate_matching_sequence(self, sequence, core, width):
    """
    Yield every window of ``width`` bases whose centre matches the core.

    A window of ``width`` bases is slid over ``sequence`` one position at a
    time. Inside each window, the ``len(core)`` bases starting at offset
    ``(width - len(core)) // 2`` are compared against ``core`` and against
    its reverse complement. Windows containing an ``'N'`` are skipped.

    :param sequence: The sequence to search, such as the whole sequence for
        a chromosome. Can be a string or a Bio.Seq
    :param core: The bases for which to search, in the center
    :param width: The desired sub-sequence width, e.g. 36
    :return: Generator yielding ``(start, core_pos, sequences)`` tuples,
        where ``start`` is the window start index in ``sequence``,
        ``core_pos`` is the index of the first core base in ``sequence`` and
        ``sequences`` is a tuple of plain strings:

        * ``(window, reverse_complement_of_window)`` when the core is its
          own reverse complement, so the caller can pick the better one;
        * ``(window,)`` when the window core equals ``core``;
        * ``(reverse_complement_of_window,)`` when the window core equals
          the reverse complement of ``core``.
    """
    core_rc = bio.revcompstr(core)
    core_len = len(core)
    # A palindromic core matches on both strands at the same position.
    is_palindrome = core == core_rc
    core_start = (width - core_len) // 2
    max_start = len(sequence) - width
    for start in range(max_start + 1):
        window_sequence = sequence[start:start + width]
        # If any of the bases in the window are unknown, we cannot predict
        # on the sequence.
        if 'N' in window_sequence:
            continue
        window_core = window_sequence[core_start:core_start + core_len]
        core_pos = core_start + start
        if window_core == core:
            # Always hand back plain strings so that every branch yields the
            # same type regardless of whether a str or a Bio.Seq was given.
            forward = str(window_sequence)
            if is_palindrome:
                yield start, core_pos, (
                    forward,
                    str(bio.revcompstr(window_sequence)),
                )
            else:
                yield start, core_pos, (forward, )
        elif window_core == core_rc:
            yield start, core_pos, (str(bio.revcompstr(window_sequence)), )
def fixdup_single_name(nm, df):
    """
    Attach a ``group`` label to the rows of ``df`` that resemble a wild type.

    For every distinct ``Sequence`` of the rows with ``type == "wt"``, the
    four most similar distinct sequences in ``df`` (by
    ``SequenceMatcher.ratio``) are selected and labelled ``"<nm>_<i>"``.
    ``i`` is the position of the wild-type sequence, or of its reverse
    complement, in the list of distinct ``wt``/``o1`` sequences.

    Args:
        nm: prefix used to build the group label.
        df: data frame with at least the columns ``Sequence``, ``type`` and
            ``ori``.

    Returns:
        ``df`` inner-merged on ``Sequence`` with the selected
        ``(Sequence, group)`` pairs.

    Raises:
        KeyError: if a wild-type sequence is neither one of the ``wt``/``o1``
            sequences nor the reverse complement of one.
    """
    is_wt = df["type"] == "wt"
    # ``columns=`` is required: a bare mapping renames index labels instead.
    wt_seqs = df[is_wt][["Sequence"]].rename(
        columns={"Sequence": "Sequence_x"}).drop_duplicates()
    all_seqs = df[["Sequence"]].rename(
        columns={"Sequence": "Sequence_y"}).drop_duplicates()

    wt_o1_seqs = df[is_wt & (df["ori"] == "o1")]["Sequence"] \
        .drop_duplicates().tolist()
    # Map each o1 wild type and its reverse complement to one group index.
    rcgrp = {}
    for idx, seq in enumerate(wt_o1_seqs):
        rcgrp[seq] = idx
        rcgrp[bio.revcompstr(seq)] = idx

    # Cartesian product between the two via a constant join key. ``assign``
    # works on copies, and ``drop(columns=...)`` avoids the positional
    # ``axis`` argument that pandas 2.x no longer accepts.
    comb = wt_seqs.assign(key=1) \
        .merge(all_seqs.assign(key=1), on="key") \
        .drop(columns="key")
    comb["simscore"] = [
        SequenceMatcher(None, seq_x, seq_y).ratio()
        for seq_x, seq_y in zip(comb["Sequence_x"], comb["Sequence_y"])
    ]
    comb = comb.sort_values(["Sequence_x", "simscore"],
                            ascending=[True, False])

    # Keep the 4 best-matching sequences for every wild type.
    selectedcmb = comb.groupby("Sequence_x").head(4).rename(
        columns={"Sequence_y": "Sequence"})
    selectedcmb["group"] = selectedcmb["Sequence_x"].apply(
        lambda x: "%s_%s" % (nm, rcgrp[x]))
    selectedcmb = selectedcmb[["Sequence", "group"]].drop_duplicates()
    return df.merge(selectedcmb, on="Sequence")
def gen_training(df, pwm, kompas):
    """
    Build the training table, re-orienting "-/-" rows via reverse complement.

    ``get_sites_pos`` is run on ``df``. Rows whose ``orientation`` is
    ``"-/-"`` are reduced to ``Name``/``Sequence``/``label``, have their
    sequence reverse complemented, and are passed through ``get_sites_pos``
    again. The result is concatenated with the remaining rows.

    Args:
        df: input frame handed to ``get_sites_pos``.
        pwm: forwarded to ``get_sites_pos``.
        kompas: forwarded to ``get_sites_pos``.

    Returns:
        The concatenated frame with duplicate rows dropped.
    """
    sites = get_sites_pos(df, kompas, pwm)

    # reverse -- to ++
    minus_minus = sites["orientation"] == "-/-"
    flipped = sites[minus_minus][["Name", "Sequence", "label"]]
    flipped["Sequence"] = flipped["Sequence"].apply(
        lambda seq: bio.revcompstr(seq))
    flipped = get_sites_pos(flipped, kompas, pwm)

    untouched = sites[sites["orientation"] != "-/-"]
    combined = pd.concat([untouched, flipped])
    return combined.drop_duplicates()
def check_val(indf, wt, m1, m2, m3):
    """
    Print per-orientation test results comparing wt against combined mutants.

    Rows of ``indf`` whose ``Sequence`` equals any of ``wt``, ``m1``, ``m2``,
    ``m3`` or one of their reverse complements are selected. For each
    orientation, the ``wt`` intensities are compared with
    ``m1 + m2 - m3`` intensities through ``st.wilcox(..., "greater")``
    (presumably a Wilcoxon test helper; confirm against ``st``).

    Args:
        indf: frame with columns ``Sequence``, ``ori``, ``type``, ``rep`` and
            ``Alexa488Adjusted``.
        wt, m1, m2, m3: the four sequences to look up.

    Returns:
        None. The wt dictionary and one ``(ori, p)`` line per orientation are
        printed.

    Raises:
        KeyError: if ``'o1'`` or ``'o2'`` is missing from either group.
    """
    seq_col = indf["Sequence"]
    wanted = seq_col == wt
    for candidate in (m1, m2, m3, bio.revcompstr(wt), bio.revcompstr(m1),
                      bio.revcompstr(m2), bio.revcompstr(m3)):
        wanted = wanted | (seq_col == candidate)
    df = indf[wanted].sort_values(["ori", "type", "rep"])

    wt_by_ori = df[df["type"] == "wt"] \
        .groupby('ori')['Alexa488Adjusted'].apply(list).to_dict()
    print(wt_by_ori)

    value_cols = ['ori', 'Alexa488Adjusted']
    m1df, m2df, m3df = (df[df["type"] == kind][value_cols]
                        for kind in ("m1", "m2", "m3"))
    # After the two merges the m1/m2/m3 intensities are suffixed _x, _y and
    # unsuffixed respectively.
    merged = m1df.merge(m2df, on='ori').merge(m3df, on='ori')
    merged['indiv'] = (merged["Alexa488Adjusted_x"] +
                       merged["Alexa488Adjusted_y"] -
                       merged["Alexa488Adjusted"])
    indiv_by_ori = merged.groupby('ori')['indiv'].apply(list).to_dict()

    for ori in ('o1', 'o2'):
        p = st.wilcox(wt_by_ori[ori], indiv_by_ori[ori], "greater")
        print(ori, p)
def get_relative_orientation(sequence, predictor, htth=True):
    """
    Get the relative orientation of the first two predicted sites.

    The core of each site is cut out of ``sequence`` around its
    ``core_mid`` and looked up in the predictor's model cores (forward) and
    in their reverse complements (reverse).

    Args:
        sequence: the sequence containing the sites.
        predictor: object providing ``predict_sequence(sequence)``,
            ``corewidth`` and ``models`` (each model having a ``core``).
        htth: if True, report both same-strand cases as ``'HT/TH'``;
            otherwise report ``"HT"`` (both forward) or ``"TH"`` (both
            reverse).

    Returns:
        ``'HT/TH'``/``"HT"``/``"TH"`` for same-strand sites, ``'HH'`` for
        forward then reverse, ``'TT'`` for reverse then forward, and
        ``'-1'`` when either core is not in the core list (a message is
        printed for each such core).
    """
    sites = predictor.predict_sequence(sequence)
    found_cores = []
    for site in (sites[0], sites[1]):
        mid = site["core_mid"]
        found_cores.append(sequence[mid - predictor.corewidth // 2:
                                    mid + predictor.corewidth // 2])

    pos_cores = [m.core for m in predictor.models]
    neg_cores = [bio.revcompstr(core) for core in pos_cores]

    strands = []
    for found in found_cores:
        if found in pos_cores:
            strands.append(1)
        elif found in neg_cores:
            strands.append(-1)
        else:
            strands.append(0)
            print("couldn't find the site %s in %s in the core list" %
                  (found, sequence))

    by_strands = {
        (1, 1): 'HT/TH' if htth else "HT",
        (-1, -1): 'HT/TH' if htth else "TH",
        (1, -1): 'HH',
        (-1, 1): 'TT',
    }
    return by_strands.get(tuple(strands), '-1')
basepath)[["Name", "Sequence"]].rename(columns={ "Sequence": "sequence" }).drop_duplicates() df.merge(fullpos, on="sequence")[[ "Name", "ets_score", "runx_score" ]].drop_duplicates().sort_values(by=["Name"]).to_csv("pwm_allseq.csv", index=False) dft = dft.merge(fullpos, on="sequence") dft = dft.drop(columns=["fullseq", "ori"]).drop_duplicates() dft = dft[dft["distance"] != 4] # flip ets runx position for index, row in dft.iterrows(): if row['ets_pos'] > row['runx_pos']: dft.at[index, 'sequence'] = bio.revcompstr(row["sequence"]) dft.at[index, 'ets_start'] = len(row["sequence"]) - row["ets_pos"] - 4 dft.at[index, 'ets_pos'] = dft.at[index, 'ets_start'] + 1 dft.at[index, 'ets_ori'] = 1 if row['ets_ori'] == -1 else 0 dft.at[index, 'runx_start'] = len(row["sequence"]) - row["runx_pos"] - 5 dft.at[index, 'runx_pos'] = dft.at[index, 'runx_start'] + 2 dft.at[index, 'runx_ori'] = 1 if row['runx_ori'] == -1 else 0 else: dft.at[index, 'ets_ori'] = 0 if row['ets_ori'] == -1 else 1 dft.at[index, 'runx_ori'] = 0 if row['ets_ori'] == -1 else 1 orimap = {0: "-", 1: "+"} #dft["orientation"] = dft.apply(lambda x: "%s%s" % (str(x["ets_ori"]), str(x["runx_ori"])),axis=1) dft["orientation"] = dft.apply(
def mutate_orientation(seqdf,
                       imads,
                       escore,
                       deep=0,
                       escore_cutoff=0.4,
                       escore_gap=0,
                       idcol="id"):
    """
    Make orientation mutants by reverse-complementing binding sites.

    For each wild-type sequence with exactly two sites at least
    ``imads.sitewidth - deep`` apart, the first site, the second site and
    both sites are flipped in turn. When the two sites are closer than
    ``imads.sitewidth``, the flipped region is trimmed on both sides by
    ``ceil((sitewidth - distance) / 2)``. Only HH, HT and TT are used: a
    mutant reported as ``"HT"`` is stored as ``"HT/TH"`` and one reported as
    ``"TH"`` is skipped. Mutants that no longer have exactly two sites are
    dropped.

    Args:
        seqdf: input data frame with the wt sequences to mutate; read for
            the columns ``sequence``, ``label`` and ``idcol``.
        imads: imads model to predict the strength of the mutants.
        escore: passed to ``DNASequence`` together with ``escore_cutoff``
            and ``escore_gap``.
        deep: how far the distance may go under ``imads.sitewidth``; the
            minimum distance is ``imads.sitewidth - deep``.
        escore_cutoff: forwarded to ``DNASequence``.
        escore_gap: forwarded to ``DNASequence``.
        idcol: name of the column holding the sequence id.

    Returns:
        A data frame with one wt row followed by its mutant rows, for each
        sequence that produced at least one mutant.
    """

    def build_record(row, seq, found, comment, orientation):
        # Key order here fixes the column order of the returned frame.
        return {
            "seqid": row[idcol],
            "sequence": str(seq),
            "site1_pos": found[0]["core_mid"],
            "site1_affinity": found[0]["score"],
            "site2_pos": found[1]["core_mid"],
            "site2_affinity": found[1]["score"],
            "distance": found[1]["core_mid"] - found[0]["core_mid"],
            "muttype": "orientation",
            "comment": comment,
            "wtlabel": row["label"],
            "orientation": orientation,
        }

    # we need to get orientation information, this already filter if each
    # sequence has 2 sites
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    # this already include the orientation
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence", how="inner")

    flipsites = ([0], [1], [0, 1])  # which sites to flip
    mindist = imads.sitewidth - deep
    nrow = om.shape[0]
    step = nrow // 100
    if step == 0:
        step = 1

    mutres = []
    for count, (_index, row) in enumerate(om.iterrows()):
        if count % step == 0:
            print("Mutating orientation, progress {:.2f}% ({}/{})".format(
                count * 100 / nrow, count, nrow))

        wt_seq = row["sequence"]
        sites, _sites_specific = DNASequence(wt_seq, imads, escore,
                                             escore_cutoff,
                                             escore_gap).get_sites()
        if len(sites) != 2:
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]
        if curdist < mindist:
            continue

        mutres_cur = [
            build_record(row, wt_seq, sites, "wt", row["orientation"])
        ]

        # Trim the flipped region when the sites are closer than a full
        # site width.
        if curdist >= imads.sitewidth:
            adjust = 0
        else:
            adjust = int(math.ceil(float(imads.sitewidth - curdist) / 2))

        for fs in flipsites:
            newseq = wt_seq
            for i in fs:
                site = sites[i]
                start = site["site_start"] + adjust
                end = site["site_start"] + site["site_width"] - adjust
                # The flipped bases are always taken from the wt sequence.
                toflip = bio.revcompstr(wt_seq[start:end])
                newseq = newseq[:start] + toflip + newseq[end:]

            newsites, _newsites_specific = DNASequence(
                newseq, imads, escore, escore_cutoff,
                escore_gap).get_sites()
            if len(newsites) != 2:  # we ignore if there are new sites
                continue

            newori = cg.get_relative_orientation(newseq, imads, htth=False)
            if newori == "TH":
                continue  # skip if TH since we use HT
            if newori == "HT":
                newori = "HT/TH"
            mutres_cur.append(
                build_record(row, newseq, newsites, "to_%s" % newori,
                             newori))

        # Keep the sequence only if at least one mutant survived.
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
def mutate_affinity(seqdf,
                    imads,
                    escore,
                    deep=0,
                    escore_cutoff=0.4,
                    escore_gap=0,
                    idcol="id"):
    """
    Make mutations that change the affinity (i.e. strength) prediction.

    First, each core is mutated to its other core version, e.g. if the core
    is GGAA and the alternate is GGAT, we simply change GGAA -> GGAT. Then
    the core flanking regions are mutated up to ``imads.sitewidth``, e.g.
    with core length 4 and sitewidth 12 we can mutate up to (12-4)/2=4bp on
    each side. When the row's ``distance`` is below ``imads.sitewidth``, a
    barrier of ``sitewidth - distance`` is passed to ``mutate_flanks``.
    Mutants that no longer have exactly two sites are dropped.

    Args:
        seqdf: input data frame with the wt sequences to mutate; read for
            the columns ``sequence``, ``label`` and ``idcol``.
        imads: imads model to predict the strength of the mutants.
        escore: passed to ``DNASequence`` together with ``escore_cutoff``
            and ``escore_gap``.
        deep: the minimum distance between sites is set to
            ``imads.sitewidth - deep``. Default is 0, which keeps sitewidth
            as the minimum distance.
        escore_cutoff: forwarded to ``DNASequence``.
        escore_gap: forwarded to ``DNASequence``.
        idcol: name of the column holding the sequence id.

    Returns:
        A data frame with one wt row followed by its mutant rows, for each
        sequence that produced at least one mutant.

    Raises:
        ValueError: if ``deep`` is negative.
    """
    if deep < 0:
        raise ValueError("Minimum deep is 0")

    def build_record(row, seq, found, comment, orientation):
        # Key order here fixes the column order of the returned frame.
        return {
            "seqid": row[idcol],
            "sequence": seq,
            "site1_pos": found[0]["core_mid"],
            "site1_affinity": found[0]["score"],
            "site2_pos": found[1]["core_mid"],
            "site2_affinity": found[1]["score"],
            "distance": found[1]["core_mid"] - found[0]["core_mid"],
            "muttype": "affinity",
            "comment": comment,
            "wtlabel": row["label"],
            "orientation": orientation,
        }

    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    # this already include the orientation
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence", how="inner")

    # first make map for mutating between core
    mdlcores_fw = [m.core for m in imads.models]
    mdlcores_rc = [bio.revcompstr(core) for core in mdlcores_fw]
    coremap = {
        src: dst
        for src, dst in itertools.permutations(mdlcores_fw, 2)
    }
    # Reverse-complement entries win on key clashes.
    coremap.update({
        src: dst
        for src, dst in itertools.permutations(mdlcores_rc, 2)
    })

    # prepare the variable
    mindist = imads.sitewidth - deep
    nrow = om.shape[0]
    step = nrow // 100
    if step == 0:
        step = 1

    mutres = []
    for count, (_index, row) in enumerate(om.iterrows()):
        if count % step == 0:
            print("Mutating affinity, progress {:.2f}% ({}/{})".format(
                count * 100 / nrow, count, nrow))

        wt_seq = row["sequence"]
        # we use DNASequence object to have overlap with escore
        sites, _sites_specific = DNASequence(wt_seq, imads, escore,
                                             escore_cutoff,
                                             escore_gap).get_sites()
        if len(sites) != 2 or \
                sites[1]["core_mid"] - sites[0]["core_mid"] < mindist:
            continue

        mutres_cur = [
            build_record(row, str(wt_seq), sites, "wt", row["orientation"])
        ]
        mids = [site["core_mid"] for site in sites]

        # 1. Mutate the core to the other version
        coremt = mutate_cores(wt_seq, mids, coremap)

        # 2. Mutate the flanks up to the sitewidth
        if row["distance"] < imads.sitewidth:
            barrierlen = imads.sitewidth - row["distance"]
        else:
            barrierlen = 0
        flankmt = mutate_flanks(wt_seq,
                                mids,
                                imads.corewidth,
                                imads.sitewidth,
                                barrier=barrierlen)

        for mutant in coremt + flankmt:
            mutseq = mutant["sequence"]
            newsites, _newsites_specific = DNASequence(
                mutseq, imads, escore, escore_cutoff,
                escore_gap).get_sites()
            if len(newsites) != 2:
                continue
            newori = cg.get_relative_orientation(mutseq, imads, htth=True)
            mutres_cur.append(
                build_record(row, mutseq, newsites, mutant["comment"],
                             newori))

        # Keep the sequence only if at least one mutant survived.
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
# Load the negative-control table (tab-separated, no header row) and dump it.
negdf = pd.read_csv(
    "output/array_design_files/Coop1Ets/Coop1Ets_NegCtrl.txt",
    sep="\t",
    header=None)
print(negdf)
import sys
# NOTE(review): this unconditional exit makes everything below unreachable;
# it looks like a leftover debugging stop -- confirm before relying on the
# clean-up loop that follows.
sys.exit(0)
# Column 0 holds the candidate sequences; the fourth "_"-separated field of
# column 2 is stored as each entry's "type".
negctrl = negdf[0].tolist()
negids = [x.split("_")[3] for x in negdf[2].tolist()]
validneg = []
for neg_idx in range(0, len(negctrl)):
    cur_pos_seq = negctrl[neg_idx]
    # Check the sequence itself ("o1") and its reverse complement ("o2").
    for ori in ["o1", "o2"]:
        checkseq = cur_pos_seq if ori == "o1" else bio.revcompstr(
            cur_pos_seq)
        cj = clean_junction(checkseq, primer, imads12, escore)
        if not cj:
            print("Problem in clean junction")
            # NOTE(review): `flag` is not read anywhere in this fragment.
            flag = False
            break
        # clean_junction returned something other than sequence + primer:
        # keep its leading len(cur_pos_seq) characters as the new sequence,
        # stored back in the o1 orientation.
        if cj != checkseq + primer:
            if ori == "o1":
                cur_pos_seq = str(cj[:len(cur_pos_seq)])
            elif ori == "o2":  # change o1
                cur_pos_seq = bio.revcompstr(str(cj[:len(cur_pos_seq)]))
    # Only sequences for which imads12 predicts no site (with the primer
    # appended) are kept.
    if len(imads12.predict_sequence(cur_pos_seq + primer)) > 0:
        print("Error processing sequence", cur_pos_seq)
    else:
        validneg.append({"sequence": cur_pos_seq, "type": negids[neg_idx]})