def translate(word):
    """Encode a nucleotide string as a list of integer codes.

    Mapping: A -> 0, T -> 1, G -> 2, C -> 3.  Any character outside
    "ATGC" raises KeyError.
    """
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]


def translateRaw(array):
    """Decode an iterable of integer codes back into a nucleotide string.

    Inverse of translate(): 0 -> A, 1 -> T, 2 -> G, 3 -> C.  Any other
    value raises KeyError.
    """
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join([letters[code] for code in array])


# Positional command-line arguments, in order.
scoreFolder = sys.argv[1]
upstreamList = sys.argv[2]
fullUpstreamList = sys.argv[3]

# The "temp" folder is the one handed to the `current` PatternFinder below.
if not os.path.exists("temp"):
    os.mkdir("temp")

current = PatternFinder(6, "temp")
current.loadSequences(upstreamList)

full = PatternFinder(6, scoreFolder)
full.loadSequences(fullUpstreamList)

temp = PatternFinder(6, scoreFolder)


def makeTextShuffleControl(filename, minLen=10000):
    # NOTE(review): this function continues past the end of the visible
    # chunk; the statements below are reproduced unchanged.
    # Lines of 10 characters or fewer (newline included) are skipped; the
    # remaining lines are encoded with translate() and the array transposed.
    cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T
    controls = []
    repeat = 1 + minLen / len(cdata[0])
[latent, coeff] = scipy.sparse.linalg.eigsh(covM, numPCs) if verbose: print "Eigenvalues are:", latent return (np.transpose(coeff[:, ::-1]), latent[::-1]) def translate(word): return [{"A":0, "T":1, "G":2, "C":3}[i] for i in word] def translateRaw(array): return "".join([{0:"A", 1:"T", 2:"G", 3:"C"}[i] for i in array]) a = PatternFinder(6, scoreFolder) a.loadSequences(sequenceFile) allSeqs = a.rawSequences data = pd.read_csv(os.path.join(scoreFolder, "sortedBy/Best10000_sortBy_ScoreNew_8.csv")) lef = data["Pos l"].values rig = data["Pos r"].values mask = (abs(rig - POSITION_RIGHT) < 4) * (abs(lef - POSITION_LEFT) < 4) data = data[mask] patLeftBest = data["Patt l"].values[0] patRightBest = data["Patt r"].values[0] data = data[:300] assert len(data) > 90 # check that we have at least 90 unique patterns
This is where all the other constants are defined for this part only.
ALPHA is the slope of the score for mismatches, defined in the paper
BEST_SINGLE is the number of best n-mers to use for each side of the pattern.
Then all pairwise combinations of BEST_SINGLE x BEST_SINGLE 6-mers will be evaluated at every pair of positions in the upstream
MAX_SCORE is the maximum score (offset + extension + mismatches) for the whole pattern
MAX_SCORE_SINGLE is the maximum score for scoring separate n-mers (used for selection of BEST_SINGLE only)
MAX_SUBS and MAX_SHIFT are maximum number of subs and maximum offset of a pattern
"""
a = PatternFinder(6, sys.argv[2])  # create a PatternFinder object bound to the folder provided as the second command line argument
a.loadSequences(sys.argv[1])  # sequences are loaded from the first command line argument

# If a third argument is provided, we interpret it as a set of locations
# where to evaluate the pattern: comma-separated entries, each a
# dash-separated group of integers (e.g. "10-35,12-40").
if len(sys.argv) >= 4:
    # Each entry becomes the result of map(int, ...) over its dash-separated parts.
    locations = [map(int, j.split("-")) for j in sys.argv[3].split(",")]
    print locations
else:
    # If not provided, the program will automatically select it.
    locations = None