def LDA(line_list, temp):
    """
    Check a group of alignments with the temperature-specific LDA model.

    The model parameters (coefficients, intercept, classes) are hard-coded
    per temperature and loaded into a ``LinearDiscriminantAnalysis``
    instance instead of being fitted. Each alignment contributes the
    feature row ``[len(alignment), alignment.xs_tag, alignment.gc_content]``.

    :param line_list: list of SAM object; each item must support ``len()``
        and expose ``xs_tag`` and ``gc_content`` attributes
    :param temp: temperature; must be one of 32, 37, 42, 47, 52 or 57
    :return: ``True`` when the predicted probability of class ``1`` is
        below 0.5 for every alignment; ``False`` otherwise, and also
        ``False`` as soon as an alignment with a falsy ``xs_tag`` is met
    :raises SystemExit: when ``temp`` has no pre-trained model (a message
        is printed first)
    """
    temp_list = [32, 37, 42, 47, 52, 57]
    coef_list = [[[-0.14494789, 0.18791679, 0.02588474]],
                 [[-0.13364364, 0.22510179, 0.05494031]],
                 [[-0.09006122, 0.25660706, 0.1078303]],
                 [[-0.01593182, 0.24498485, 0.15753649]],
                 [[0.01860365, 0.1750174, 0.17003374]],
                 [[0.03236755, 0.11624593, 0.24306498]]]
    inter_list = [-1.17545204, -5.40436344, -12.45549846,
                  -19.32670233, -20.11992898, -23.98652919]
    class_list = [-1, 1]

    try:
        classfier_index = temp_list.index(temp)
    except ValueError:
        print("The given temperature was not in temp_list:", temp_list)
        # Exit with a failure status so calling pipelines notice the error.
        sys.exit(1)

    # Collect the feature rows first: a record without an XS tag ends the
    # check immediately, so the classifier is only built when it is needed.
    # NOTE(review): a falsy xs_tag (including a numeric 0) is treated as
    # missing -- confirm that 0 is never a valid value here.
    test_list = []
    for sub_line in line_list:
        if not sub_line.xs_tag:
            return False
        # The ``np.float`` alias was removed from NumPy; the builtin
        # ``float`` is the exact same type.
        test_list.append(
            [float(len(sub_line)), sub_line.xs_tag, sub_line.gc_content])

    # Load the pre-trained parameters for the requested temperature.
    lda_classifer = LinearDiscriminantAnalysis()
    lda_classifer.coef_ = np.asarray(coef_list)[classfier_index]
    lda_classifer.intercept_ = np.asarray(inter_list)[classfier_index]
    lda_classifer.classes_ = np.asarray(class_list)

    # Column 1 of predict_proba corresponds to classes_[1], i.e. class 1.
    lda_prob = lda_classifer.predict_proba(np.asarray(test_list))[:, 1]
    return all(prob < 0.5 for prob in lda_prob)
# Turn the encoded model tables into ndarrays.
coefArray, interArray, classArray = (
    np.asarray(table) for table in (coefList, interList, classList)
)

# Position of the selected temperature inside the model tables. The same
# position is published under both names used by the rest of the script.
# default
tempVal = 57
np_index = tempList.index(tempVal)
clfT = np_index

# Assemble the classifier from the encoded values instead of fitting it.
clf = LinearDiscriminantAnalysis()
clf.classes_ = classArray
clf.coef_ = coefArray[np_index]
clf.intercept_ = interArray[np_index]

# Containers describing the candidates that are sent to the classifier.
testList, testSet, candsInfo = [], set(), []

# the realignment result
samf = "/Users/yeweijian/Downloads/data/t.sam"

# Accumulates the output lines.
outList = []
def _sam_xs_tag(fields):
    """Return the ``XS`` optional field of a tab-split SAM record, or None.

    Only the optional fields (``fields[11:]``) are inspected, so an "XS"
    that merely occurs inside the read name or quality string is ignored.
    """
    for tag in fields[11:]:
        if tag.startswith('XS:'):
            return tag
    return None


def _parse_candidate(fields, sal, form):
    """Return ``(chrom, start, stop, seq, Tm)`` for a tab-split SAM record.

    The read name is expected to look like ``chrom:start-stop``.
    """
    name = fields[0]
    chrom = name.split(':')[0]
    start = name.split(':')[1].split('-')[0]
    stop = name.split('-')[1].strip(' ')
    seq = fields[9]
    return chrom, start, stop, seq, probeTm(seq, sal, form)


def _build_lda_classifier(tempVal):
    """Return the LDA classifier loaded with the parameters for tempVal.

    Raises ValueError when tempVal is not one of the encoded temperatures.
    """
    # Note the module name changed between sklearn versions 0.16 and 0.17
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    import numpy as np

    # LDA model information.
    tempList = [32, 37, 42, 47, 52, 57]
    coefList = [[[-0.14494789, 0.18791679, 0.02588474]],
                [[-0.13364364, 0.22510179, 0.05494031]],
                [[-0.09006122, 0.25660706, 0.1078303]],
                [[-0.01593182, 0.24498485, 0.15753649]],
                [[0.01860365, 0.1750174, 0.17003374]],
                [[0.03236755, 0.11624593, 0.24306498]]]
    interList = [-1.17545204, -5.40436344, -12.45549846,
                 -19.32670233, -20.11992898, -23.98652919]
    classList = [-1, 1]

    # Determine which index to reference for model values.
    model_index = tempList.index(tempVal)

    # Build model from encoded values.
    clf = LinearDiscriminantAnalysis()
    clf.coef_ = np.asarray(coefList)[model_index]
    clf.intercept_ = np.asarray(interList)[model_index]
    clf.classes_ = np.asarray(classList)
    return clf


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when whole is 0."""
    return float(part) / float(whole) * 100 if whole else 0.0


def cleanOutput(inputFile, uniqueVal, zeroVal, probVal, tempVal, sal, form,
                reportVal, debugVal, metaVal, outNameVal, startTime):
    """
    Filter the candidate probes of a .sam file and write them as a .bed file.

    Three filtering modes exist; flags are compared with ``is True``:

    * unique mode (``uniqueVal``): keep aligned records without an XS tag;
    * zero mode (``zeroVal``, only when unique mode is off): keep unaligned
      records;
    * LDA mode (neither flag): keep aligned records without an XS tag, and
      score every other aligned candidate once with the temperature-specific
      LDA model, keeping those whose probability is below ``probVal``.

    Relies on the module-level names ``probeTm``, ``GC``, ``timeit``,
    ``Version`` and ``scriptName``.

    :param inputFile: path of the .sam file to read
    :param uniqueVal: ``True`` to select unique mode
    :param zeroVal: ``True`` to select zero mode
    :param probVal: probability threshold used in LDA mode
    :param tempVal: temperature of the LDA model (32, 37, 42, 47, 52 or 57)
    :param sal: passed through to ``probeTm``
    :param form: passed through to ``probeTm``
    :param reportVal: ``True`` to write ``<outName>_outputClean_log.txt``
    :param debugVal: ``True`` to print per-candidate decisions
    :param metaVal: ``True`` to write ``<outName>_outputClean_meta.txt``
    :param outNameVal: output stem; ``None`` means ``<input stem>_probes``
    :param startTime: ``timeit.default_timer()`` value taken at start-up
    :return: None; results are written to ``<outName>.bed``
    :raises ValueError: in LDA mode when ``tempVal`` has no encoded model
    """
    import os

    unique_mode = uniqueVal is True
    zero_mode = zeroVal is True and not unique_mode
    report = reportVal is True
    debug = debugVal is True

    # Determine the stem of the input filename. Only the file name itself is
    # cut at its first dot, so dots in the directory part (e.g. "./x.sam")
    # no longer truncate the whole path.
    directory, base = os.path.split(str(inputFile))
    fileName = os.path.join(directory, base.split('.')[0])

    # Read the alignment records, skipping header ('@') and blank lines.
    with open(inputFile, 'r') as f:
        stripped = [line.strip() for line in f]
    records = [line.split('\t') for line in stripped
               if line and not line.startswith('@')]

    # Determine how many unique candidates are in the .sam file
    # (candidates are identified by their start coordinate).
    candsSet = {fields[0].split(':')[1].split('-')[0] for fields in records}

    outList = []
    reportList = []
    rejected = set()

    def note(message):
        # Record and/or print one per-candidate decision.
        if report:
            reportList.append(message)
        if debug:
            print(message)

    def note_rejection(start, message):
        # A candidate may span several records; report its rejection once.
        if (report or debug) and start not in rejected:
            rejected.add(start)
            note(message)

    if unique_mode or zero_mode:
        # Process .sam file, keeping probes with only 0 or 1 unique alignment.
        for fields in records:
            chrom, start, stop, seq, Tm = _parse_candidate(fields, sal, form)
            where = (chrom, start, stop)
            bed_line = '%s\t%s\t%s\t%s\t%s' % (chrom, start, stop, seq, Tm)
            unaligned = fields[2].startswith('*')
            has_xs = _sam_xs_tag(fields) is not None

            if unique_mode:
                if not unaligned and not has_xs:
                    outList.append(bed_line)
                    note('Candidate probe at %s:%s-%s aligned 1 time, '
                         'added to output' % where)
                elif unaligned:
                    note_rejection(start,
                                   'Candidate probe at %s:%s-%s aligned 0 '
                                   'times, was not added to output' % where)
                else:
                    note_rejection(start,
                                   'Candidate probe at %s:%s-%s aligned >1 '
                                   'time, was not added to output' % where)
            elif unaligned:
                outList.append(bed_line)
                note('Candidate probe at %s:%s-%s aligned 0 times, '
                     'added to output (Zero mode active)' % where)
            else:
                note_rejection(start,
                               'Candidate probe at %s:%s-%s aligned >0 '
                               'times, was not added to output '
                               '(Zero mode active)' % where)
    else:
        import numpy as np

        # Built before the records are processed so that an unsupported
        # temperature is reported before any work is done.
        clf = _build_lda_classifier(tempVal)

        # Candidates waiting for classification and their feature rows.
        pending = []
        features = []
        seen = set()

        # Process .sam file and extract information about each candidate.
        for fields in records:
            chrom, start, stop, seq, Tm = _parse_candidate(fields, sal, form)
            where = (chrom, start, stop)
            unaligned = fields[2].startswith('*')
            xs_tag = _sam_xs_tag(fields)

            if not unaligned and xs_tag is None:
                # Exactly one alignment: accepted without classification.
                outList.append('%s\t%s\t%s\t%s\t%s'
                               % (chrom, start, stop, seq, Tm))
                note('Candidate probe at %s:%s-%s aligned 1 time, '
                     'added to output' % where)
            elif not unaligned and start not in seen:
                # Model input: probe length, XS score, GC content.
                features.append([float(len(seq)),
                                 float(xs_tag.split(':')[2]),
                                 GC(seq)])
                seen.add(start)
                pending.append((chrom, start, stop, seq, Tm))
            elif unaligned:
                note_rejection(start,
                               'Candidate probe at %s:%s-%s aligned 0 '
                               'times, was not added to output' % where)

        # Use model to predict the probability that candidate probes will
        # have thermodynamically relevant off-target binding sites, unless
        # no candidate needs classification. A single pending candidate is
        # classified as well (it used to be dropped silently).
        if features:
            probs = clf.predict_proba(np.asarray(features))[:, 1]
            for (chrom, start, stop, seq, Tm), prob in zip(pending, probs):
                details = (chrom, start, stop, prob, probVal)
                if float(prob) < probVal:
                    outList.append('%s\t%s\t%s\t%s\t%s'
                                   % (chrom, start, stop, seq, Tm))
                    note('Candidate probe at %s:%s-%s added to output with '
                         '%0.4f < %0.4f probability of having off-target '
                         'sites' % details)
                else:
                    note('Candidate probe at %s:%s-%s filtered with '
                         '%0.4f => %0.4f probability of having off-target '
                         'sites' % details)

    # Sort output list by start coordinate.
    outList.sort(key=lambda x: int(x.split('\t')[1]))

    # Determine the name of the output file.
    if outNameVal is None:
        outName = '%s_probes' % fileName
    else:
        outName = outNameVal

    # Create and write the output file.
    with open('%s.bed' % outName, 'w') as output:
        output.write('\n'.join(outList))

    # Print info about the results to terminal. The message follows the mode
    # that was actually applied (unique mode wins when both flags are set).
    candsNum = len(candsSet)
    cleanNum = len(outList)
    passedPct = _percent(cleanNum, candsNum)
    if unique_mode:
        print('outputClean identified %d of %d / %0.4f%% candidate probes as '
              'unique' % (cleanNum, candsNum, passedPct))
    elif zero_mode:
        print('outputClean identified %d of %d / %0.4f%% candidate probes as '
              'having zero alignments' % (cleanNum, candsNum, passedPct))
    else:
        print('outputClean passed %d of %d / %0.4f%% candidate probes through '
              'specificity filtering using the %dC LDA model'
              % (cleanNum, candsNum, passedPct, tempVal))

    # Write meta information to a .txt file if desired.
    if metaVal is True:
        with open('%s_outputClean_meta.txt' % outName, 'w') as metaText:
            metaText.write('%s\t%f\t%s\t%d\t%d'
                           % (inputFile, timeit.default_timer() - startTime,
                              Version, cleanNum, candsNum))

    # If desired, create report file.
    if report:
        # Entries are ordered by the start coordinate embedded in each message.
        reportList.sort(key=lambda x: int(x.split(':')[1].split('-')[0]))
        if unique_mode:
            summary = ('outputClean returned %d of %d / %0.4f%% candidate '
                       'probes as having exactly 1 alignment'
                       % (cleanNum, candsNum, passedPct))
        elif zero_mode:
            summary = ('outputClean returned %d of %d / %0.4f%% candidate '
                       'probes as having 0 alignments (Zero mode active)'
                       % (cleanNum, candsNum, passedPct))
        else:
            summary = ('outputClean passed %d of %d / %0.4f%% candidate '
                       'probes through specificity filtering using the %dC '
                       'LDA model' % (cleanNum, candsNum, passedPct, tempVal))
        header = ['Results produced by %s %s' % (scriptName, Version),
                  '-' * 100,
                  summary,
                  '-' * 100]
        with open('%s_outputClean_log.txt' % outName, 'w') as reportOut:
            reportOut.write('\n'.join(header + reportList))