def best_match(SEQ1, LIST, MAX=float("inf"), IGNORE_N=0, PRINT=0):
    """Return the index in LIST of the sequence that best matches SEQ1.

    An exact match is looked for first and its index is returned
    immediately.  Otherwise every candidate is scored with
    ``BC.mismatches`` and the index of the candidate with the fewest
    mismatches is returned.

    Parameters:
        SEQ1: the query sequence.
        LIST: the candidate sequences.
        MAX: a candidate is only accepted when its mismatch count is
            strictly below MAX.  The running best count is handed to
            ``BC.mismatches`` as its limit, so lowering MAX increases
            performance.
        IGNORE_N: forwarded to ``BC.mismatches``; 1 will ignore
            mismatches with N.
        PRINT: when 1, print the query, the matched sequence and the
            matched index with its mismatch count.

    Returns:
        The index of the best match, or -1 when no candidate has fewer
        than MAX mismatches (including when LIST is empty).
    """
    # First search for an exact match: nothing can beat it, so no scoring
    # is needed.
    for index, candidate in enumerate(LIST):
        if SEQ1 == candidate:
            return index

    # No exact match: keep the candidate with the strictly lowest count.
    best_index = -1
    best_count = MAX
    for index, candidate in enumerate(LIST):
        count = BC.mismatches(SEQ1, candidate, best_count, IGNORE_N)
        if count < best_count:
            best_count = count
            best_index = index
        if count == 0:
            # Cannot be improved on (reachable when IGNORE_N hides the
            # only differences), so stop scanning.
            break

    if best_index > -1 and PRINT == 1:
        # Single-argument print calls behave the same on Python 2 and 3.
        print(SEQ1)
        print(LIST[best_index])
        print("%s %s" % (best_index, best_count))
    return best_index
def calc(self, a=1, b=1, c=1):
    """Convert the three entry values and display the result.

    The ``a``, ``b`` and ``c`` arguments are ignored: they are replaced
    by the integers parsed from ``self.entry``, ``self.entry2`` and
    ``self.entry3`` before ``BC.convert`` is called.
    """
    a, b, c = [int(getattr(self, widget_name).get())
               for widget_name in ("entry", "entry2", "entry3")]
    converted = BC.convert(a, b, c)
    # Empty the result widget before writing the new value into it.
    self.result.delete(0, END)
    self.result.insert(END, converted)
def calc(self, a=1, b=1, c=1):
    """Read the three entries, run ``BC.convert`` and show its output.

    The keyword arguments are placeholders only; the values actually
    converted are the integers parsed from ``self.entry``,
    ``self.entry2`` and ``self.entry3``.
    """
    first = int(self.entry.get())
    second = int(self.entry2.get())
    third = int(self.entry3.get())
    outcome = BC.convert(first, second, third)

    # Replace whatever the result widget currently holds.
    display = self.result
    display.delete(0, END)
    display.insert(END, outcome)
def test_bc(self):
    """A BC set on a Solution answers singlePointBC like the original BC."""
    form = PoissonFormulation.PoissonFormulation(2, True)
    grid = MeshFactory.MeshFactory_rectilinearMesh(
        form.bf(), [1.0, 1.0], [2, 3], 4)
    solution = Solution.Solution_solution(grid)

    boundary = BC.BC_bc()
    solution.setBC(boundary)

    # Comparing the BC object with solution.bc() directly does not work,
    # so check instead that both behave in the same way.
    expected = boundary.singlePointBC(0)
    observed = solution.bc().singlePointBC(0)
    self.assertEqual(expected, observed)
def testBC(self):
    """A BC set on a Solution reports the same bcsImposed as the original."""
    form = PoissonFormulation.PoissonFormulation(2, True)
    grid = MeshFactory.MeshFactory_rectilinearMesh(
        form.bf(), [1.0, 1.0], [2, 3], 4)
    solution = Solution.Solution_solution(grid)

    factory = VarFactory.VarFactory()
    field = factory.fieldVar("Hello")

    boundary = BC.BC_bc()
    solution.setBC(boundary)

    # Query both objects for the same field ID and compare the answers.
    expected = boundary.bcsImposed(field.ID())
    observed = solution.bc().bcsImposed(field.ID())
    self.assertEqual(expected, observed)
def main():
    """Show the menu, echo the user's choice and run the chosen action.

    '1' instantiates ``BC``, '2' calls ``c.create_user()``; any other
    input prints the exit message.
    """
    menu = (
        "Please select an option below: \n"
        " 1) Create test user\n"
        " 2) create user \n"
        " 0) Quit\n"
    )
    print(menu)

    choice = input()
    print(choice)

    if choice == '1':
        BC()
        return
    if choice == '2':
        c.create_user()
        return
    print("Exiting......\n")
def testZeroMeanConstraint(self):
    """Add a zero-mean constraint on a field variable, then remove it."""
    # Set-up: a BC object and a dummy field variable to constrain.
    factory = VarFactory.VarFactory()
    boundary = BC.BC_bc()
    field = factory.fieldVar("testVar", 2)
    field_id = field.ID()

    # After adding, the constraint must be reported for the variable's ID.
    boundary.addZeroMeanConstraint(field)
    imposed = boundary.imposeZeroMeanConstraint(field_id)
    self.assertTrue(imposed, "No Zero Mean Constraint Imposed")

    # After removing, it must no longer be reported.
    boundary.removeZeroMeanConstraint(field_id)
    still_imposed = boundary.imposeZeroMeanConstraint(field_id)
    self.assertFalse(still_imposed, "Zero Mean Constraint not removed")
def testSinglePoint(self):
    """Add a single-point BC and check the value and vertex it reports."""
    # Dummy variable set-up.  None of the assertions below use it; it is
    # retained in case constructing the variable has side effects
    # (NOTE(review): confirm, then remove if it does not).
    vf = VarFactory.VarFactory()
    testVar = vf.fieldVar("testVar", 2)
    ID = testVar.ID()

    # Expected vertex for a BC added without an explicit vertex.
    # NOTE(review): 4294967295 is 2**32 - 1, presumably an "unset"
    # sentinel in the wrapped library -- confirm.
    testVertex = 4294967295
    testFieldID = 9
    testValue = 17.1

    bc = BC.BC_bc()
    bc.addSinglePointBC(testFieldID, testValue)

    # The test expects bcsImposed to stay false for this field ID while
    # singlePointBC becomes true.
    self.assertFalse(bc.bcsImposed(testFieldID),
                     "Single Point BC not Imposed")
    self.assertTrue(bc.singlePointBC(testFieldID), "No Single Point BC")

    # assertEqual (not the removed assertEquals alias, nor
    # assertTrue(a == b)) so a failure reports both values.
    self.assertEqual(testValue, bc.valueForSinglePointBC(testFieldID),
                     "Value on Single Point BC not maintained")
    self.assertEqual(testVertex, bc.vertexForSinglePointBC(testFieldID),
                     "Vertex on Single Point BC not maintained")
def testDirichlet(self):
    """Add a Dirichlet BC on a trace variable and read its function back."""
    # Set-up: an all-space filter, the function to impose, a BC object
    # and a trace variable to attach the condition to.
    everywhere = SpatialFilter.SpatialFilter.allSpace()
    imposed_fn = Function.Function.xn()
    factory = VarFactory.VarFactory()
    boundary = BC.BC_bc()
    trace = factory.traceVar("traceVar", 2)
    trace_id = trace.ID()

    boundary.addDirichlet(trace, everywhere, imposed_fn)

    # The function stored in the Dirichlet BC must evaluate like the
    # one that was passed in.
    stored_fn = boundary.getDirichletBC(trace_id)[1]
    values_match = imposed_fn.evaluate(2, 3) == stored_fn.evaluate(2, 3)
    self.assertTrue(values_match, "Dirichlet BC failed")

    # TODO: possibly add the equivalent check for the spatial filter.
    # The spatially filtered function must evaluate the same way too.
    filtered_fn = boundary.getSpatiallyFilteredFunctionForDirichletBC(trace_id)
    filtered_match = imposed_fn.evaluate(4, 3) == filtered_fn.evaluate(4, 3)
    self.assertTrue(filtered_match,
                    "Dirichlet Spatially Filtered Funtion failed")
def testExporter(self):
    """Exercise both exportFunction call forms and exportSolution.

    The test makes no assertions; it only checks that the export calls
    run without raising.
    """
    # Set-up: mesh, two functions with matching names, and a solution.
    form = PoissonFormulation.PoissonFormulation(2, True)
    # TODO: give the VarFactory a field & test variable.
    poisson_bf = form.bf()
    mesh = MeshFactory.MeshFactory_rectilinearMesh(
        poisson_bf, [1.2, 1.4], [2, 3], 2)
    fn_x = Function.Function.xn()
    fn_y = Function.Function.yn()
    functions = [fn_x, fn_y]
    function_names = ["function1", "function2"]
    BC.BC_bc()  # constructed as in the original set-up; not used below
    solution = Solution.Solution_solution(mesh)
    exporter = HDF5Exporter.HDF5Exporter(mesh, "output", ".")

    # Form 1: a single function with a single name.
    exporter.exportFunction(fn_x, "function", 0)
    # Form 2: a list of functions with a list of names.
    exporter.exportFunction(functions, function_names, 0)
    # Solution export.
    exporter.exportSolution(solution, 0)
def linear_equation_system(
        Rwx, Rwy, Rwz, Tg, Tyz, Pg,
        P):
    """Generate the coefficient matrix A and the constant matrix C.

    A is (a*b*c, a*b*c) and C is (a*b*c, 1), with a, b, c read from
    ``I.a``, ``I.b``, ``I.c``.  Each row index is mapped to a node
    (i, j, k) by ``find.find_equation_id`` and each neighbour's column
    is found with ``find.find_index``.  Rows are filled only for nodes
    that are not on the j or k boundary; the node kind is chosen by the
    parity of j and k:

    * j % 4 == 1, k odd  -> hot channel
    * j % 4 == 3, k odd  -> cold channel
    * j even,     k odd  -> type 1 solid node
    * j even,     k even -> type 2 solid node
    * j odd,      k even -> type 3 solid node

    Wall boundary conditions are NOT imposed in the loop; A and C are
    passed through ``BC.adiabatic_wall_BC`` just before returning.

    Parameters:
        Rwx, Rwy, Rwz: resistances used in the reciprocal coefficients.
        Tg, Pg: arrays indexed as [i, j, k]; passed to ``Rc`` and
            ``find.Cp``.
        Tyz, P: not used here; only forwarded to
            ``BC.adiabatic_wall_BC``.

    Returns:
        The tuple (A, C) returned by ``BC.adiabatic_wall_BC``.
    """
    # WALL BOUNDARY CONDITIONS ARE NOT IMPOSED IN THIS FUNCTION
    a = I.a
    b = I.b
    c = I.c
    A = np.zeros([a * b * c, a * b * c])
    C = np.zeros([a * b * c, 1])
    for index in range(0, np.size(C)):
        i, j, k = find.find_equation_id(index)
        i = int(i)
        j = int(j)
        k = int(k)
        if (j != 0 and j != b - 1 and k != 0
                and k != c - 1):  # except boundary walls
            if (np.mod(j, 4) == 1 and np.mod(k, 2) == 1):  # Hot channel
                C11 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C12 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C21 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C22 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C01 = -C11 - C21 + I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                C02 = -C11 - C21 - I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                A[index, find.find_index(i, j - 1, k)] = C11  # Tg j-1 term
                A[index, find.find_index(i, j + 1, k)] = C12  # Tg j+1 term
                A[index, find.find_index(i, j, k - 1)] = C21  # Tg k-1 term
                A[index, find.find_index(i, j, k + 1)] = C22  # Tg k+1 term
                if (i == 0):  # Ti term is the hot inlet BC
                    # The known inlet temperature moves to the constant side.
                    C[index, 0] = -I.T_hot_in * C01  # BC = T hot in
                    A[index, find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term
                else:  # interior
                    # NOTE(review): this branch also runs for i == a - 1,
                    # where i + 1 == a -- confirm find_index handles that.
                    A[index, find.find_index(i, j, k)] = C01  # Tyz i term
                    A[index, find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term
            # end of hot channel
            ###################################################################
            # Cold channel
            if (np.mod(j, 4) == 3 and np.mod(k, 2) == 1):  # Cold channel
                C11 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C12 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C21 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C22 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                # Same form as the hot channel with the sign of the
                # m_channel * Cp term swapped between C01 and C02.
                C01 = -C11 - C21 - I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                C02 = -C11 - C21 + I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                A[index, find.find_index(i, j - 1, k)] = C11  # Tg j-1 term
                A[index, find.find_index(i, j + 1, k)] = C12  # Tg j+1 term
                A[index, find.find_index(i, j, k - 1)] = C21  # Tg k-1 term
                A[index, find.find_index(i, j, k + 1)] = C22  # Tg k+1 term
                if (i == a - 1):  # Ti-1 term is adjusted - cold outlet
                    A[index, find.find_index(i, j, k)] = C01  # Tyz i term
                    C[index, 0] = -I.T_cold_in * C02  # Cold in BC -> Tyz i+1 term
                else:  # interior
                    A[index, find.find_index(i, j, k)] = C01  # Tyz i term
                    A[index, find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term
            # end of cold channel
            ###################################################################
            # Wall nodes
            if (np.mod(j, 2) == 0 and np.mod(k, 2) == 1):  # Type 1 solid node
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                # The j-direction coefficients are evaluated at the
                # neighbouring nodes j - 1 and j + 1.
                C11 = 0.5 / (Rwy + Rc(Tg[i, j - 1, k], Pg[i, j - 1, k], i,
                                      j - 1, k))
                C12 = 0.5 / (Rwy + Rc(Tg[i, j + 1, k], Pg[i, j + 1, k], i,
                                      j + 1, k))
                C21 = 1 / (2 * Rwz)
                C22 = 1 / (2 * Rwz)
                if (i == 0):  # Hot in BC
                    if (
                            np.mod(j + 1, 4) == 1
                    ):  # hot channel at j+1 and cold at j-1; hot in BC at i,j+1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0  # i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  # i+1,j,k
                        A[index, find.find_index(i, j, k - 1)] = C21  # i,j,k-1
                        A[index, find.find_index(i, j, k + 1)] = C22  # i,j,k+1
                        A[index, find.find_index(i, j - 1, k)] = C11  # i,j-1,k
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        C[index, 0] = -I.T_hot_in * C12
                    elif (np.mod(
                            j - 1,
                            4) == 1):  # hot channel at j-1; hot in BC at i,j-1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i + 1, j, k)] = C02
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        C[index, 0] = -I.T_hot_in * C11
                elif (i == a - 1):  # Cold in BC
                    # NOTE(review): both sub-branches below index i + 1
                    # although i is the last index here -- confirm that
                    # find_index handles i + 1 == a as intended.
                    if (
                            np.mod(j + 1, 4) == 1
                    ):  # hot channel at j+1 and cold at j-1; cold in BC at i+1,j-1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        C[index, 0] = -I.T_cold_in * C11
                    elif (np.mod(j - 1, 4) == 1
                          ):  # hot channel at j-1; cold in BC at i+1,j+1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        C[index, 0] = -I.T_cold_in * C12
                else:
                    C0 = -C02 - C01 - 2 * C11 - 2 * C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i + 1, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
                    A[index, find.find_index(i + 1, j - 1, k)] = C11
            if (np.mod(j, 2) == 0 and np.mod(k, 2) == 0):  # Type 2 solid node
                # All six coefficients depend only on the resistances.
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                C11 = 1 / (2 * Rwy)
                C12 = 1 / (2 * Rwy)
                C21 = 1 / (2 * Rwz)
                C22 = 1 / (2 * Rwz)
                if (i == 0):
                    # No i - 1 neighbour: C01 is left out of the diagonal.
                    C0 = -C02 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
                elif (i == a - 1):
                    # No i + 1 neighbour: C02 is left out of the diagonal.
                    C0 = -C01 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
                else:
                    C0 = -C01 - C02 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
            if (np.mod(j, 2) == 1 and np.mod(k, 2) == 0):  # Type 3 solid node
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                C11 = 1 / (2 * Rwy)
                C12 = 1 / (2 * Rwy)
                # The k-direction coefficients are evaluated at the
                # neighbouring nodes k - 1 and k + 1.
                C21 = 0.5 / (Rwz + Rc(Tg[i, j, k - 1], Pg[i, j, k - 1], i, j,
                                      k - 1))
                C22 = 0.5 / (Rwz + Rc(Tg[i, j, k + 1], Pg[i, j, k + 1], i, j,
                                      k + 1))
                if (i == 0):  # Hot in BC
                    if (
                            np.mod(j, 4) == 1
                    ):  # hot channel at j and cold at j-1, j+1 -> hot in BC at i,k-1 k+1
                        C0 = -C02 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  # i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  # i+1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  # i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  # i,j+1,k
                        A[index, find.find_index(i + 1, j,
                                                 k - 1)] = C21  # i+1,j,k-1
                        A[index, find.find_index(i + 1, j,
                                                 k + 1)] = C22  # i+1,j,k+1
                        C[index, 0] = -I.T_hot_in * (C21 + C22)
                    elif (np.mod(
                            j, 4) == 3):  # cold channel at j and hot at j-1, j+1
                        C0 = -C02 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  # i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  # i+1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  # i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  # i,j+1,k
                        A[index, find.find_index(i + 1, j,
                                                 k - 1)] = C21  # Tyz i+1,j,k-1
                        A[index, find.find_index(i + 1, j,
                                                 k + 1)] = C22  # Tyz i+1,j,k+1
                        A[index, find.find_index(i, j, k - 1)] = C21  # Tyz i,j,k-1
                        A[index, find.find_index(i, j, k + 1)] = C22  # Tyz i,j,k+1
                elif (i == a - 1):  # Cold in BC
                    if (np.mod(j, 4) == 1):  # hot channel at j
                        # NOTE(review): i + 1 is indexed below although i
                        # is the last index here -- confirm intended.
                        C0 = -C01 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  # i,j,k
                        A[index, find.find_index(i - 1, j, k)] = C01  # i-1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  # i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  # i,j+1,k
                        A[index, find.find_index(i, j, k - 1)] = C21  # i,j,k-1
                        A[index, find.find_index(i, j, k + 1)] = C22  # i,j,k+1
                        A[index, find.find_index(i + 1, j,
                                                 k - 1)] = C21  # i+1,j,k-1
                        A[index, find.find_index(i + 1, j,
                                                 k + 1)] = C22  # i+1,j,k+1
                    elif (np.mod(j, 4) == 3
                          ):  # cold channel at j; cold in BC at i+1,k-1,+1
                        C0 = -C01 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        C[index, 0] = -I.T_cold_in * (C21 + C22)
                else:
                    C0 = -C02 - C01 - C11 - C12 - 2 * C21 - 2 * C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i + 1, j, k + 1)] = C22
                    A[index, find.find_index(i + 1, j, k - 1)] = C21
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
    # Rows for the j/k boundary nodes were skipped above; the wall
    # boundary conditions are applied to the assembled system here.
    A, C = BC.adiabatic_wall_BC(A, C, Rwx, Rwy, Rwz, Tg, Tyz, Pg, P)
    return (A, C)
nd=nd, VF_model=1, LS_model=LS_model, Closure_0_model=Closure_0_model, Closure_1_model=Closure_1_model, epsFact_density=epsFact_density, stokes=False, useVF=useVF, useRBLES=useRBLES, useMetrics=useMetrics, eb_adjoint_sigma=1.0, forceStrongDirichlet=ns_forceStrongDirichlet, turbulenceClosureModel=ns_closure) setBC = BC.boundaryConditions() def getDBC_p(x,flag): BCType = "pDirichlet" if flag == boundaryTags['top']: return outflowPressure if flag == boundaryTags['left']: return None if flag == boundaryTags['right']: return outflowPressure if flag == boundaryTags['bottom']: return setBC.freeSlip(BCType) def getDBC_u(x,flag): BCType = "uDirichlet"
def parse_fastq_by_multitag(directory, f_gzipped_fastqfile, r_gzipped_fastqfile,
                            q="fastq",
                            f_seqtag_length=8,
                            r_seqtag_length=8,
                            f_multitag_length=6,
                            r_multitag_length=6,
                            f_lintag_length=38,
                            r_lintag_length=38,
                            f_spacer_length=43,  # distance to the first barcode in the forward read (ignoring the length of the multitag and the seqtag)
                            r_spacer_length=29,  # distance to the second barcode in the reverse read (ignoring the length of the multitag and the seqtag)
                            min_qs=30,  # the minimum average quality score for both lineage tags
                            lintag_grep_filter1=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # first barcode
                            lintag_grep_filter2=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # second barcode
                            clip_ends=1,  # whether or not to clip the front and back ends off of lintag1 and lintag2
                            lintag1_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag1 after this sequence
                            lintag2_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag2 after this sequence
                            lintag1_rear_clipper='(.TAA|A.AA|AT.A|ATA.)',  # only report lintag1 before this sequence; matched against the REVERSED tag
                            lintag2_rear_clipper='(.TAA|A.AA|AT.A|ATA.)',  # only report lintag2 before this sequence; matched against the REVERSED tag
                            multitags=["TAGCTTGCGTAC", "CGATGTGAGACG"],  # concatenated multiplexing tags from the first and second reads that uniquely identify a sample
                            write_multitags=False):  # write multitags to file
    """Parse paired gzipped FastQ files and sort the reads by multiplexing tag.

    For every forward/reverse read pair the sequencing tag, multiplexing
    tag and lineage tag are sliced out at the positions given by the
    ``*_length`` arguments.  A pair is kept only when

    * the mean phred quality of each lineage tag is greater than
      ``min_qs``, and
    * ``BC.grep`` accepts both lineage tags against
      ``lintag_grep_filter1`` / ``lintag_grep_filter2``.

    The concatenated multiplexing tag of a kept pair is matched against
    ``multitags`` with ``BC.best_match``.  When the mismatch count is
    below ``(f_multitag_length + r_multitag_length + 1) // 4`` the pair
    is written to ``<multitag>_seqtag.txt``, ``<multitag>_lintag1.txt``
    and ``<multitag>_lintag2.txt`` (plus ``<multitag>_multitag.txt`` when
    ``write_multitags`` is true); otherwise it goes to the
    ``unmatched_*.txt`` files.  With ``clip_ends`` the lineage tags are
    trimmed to the text between the front and rear clipper matches.

    File names are built as ``directory + name``, so ``directory`` must
    end with a path separator.  The working directory is changed to
    ``directory``.
    NOTE(review): because of that chdir a relative ``directory`` will not
    resolve for the files opened afterwards -- confirm callers always
    pass an absolute path.
    NOTE(review): the rear clippers are searched in the reversed tag, and
    parse_fastq defaults to the reversed pattern '(.ATA|A.TA|AA.A|AAT.)'
    while this function defaults to '(.TAA|A.AA|AT.A|ATA.)' -- confirm
    which default is intended.

    Raises:
        AttributeError: when ``clip_ends`` is set and a clipper pattern
            does not occur in a lineage tag (``re.search`` returns None).
    """
    from Bio import SeqIO
    import os
    import gzip
    import sys
    import numpy
    import BC
    import re
    try:
        from itertools import izip  # Python 2: lazy pairwise iteration
    except ImportError:
        izip = zip  # Python 3: the builtin zip is already lazy

    os.chdir(directory)
    print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
    print("Saving the combined forward and reverse sequencing tags as seqtag.txt")
    print("Saving the combined forward and reverse multiplexing tags as multitag.txt")
    print("Saving the first lineage tag as lintag1.txt")
    print("Saving the second lineage tag as lintag2.txt")

    def read_slices(seqtag_length, multitag_length, spacer_length, lintag_length):
        """Return the (seqtag, multitag, lintag) slices of one read."""
        multitag_end = seqtag_length + multitag_length
        lintag_start = multitag_end + spacer_length
        return (slice(0, seqtag_length),
                slice(seqtag_length, multitag_end),
                slice(lintag_start, lintag_start + lintag_length))

    def clip(tag, front_clipper, rear_clipper):
        """Return the part of tag between the front and rear clipper matches."""
        start = re.search(front_clipper, tag).span()[1]
        # The rear clipper is located in the reversed tag, so the end of
        # its match becomes a negative offset from the end of the tag.
        end = re.search(rear_clipper, tag[::-1]).span()[1] * -1
        if end == 0:
            end = len(tag)
        return tag[start:end]

    f_seq, f_multi, f_lin = read_slices(
        f_seqtag_length, f_multitag_length, f_spacer_length, f_lintag_length)
    r_seq, r_multi, r_lin = read_slices(
        r_seqtag_length, r_multitag_length, r_spacer_length, r_lintag_length)

    # Floor division keeps the integer thresholds the original obtained
    # from Python 2's "/" on ints.
    combined_multitag_length = f_multitag_length + r_multitag_length
    max_mismatches = (combined_multitag_length + 1) // 3
    match_threshold = (combined_multitag_length + 1) // 4

    # gzip only accepts "rU" on Python 2; Python 3 needs text mode "rt".
    gzip_mode = "rt" if sys.version_info[0] >= 3 else "rU"

    opened = []  # every handle opened below, closed in the finally clause

    def track(handle):
        opened.append(handle)
        return handle

    try:
        # Reads that sort to a multiplexing tag.  The handles live in an
        # explicit dict: the dict returned by vars()/locals() must not be
        # modified, so it cannot serve as storage.
        sorted_out = {}
        for tag in multitags:
            outputs = {}
            outputs['seqtag'] = track(open(directory + tag + '_seqtag.txt', 'w'))
            outputs['lintag1'] = track(open(directory + tag + '_lintag1.txt', 'w'))
            outputs['lintag2'] = track(open(directory + tag + '_lintag2.txt', 'w'))
            if write_multitags:
                outputs['multitag'] = track(open(directory + tag + '_multitag.txt', 'w'))
            sorted_out[tag] = outputs

        # Reads that do not sort to a multiplexing tag.
        unmatched_seqtag = track(open(directory + 'unmatched_seqtag.txt', 'w'))
        unmatched_lintag1 = track(open(directory + 'unmatched_lintag1.txt', 'w'))
        unmatched_lintag2 = track(open(directory + 'unmatched_lintag2.txt', 'w'))
        unmatched_multitag = track(open(directory + 'unmatched_multitag.txt', 'w'))

        # Open the reads for SeqIO.
        f_file = SeqIO.parse(track(gzip.open(directory + f_gzipped_fastqfile, gzip_mode)), q)
        r_file = SeqIO.parse(track(gzip.open(directory + r_gzipped_fastqfile, gzip_mode)), q)

        quality_reads = 0
        total_reads = 0
        for f, r in izip(f_file, r_file):
            fq = f.letter_annotations["phred_quality"]
            rq = r.letter_annotations["phred_quality"]
            total_reads = total_reads + 1

            # Both lineage tags must pass the mean quality threshold.
            if not (numpy.mean(fq[f_lin]) > min_qs and numpy.mean(rq[r_lin]) > min_qs):
                continue
            fr = str(f.seq)
            rr = str(r.seq)
            ftag = fr[f_lin]
            rtag = rr[r_lin]

            # Both lineage tags must meet the regular expression filter.
            if not (BC.grep(ftag, lintag_grep_filter1) and BC.grep(rtag, lintag_grep_filter2)):
                continue
            quality_reads = quality_reads + 1

            # Find the closest matching multitag.
            m = fr[f_multi] + rr[r_multi]  # the concatenated multiplexing tag
            j = BC.best_match(m, multitags, MAX=max_mismatches)
            # Guard against None explicitly: comparing None with -1 is
            # False on Python 2 but a TypeError on Python 3.
            if j is not None and j > -1:
                tm = BC.mismatches(m, multitags[j])  # distance to this tag
            else:
                tm = 1000

            if clip_ends:
                ftag = clip(ftag, lintag1_front_clipper, lintag1_rear_clipper)
                rtag = clip(rtag, lintag2_front_clipper, lintag2_rear_clipper)
            seqtag_text = fr[f_seq] + rr[r_seq]

            if tm < match_threshold:
                # A multitag match has been found.
                outputs = sorted_out[multitags[j]]
                outputs['lintag1'].write(ftag + '\n')
                outputs['lintag2'].write(rtag + '\n')
                outputs['seqtag'].write(seqtag_text + '\n')
                if write_multitags:
                    outputs['multitag'].write(m + '\n')
            else:
                unmatched_lintag1.write(ftag + '\n')
                unmatched_lintag2.write(rtag + '\n')
                unmatched_seqtag.write(seqtag_text + '\n')
                unmatched_multitag.write(m + '\n')

        print(str(quality_reads) + " out of " + str(total_reads) + " reads passed grep and quality filters")
    finally:
        # Close every file, including when parsing fails part-way through.
        for handle in opened:
            handle.close()
def parse_fastq(directory, f_gzipped_fastqfile, r_gzipped_fastqfile,
                q="fastq",  # the type of fastq file coming off of the sequencer
                f_seqtag_length=8,  # the length of the sequencing tag on the first read (UMI1)
                r_seqtag_length=8,  # the length of the sequencing tag on the second read (UMI2)
                f_multitag_length=6,  # the length of the multiplexing tag on the first read
                r_multitag_length=6,  # the length of the multiplexing tag on the second read
                f_lintag_length=38,  # the length of the lineage tag on the first read (first barcode)
                r_lintag_length=38,  # the length of the lineage tag on the second read (second barcode)
                f_spacer_length=43,  # distance to the first barcode in the forward read (ignoring the length of the multitag and the seqtag)
                r_spacer_length=29,  # distance to the second barcode in the reverse read (ignoring the length of the multitag and the seqtag)
                min_qs=30,  # the minimum average quality score for both lineage tags
                lintag_grep_filter1=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # first barcode
                lintag_grep_filter2=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # second barcode
                clip_ends=1,  # whether or not to clip the front and back ends off of lintag1 and lintag2
                lintag1_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag1 after this sequence
                lintag2_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag2 after this sequence
                lintag1_rear_clipper='(.ATA|A.TA|AA.A|AAT.)',  # only report lintag1 before this sequence; matched against the REVERSED tag
                lintag2_rear_clipper='(.ATA|A.TA|AA.A|AAT.)'):  # only report lintag2 before this sequence; matched against the REVERSED tag
    """Parse paired gzipped FastQ files and save the tags of the good reads.

    For every forward/reverse read pair the sequencing tag, multiplexing
    tag and lineage tag are sliced out at the positions given by the
    ``*_length`` arguments.  A pair is kept only when

    * the mean phred quality of each lineage tag is greater than
      ``min_qs``, and
    * ``BC.grep`` accepts both lineage tags against
      ``lintag_grep_filter1`` / ``lintag_grep_filter2``.

    For each kept pair the combined sequencing tags, the combined
    multiplexing tags and the two lineage tags are written, one per
    line, to ``_seqtag.txt``, ``_multitag.txt``, ``_lintag1.txt`` and
    ``_lintag2.txt``.  With ``clip_ends`` the lineage tags are trimmed to
    the text between the front and rear clipper matches.

    File names are built as ``directory + name``, so ``directory`` must
    end with a path separator for the files to land inside it.  The
    working directory is changed to ``directory``.
    NOTE(review): because of that chdir a relative ``directory`` will not
    resolve for the files opened afterwards -- confirm callers always
    pass an absolute path.

    Raises:
        AttributeError: when ``clip_ends`` is set and a clipper pattern
            does not occur in a lineage tag (``re.search`` returns None).
    """
    from Bio import SeqIO
    import os
    import gzip
    import sys
    import numpy
    import BC
    import re
    try:
        from itertools import izip  # Python 2: lazy pairwise iteration
    except ImportError:
        izip = zip  # Python 3: the builtin zip is already lazy

    os.chdir(directory)
    print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
    print("Saving the combined forward and reverse sequencing tags as seqtag.txt")
    print("Saving the combined forward and reverse multiplexing tags as multitag.txt")
    print("Saving the first lineage tag as lintag1.txt")
    print("Saving the second lineage tag as lintag2.txt")

    def read_slices(seqtag_length, multitag_length, spacer_length, lintag_length):
        """Return the (seqtag, multitag, lintag) slices of one read."""
        multitag_end = seqtag_length + multitag_length
        lintag_start = multitag_end + spacer_length
        return (slice(0, seqtag_length),
                slice(seqtag_length, multitag_end),
                slice(lintag_start, lintag_start + lintag_length))

    def clip(tag, front_clipper, rear_clipper):
        """Return the part of tag between the front and rear clipper matches."""
        start = re.search(front_clipper, tag).span()[1]
        # The rear clipper is located in the reversed tag, so the end of
        # its match becomes a negative offset from the end of the tag.
        end = re.search(rear_clipper, tag[::-1]).span()[1] * -1
        if end == 0:
            end = len(tag)
        return tag[start:end]

    f_seq, f_multi, f_lin = read_slices(
        f_seqtag_length, f_multitag_length, f_spacer_length, f_lintag_length)
    r_seq, r_multi, r_lin = read_slices(
        r_seqtag_length, r_multitag_length, r_spacer_length, r_lintag_length)

    # gzip only accepts "rU" on Python 2; Python 3 needs text mode "rt".
    gzip_mode = "rt" if sys.version_info[0] >= 3 else "rU"

    opened = []  # every handle opened below, closed in the finally clause

    def track(handle):
        opened.append(handle)
        return handle

    try:
        # Open the files for writing.
        seqtag = track(open(directory + '_seqtag.txt', 'w'))
        multitag = track(open(directory + '_multitag.txt', 'w'))
        lintag1 = track(open(directory + '_lintag1.txt', 'w'))
        lintag2 = track(open(directory + '_lintag2.txt', 'w'))

        # Open the reads for SeqIO.
        f_file = SeqIO.parse(track(gzip.open(directory + f_gzipped_fastqfile, gzip_mode)), q)
        r_file = SeqIO.parse(track(gzip.open(directory + r_gzipped_fastqfile, gzip_mode)), q)

        quality_reads = 0
        total_reads = 0
        for f, r in izip(f_file, r_file):
            fq = f.letter_annotations["phred_quality"]
            rq = r.letter_annotations["phred_quality"]
            total_reads = total_reads + 1

            # Both lineage tags must pass the mean quality threshold.
            if not (numpy.mean(fq[f_lin]) > min_qs and numpy.mean(rq[r_lin]) > min_qs):
                continue
            fr = str(f.seq)
            rr = str(r.seq)
            ftag = fr[f_lin]
            rtag = rr[r_lin]

            # Both lineage tags must meet the regular expression filter.
            if not (BC.grep(ftag, lintag_grep_filter1) and BC.grep(rtag, lintag_grep_filter2)):
                continue
            quality_reads = quality_reads + 1

            # The sequencing and multiplexing tags are written before the
            # lineage tags are clipped, as in the original ordering.
            seqtag.write(fr[f_seq] + rr[r_seq] + '\n')
            multitag.write(fr[f_multi] + rr[r_multi] + '\n')
            if clip_ends:
                ftag = clip(ftag, lintag1_front_clipper, lintag1_rear_clipper)
                rtag = clip(rtag, lintag2_front_clipper, lintag2_rear_clipper)
            lintag1.write(ftag + '\n')
            lintag2.write(rtag + '\n')

        print(str(quality_reads) + " out of " + str(total_reads) + " reads passed grep and quality filters")
    finally:
        # Close every file, including when parsing fails part-way through.
        for handle in opened:
            handle.close()