Пример #1
0
def best_match(SEQ1, LIST, MAX = float("inf"), IGNORE_N = 0, PRINT = 0 ):
    """Return the index of the entry in LIST that best matches SEQ1.

    An exact (``==``) match is searched for first and its index is returned
    immediately.  If there is none, every entry is scored with
    ``BC.mismatches`` and the index of the first entry with the fewest
    mismatches is returned.

    MAX is the mismatch ceiling: only entries scoring strictly below MAX can
    be selected.  The running best score is passed to ``BC.mismatches`` as
    its cut-off (the original author notes that lowering MAX improves
    performance).
    IGNORE_N is forwarded unchanged to ``BC.mismatches`` (documented by the
    original author as: 1 ignores mismatches with N).
    PRINT = 1 prints SEQ1, the selected entry, and "<index> <mismatches>".

    Returns -1 when no entry scores below MAX (this includes an empty LIST).
    """
    # First pass: an exact match wins outright and needs no scoring.
    for index, candidate in enumerate(LIST):
        if SEQ1 == candidate:
            return index

    # Second pass: fuzzy search.  The previous implementation guarded this
    # with a flag that was never set, so it could never run and the function
    # fell through returning None.
    best_index = -1
    fewest = MAX
    for index, candidate in enumerate(LIST):
        score = BC.mismatches(SEQ1, candidate, fewest, IGNORE_N)
        if score < fewest:
            # Strict '<' keeps the first entry among equally good ones.
            fewest = score
            best_index = index
        if score == 0:
            break

    if best_index > -1 and PRINT == 1:
        print(SEQ1)
        print(LIST[best_index])
        print("%s %s" % (best_index, fewest))
    return best_index
Пример #2
0
 def calc(self, a=1, b=1, c=1):
     """Convert the three entry values and show the outcome in the result widget.

     The ``a``, ``b`` and ``c`` arguments are overwritten straight away with
     the integer values read from ``self.entry``, ``self.entry2`` and
     ``self.entry3``; whatever the caller passes is ignored.
     """
     parsed = []
     for widget_name in ("entry", "entry2", "entry3"):
         # Read and parse one widget at a time, in the original order.
         parsed.append(int(getattr(self, widget_name).get()))
     a, b, c = parsed
     converted = BC.convert(a, b, c)
     # Replace the current contents of the result widget with the new value.
     self.result.delete(0, END)
     self.result.insert(END, converted)
Пример #3
0
 def calc(self, a=1, b=1, c=1):
     """Run BC.convert on the three entry fields and display what it returns.

     The incoming ``a``, ``b`` and ``c`` values are discarded: all three are
     re-read as integers from ``self.entry``, ``self.entry2`` and
     ``self.entry3`` before the conversion.
     """
     a, b, c = (
         int(self.entry.get()),
         int(self.entry2.get()),
         int(self.entry3.get()),
     )
     outcome = BC.convert(a, b, c)
     output = self.result
     # Clear whatever is shown, then insert the fresh value.
     output.delete(0, END)
     output.insert(END, outcome)
Пример #4
0
 def test_bc(self):
     """A BC given to Solution.setBC should behave like the one read back.

     The BC object handed in and the one returned by ``bc()`` are not
     compared directly (the original author notes that does not work), so
     the test compares what each reports for ``singlePointBC(0)``.
     """
     formulation = PoissonFormulation.PoissonFormulation(2, True)
     bilinear_form = formulation.bf()
     dimensions = [1.0, 1.0]
     cell_counts = [2, 3]
     mesh = MeshFactory.MeshFactory_rectilinearMesh(bilinear_form, dimensions,
                                                    cell_counts, 4)
     solution = Solution.Solution_solution(mesh)
     boundary = BC.BC_bc()
     solution.setBC(boundary)

     expected = boundary.singlePointBC(0)
     actual = solution.bc().singlePointBC(0)
     self.assertEqual(expected, actual)
Пример #5
0
 def testBC(self):
     """The BC stored on a Solution reports the same bcsImposed() as the original.

     A field variable is created only to obtain an ID to query with; no
     boundary condition is added for it.
     """
     formulation = PoissonFormulation.PoissonFormulation(2, True)
     bilinear_form = formulation.bf()
     mesh = MeshFactory.MeshFactory_rectilinearMesh(
         bilinear_form, [1.0, 1.0], [2, 3], 4)
     solution = Solution.Solution_solution(mesh)
     factory = VarFactory.VarFactory()
     field = factory.fieldVar("Hello")
     boundary = BC.BC_bc()
     solution.setBC(boundary)

     expected = boundary.bcsImposed(field.ID())
     actual = solution.bc().bcsImposed(field.ID())
     self.assertEqual(expected, actual)
Пример #6
0
def main():
    """Print the menu, echo the user's choice, and act on it.

    '1' constructs a ``BC`` object, '2' calls ``c.create_user()``; any other
    input (including '0') prints the exit message.
    """
    menu = "Please select an option below: \n 1) Create test user\n 2) create user \n 0) Quit\n"
    print(menu)
    choice = input()
    print(choice)

    if choice == '1':
        test_user = BC()
        return
    if choice == '2':
        created = c.create_user()
        return
    print("Exiting......\n")
Пример #7
0
    def testZeroMeanConstraint(self):
        """A zero-mean constraint can be added to a BC and removed again."""
        # Set up: a field variable and an empty BC object.
        factory = VarFactory.VarFactory()
        boundary = BC.BC_bc()
        field = factory.fieldVar("testVar", 2)
        field_id = field.ID()
        boundary.addZeroMeanConstraint(field)

        # After adding, the constraint must be reported for the variable's ID.
        imposed_after_add = boundary.imposeZeroMeanConstraint(field_id)
        self.assertTrue(imposed_after_add, "No Zero Mean Constraint Imposed")

        # After removing, it must no longer be reported.
        boundary.removeZeroMeanConstraint(field_id)
        imposed_after_remove = boundary.imposeZeroMeanConstraint(field_id)
        self.assertFalse(imposed_after_remove,
                         "Zero Mean Constraint not removed")
Пример #8
0
 def testSinglePoint(self):
     """Add a single-point BC and check that it can be read back.

     Verifies, for the field ID used, that ``bcsImposed`` is false,
     ``singlePointBC`` is true, the stored value is returned unchanged, and
     the reported vertex equals ``testVertex``.
     """
     #Initial Test Values & Set up of Dummy variable
     # NOTE(review): vf, testVar and ID are not used by any assertion below;
     # they are kept so the same VarFactory calls still run.
     vf = VarFactory.VarFactory()
     testVar = vf.fieldVar("testVar", 2)
     ID = testVar.ID()
     # 4294967295 == 2**32 - 1; presumably the value reported when no vertex
     # was given to addSinglePointBC -- TODO confirm.
     testVertex = 4294967295
     testFieldID = 9
     testValue = 17.1
     bc = BC.BC_bc()
     bc.addSinglePointBC(testFieldID, testValue)
     #Test to see if Single Point BC has been added correctly
     self.assertFalse(bc.bcsImposed(testFieldID),
                      "Single Point BC not Imposed")
     self.assertTrue(bc.singlePointBC(testFieldID), "No Single Point BC")
     # assertEqual reports both operands on failure, unlike assertTrue(a == b).
     self.assertEqual(testValue, bc.valueForSinglePointBC(testFieldID),
                      "Value on Single Point BC not maintained")
     # assertEquals was a deprecated alias that Python 3.12 removed.
     self.assertEqual(testVertex, bc.vertexForSinglePointBC(testFieldID),
                      "Vertex on Single Point BC not maintained")
Пример #9
0
    def testDirichlet(self):
        """A Dirichlet BC added for a trace variable returns the function it was given.

        The function is compared by evaluating it at a sample point, both via
        ``getDirichletBC`` and via
        ``getSpatiallyFilteredFunctionForDirichletBC``.
        """
        # Set up: an all-space filter, the function to impose, a trace variable.
        everywhere = SpatialFilter.SpatialFilter.allSpace()
        imposed = Function.Function.xn()
        factory = VarFactory.VarFactory()
        boundary = BC.BC_bc()
        trace = factory.traceVar("traceVar", 2)
        trace_id = trace.ID()
        boundary.addDirichlet(trace, everywhere, imposed)

        # The function stored at index 1 of the Dirichlet entry must evaluate
        # like the original.
        expected = imposed.evaluate(2, 3)
        stored = boundary.getDirichletBC(trace_id)[1].evaluate(2, 3)
        self.assertTrue(expected == stored, "Dirichlet BC failed")

        # The spatially filtered function must evaluate like the original too.
        expected = imposed.evaluate(4, 3)
        filtered = boundary.getSpatiallyFilteredFunctionForDirichletBC(
            trace_id).evaluate(4, 3)
        self.assertTrue(expected == filtered,
                        "Dirichlet Spatially Filtered Funtion failed")
Пример #10
0
    def testExporter(self):
        """Smoke test: the HDF5Exporter export calls run without raising.

        Nothing is asserted; the test only exercises ``exportFunction`` with
        a single function, ``exportFunction`` with lists, and
        ``exportSolution``.
        """
        # Set up the mesh, two functions, a solution and the exporter.
        formulation = PoissonFormulation.PoissonFormulation(2, True)
        # TODO (original author): give the VarFactory a field & test variable
        bilinear_form = formulation.bf()
        mesh = MeshFactory.MeshFactory_rectilinearMesh(
            bilinear_form, [1.2, 1.4], [2, 3], 2)
        x_function = Function.Function.xn()
        y_function = Function.Function.yn()
        functions = [x_function, y_function]
        names = ["function1", "function2"]
        unused_bc = BC.BC_bc()  # constructed but not used by any call below
        solution = Solution.Solution_solution(mesh)
        exporter = HDF5Exporter.HDF5Exporter(mesh, "output", ".")

        # exportFunction: single function with a single name
        exporter.exportFunction(x_function, "function", 0)

        # exportFunction: list of functions with a list of names
        exporter.exportFunction(functions, names, 0)

        # exportSolution
        exporter.exportSolution(solution, 0)
Пример #11
0
def linear_equation_system(
        Rwx, Rwy, Rwz, Tg, Tyz, Pg,
        P):  # Generates coefficient matrix A and constant matrix C
    """Assemble the coefficient matrix A and constant vector C of the linear system.

    A is created as an (a*b*c) x (a*b*c) zero matrix and C as an (a*b*c) x 1
    zero matrix, where a, b and c are read from ``I``.  For every equation
    index the node coordinates (i, j, k) come from ``find.find_equation_id``
    and the column of each neighbour from ``find.find_index``.  Nodes with
    j or k on the first/last plane are skipped here; the assembled system is
    finally passed through ``BC.adiabatic_wall_BC``.

    Interior nodes are classified by the parity of j and k:
      * j % 4 == 1, k odd  -> hot channel
      * j % 4 == 3, k odd  -> cold channel
      * j even,     k odd  -> "Type 1" solid node
      * j even,     k even -> "Type 2" solid node
      * j odd,      k even -> "Type 3" solid node

    Parameters:
        Rwx, Rwy, Rwz: scalars used in the denominators of the coefficients
            (presumably wall thermal resistances per direction -- TODO confirm).
        Tg, Pg: arrays indexed as [i, j, k]; their values are passed to
            ``Rc`` and ``find.Cp``.
        Tyz, P: not used in this function except being forwarded to
            ``BC.adiabatic_wall_BC``.

    Returns:
        Tuple ``(A, C)`` as returned by ``BC.adiabatic_wall_BC``.
    """
    # WALL BOUNDARY CONDITIONS ARE NOT IMPOSED IN THIS FUNCTION
    a = I.a
    b = I.b
    c = I.c
    A = np.zeros([a * b * c, a * b * c])
    C = np.zeros([a * b * c, 1])
    for index in range(0, np.size(C)):
        i, j, k = find.find_equation_id(index)
        # print("i j k",i,j,k)
        i = int(i)
        j = int(j)
        k = int(k)

        if (j != 0 and j != b - 1 and k != 0
                and k != c - 1):  # except boundary walls

            if (np.mod(j, 4) == 1 and np.mod(k, 2) == 1):  # Hot channel

                # print("Hot channel")
                # print("Node of balance",i,j,k)
                # print(index)
                # C11/C12 and C21/C22 are computed from the same expression,
                # so each pair holds equal values.
                C11 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C12 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C21 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C22 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                # C01 and C02 differ only in the sign of the m_channel * Cp term.
                C01 = -C11 - C21 + I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                C02 = -C11 - C21 - I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)

                A[index, find.find_index(i, j - 1, k)] = C11  # Tg j-1 term
                A[index, find.find_index(i, j + 1, k)] = C12  # Tg j+1 term
                A[index, find.find_index(i, j, k - 1)] = C21  # Tg k-1 term
                A[index, find.find_index(i, j, k + 1)] = C22  # Tg k+1 term

                if (i == 0):  # Ti term is hot in BC
                    # The known inlet temperature moves the C01 term to the
                    # right-hand side.
                    C[index, 0] = -I.T_hot_in * C01  # BC = Thot in
                    A[index,
                      find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term

                else:  # interior
                    # NOTE(review): this branch also runs for i == a - 1, where
                    # find_index is asked for i + 1 == a -- confirm find_index
                    # handles that index as intended.
                    A[index, find.find_index(i, j, k)] = C01  #Tyz i term
                    A[index,
                      find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term

#end of Hot channel
#############################################################################################
#Cold Channel

            if (np.mod(j, 4) == 3 and np.mod(k, 2) == 1):  # Cold channel

                C11 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C12 = 1 / (Rwy + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C21 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                C22 = 1 / (Rwz + Rc(Tg[i, j, k], Pg[i, j, k], i, j, k))
                # Signs of the m_channel * Cp term are swapped relative to the
                # hot channel.
                C01 = -C11 - C21 - I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)
                C02 = -C11 - C21 + I.m_channel * find.Cp(
                    Tg[i, j, k], Pg[i, j, k], i, j, k)

                A[index, find.find_index(i, j - 1, k)] = C11  # Tg j-1 term
                A[index, find.find_index(i, j + 1, k)] = C12  # Tg j+1 term
                A[index, find.find_index(i, j, k - 1)] = C21  # Tg k-1 term
                A[index, find.find_index(i, j, k + 1)] = C22  # Tg k+1 term

                if (i == a - 1):  # Ti-1 term is adjusted - Cold outlet
                    A[index, find.find_index(i, j, k)] = C01  #Tyz i term
                    # The known cold inlet temperature moves the C02 term to
                    # the right-hand side.
                    C[index,
                      0] = -I.T_cold_in * C02  #Cold in BC-> Tyz i+1 term

                else:  # interior
                    A[index, find.find_index(i, j, k)] = C01  #Tyz i term
                    A[index,
                      find.find_index(i + 1, j, k)] = C02  # Tyz i+1 term

#end of cold channel
#############################################################################################
#Wall nodes

            if (np.mod(j, 2) == 0 and np.mod(k, 2) == 1):  # Type 1 Solid node
                # Neighbours in j (j-1 and j+1) go through Rc; x and z
                # neighbours use Rwx / Rwz only.
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                C11 = 0.5 / (Rwy +
                             Rc(Tg[i, j - 1, k], Pg[i, j - 1, k], i, j - 1, k))
                C12 = 0.5 / (Rwy +
                             Rc(Tg[i, j + 1, k], Pg[i, j + 1, k], i, j + 1, k))
                C21 = 1 / (2 * Rwz)
                C22 = 1 / (2 * Rwz)

                if (i == 0):  # Hot in BC

                    if (
                            np.mod(j + 1, 4) == 1
                    ):  # hot channel at j+1 and cold at j-1 Hot in BC at i,j+1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0  #i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  #i+1,j,k
                        A[index, find.find_index(i, j, k - 1)] = C21  #i,j,k-1
                        A[index, find.find_index(i, j, k + 1)] = C22  #i,j,k+1
                        A[index, find.find_index(i, j - 1, k)] = C11  #i,j-1,k
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        C[index, 0] = -I.T_hot_in * C12

                    elif (np.mod(
                            j - 1,
                            4) == 1):  # Hot channel at j-1 Hot in BC at i,j-1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i + 1, j, k)] = C02
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        C[index, 0] = -I.T_hot_in * C11

                elif (i == a - 1):  # Cold in BC

                    if (
                            np.mod(j + 1, 4) == 1
                    ):  # hot channel at j+1 and cold at j-1 Cold in BC at i+1,j-1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i, j + 1, k)] = C12
                        # NOTE(review): i + 1 == a here (i == a - 1) -- confirm
                        # find_index is meant to be called with that index.
                        A[index, find.find_index(i + 1, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        C[index, 0] = -I.T_cold_in * C11

                    elif (np.mod(j - 1, 4) == 1
                          ):  # Hot channel at j-1 Cold in BC at i+1,j+1
                        C0 = -C02 - 2 * C11 - 2 * C12 - C21 - C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        # NOTE(review): i + 1 == a here as well -- confirm.
                        A[index, find.find_index(i + 1, j - 1, k)] = C11
                        C[index, 0] = -I.T_cold_in * C12
                else:
                    # Interior Type 1 node: both x neighbours plus the i and
                    # i+1 entries of both adjacent channels.
                    C0 = -C02 - C01 - 2 * C11 - 2 * C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i + 1, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11
                    A[index, find.find_index(i + 1, j - 1, k)] = C11

            if (np.mod(j, 2) == 0 and np.mod(k, 2) == 0):  # Type 2 Solid node
                # All six coefficients use only Rwx / Rwy / Rwz (no Rc term).
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                C11 = 1 / (2 * Rwy)
                C12 = 1 / (2 * Rwy)
                C21 = 1 / (2 * Rwz)
                C22 = 1 / (2 * Rwz)
                if (i == 0):
                    # No i-1 neighbour: C01 is left out of the diagonal.
                    C0 = -C02 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11

                elif (i == a - 1):
                    # No i+1 neighbour: C02 is left out of the diagonal.
                    C0 = -C01 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11

                else:
                    C0 = -C01 - C02 - C11 - C12 - C21 - C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11

            if (np.mod(j, 2) == 1 and np.mod(k, 2) == 0):  # Type 3 Solid node

                # Neighbours in k (k-1 and k+1) go through Rc; x and y
                # neighbours use Rwx / Rwy only.
                C01 = 1 / (2 * Rwx)
                C02 = 1 / (2 * Rwx)
                C11 = 1 / (2 * Rwy)
                C12 = 1 / (2 * Rwy)
                C21 = 0.5 / (Rwz +
                             Rc(Tg[i, j, k - 1], Pg[i, j, k - 1], i, j, k - 1))
                C22 = 0.5 / (Rwz +
                             Rc(Tg[i, j, k + 1], Pg[i, j, k + 1], i, j, k + 1))

                if (i == 0):  # Hot in BC

                    if (
                            np.mod(j, 4) == 1
                    ):  # hot channel at j and cold at j-1, j+1-> Hot in BC at i,k-1 k+1
                        C0 = -C02 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  #i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  #i+1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  #i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  #i,j+1,k
                        A[index,
                          find.find_index(i + 1, j, k - 1)] = C21  #i+1,j,k-1
                        A[index,
                          find.find_index(i + 1, j, k + 1)] = C22  #i+1,j,k+1

                        C[index, 0] = -I.T_hot_in * (C21 + C22)

                    elif (np.mod(
                            j,
                            4) == 3):  # Cold channel at j and hot at j-1, j+1
                        C0 = -C02 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  #i,j,k
                        A[index, find.find_index(i + 1, j, k)] = C02  #i+1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  #i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  #i,j+1,k
                        A[index, find.find_index(i + 1, j, k -
                                                 1)] = C21  #Tyz i+1,j,k-1
                        A[index, find.find_index(i + 1, j, k +
                                                 1)] = C22  #Tyz i+1,j,k+1
                        A[index,
                          find.find_index(i, j, k - 1)] = C21  #Tyz i,j,k-1
                        A[index,
                          find.find_index(i, j, k + 1)] = C22  #Tyz i,j,k+1

                elif (i == a - 1):  # Cold in BC

                    if (np.mod(j, 4) == 1):  # hot channel at j
                        C0 = -C01 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0  #ijk
                        A[index, find.find_index(i - 1, j, k)] = C01  #i-1,j,k
                        A[index, find.find_index(i, j - 1, k)] = C11  #i,j-1,k
                        A[index, find.find_index(i, j + 1, k)] = C12  #i,j+1,k
                        A[index, find.find_index(i, j, k - 1)] = C21  #i,j,k-1
                        A[index, find.find_index(i, j, k + 1)] = C22  #i,j,k+1
                        # NOTE(review): i + 1 == a in the next two lookups
                        # (i == a - 1) -- confirm find_index accepts it.
                        A[index,
                          find.find_index(i + 1, j, k - 1)] = C21  #i+1,j,k-1
                        A[index,
                          find.find_index(i + 1, j, k + 1)] = C22  #i+1,j,k+1

                    elif (np.mod(j, 4) == 3
                          ):  # Cold channel at j Cold in BC at i+1,k-1,+1
                        C0 = -C01 - C11 - C12 - 2 * C21 - 2 * C22
                        A[index, find.find_index(i, j, k)] = C0
                        A[index, find.find_index(i - 1, j, k)] = C01
                        A[index, find.find_index(i, j + 1, k)] = C12
                        A[index, find.find_index(i, j - 1, k)] = C11
                        A[index, find.find_index(i, j, k - 1)] = C21
                        A[index, find.find_index(i, j, k + 1)] = C22
                        C[index, 0] = -I.T_cold_in * (C21 + C22)
                else:
                    C0 = -C02 - C01 - C11 - C12 - 2 * C21 - 2 * C22
                    A[index, find.find_index(i, j, k)] = C0
                    A[index, find.find_index(i - 1, j, k)] = C01
                    A[index, find.find_index(i + 1, j, k)] = C02
                    A[index, find.find_index(i, j, k - 1)] = C21
                    A[index, find.find_index(i, j, k + 1)] = C22
                    A[index, find.find_index(i + 1, j, k + 1)] = C22
                    A[index, find.find_index(i + 1, j, k - 1)] = C21
                    A[index, find.find_index(i, j + 1, k)] = C12
                    A[index, find.find_index(i, j - 1, k)] = C11

    # Rows skipped above (nodes on the j/k boundary planes) are still all zero
    # here; the system is handed to BC.adiabatic_wall_BC before being returned.
    A, C = BC.adiabatic_wall_BC(A, C, Rwx, Rwy, Rwz, Tg, Tyz, Pg, P)
    return (A, C)
Пример #12
0
                                   nd=nd,
                                   VF_model=1,
                                   LS_model=LS_model,
                                   Closure_0_model=Closure_0_model,
                                   Closure_1_model=Closure_1_model,
                                   epsFact_density=epsFact_density,
                                   stokes=False,
                                   useVF=useVF,
				   useRBLES=useRBLES,
				   useMetrics=useMetrics,
                                   eb_adjoint_sigma=1.0,
                                   forceStrongDirichlet=ns_forceStrongDirichlet,
                                   turbulenceClosureModel=ns_closure)


# Module-level boundary-condition helper object; getDBC_p calls its freeSlip() method.
setBC = BC.boundaryConditions()

def getDBC_p(x,flag):
    """Return the pressure Dirichlet condition for the boundary tagged ``flag``.

    'top' and 'right' give ``outflowPressure``, 'bottom' gives whatever
    ``setBC.freeSlip("pDirichlet")`` returns, and 'left' or any other flag
    gives None.  ``x`` is not used.
    """
    kind = "pDirichlet"
    # Tags are tested in the original order: top, left, right, bottom.
    if flag == boundaryTags['top']:
        condition = outflowPressure
    elif flag == boundaryTags['left']:
        condition = None
    elif flag == boundaryTags['right']:
        condition = outflowPressure
    elif flag == boundaryTags['bottom']:
        condition = setBC.freeSlip(kind)
    else:
        condition = None
    return condition


def getDBC_u(x,flag):
    BCType = "uDirichlet"
Пример #13
0
def parse_fastq_by_multitag(directory, f_gzipped_fastqfile, r_gzipped_fastqfile,
        q = "fastq",
        f_seqtag_length = 8,
        r_seqtag_length = 8,
        f_multitag_length = 6,
        r_multitag_length = 6,
        f_lintag_length = 38,
        r_lintag_length = 38,
        f_spacer_length = 43, #distance to the first barcode in the forward read (not counting the multitag and the seqtag)
        r_spacer_length = 29, #distance to the second barcode in the reverse read (not counting the multitag and the seqtag)
        min_qs = 30, #the minimum average quality score for both lineage tags
        lintag_grep_filter1 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #first barcode
        lintag_grep_filter2 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #second barcode
        clip_ends = 1, #whether to clip the front and back ends off of lintag1 and lintag2
        lintag1_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag1 after this sequence
        lintag2_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag2 after this sequence
        lintag1_rear_clipper = '(.TAA|A.AA|AT.A|ATA.)', #only report lintag1 before this sequence; it is searched for in the REVERSED lintag
        lintag2_rear_clipper = '(.TAA|A.AA|AT.A|ATA.)', #only report lintag2 before this sequence; it is searched for in the REVERSED lintag
        multitags = ["TAGCTTGCGTAC", "CGATGTGAGACG"], #concatenated multiplexing tags from the first and second reads that identify a sample
        write_multitags = False): #also write the observed multitag of matched reads to file

    """
    Parse paired forward/reverse gzipped FastQ files and sort reads by multiplexing tag.

    Each read pair is kept only if the mean phred quality of both lineage-tag
    regions is greater than min_qs and both regions pass BC.grep with
    lintag_grep_filter1 / lintag_grep_filter2.  The concatenated multiplexing
    tag of a kept pair is compared with `multitags` through BC.best_match and
    BC.mismatches.

    Output files, written inside `directory`:
      * pairs close enough to one of `multitags` go to
        <multitag>_seqtag.txt, <multitag>_lintag1.txt, <multitag>_lintag2.txt
        (and <multitag>_multitag.txt when write_multitags is true);
      * all other kept pairs go to unmatched_seqtag.txt,
        unmatched_lintag1.txt, unmatched_lintag2.txt, unmatched_multitag.txt.

    When clip_ends is true each lineage tag is trimmed to the text after the
    front clipper match and before the rear clipper match.

    Side effects: changes the working directory to `directory` and prints
    progress messages.  Returns None.

    NOTE(review): the rear clipper defaults here are not the reversed form of
    the trailing group in the grep filters, although the pattern is applied
    to the reversed tag -- confirm the intended default.
    """

    from Bio import SeqIO
    import os
    import gzip
    import numpy
    import BC
    import re
    import sys
    try:
        from itertools import izip #Python 2
    except ImportError:
        izip = zip #Python 3: the built-in zip is already lazy

    #resolve the directory before chdir so relative paths and a missing
    #trailing separator both work
    directory = os.path.abspath(directory)
    os.chdir(directory)
    print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
    print( "Saving the combined forward and reverse sequencing tags as seqtag.txt")
    print( "Saving the combined forward and reverse multiplexing tags  as multitag.txt")
    print( "Saving the first lineage tag as lintag1.txt")
    print( "Saving the second lineage tag as lintag2.txt")

    #region boundaries within each read: seqtag | multitag | spacer | lintag
    f_seqtag_end = f_seqtag_length
    f_multitag_end = f_seqtag_end + f_multitag_length
    f_lintag_start = f_multitag_end + f_spacer_length
    f_lintag_end = f_lintag_start + f_lintag_length
    r_seqtag_end = r_seqtag_length
    r_multitag_end = r_seqtag_end + r_multitag_length
    r_lintag_start = r_multitag_end + r_spacer_length
    r_lintag_end = r_lintag_start + r_lintag_length

    #mismatch limits; '//' keeps the integer division the original '/' performed under Python 2
    multitag_length = f_multitag_length + r_multitag_length
    max_mismatches = (multitag_length + 1) // 3
    accept_below = (multitag_length + 1) // 4

    def clip(tag, front_clipper, rear_clipper):
        #keep only the text after the front match and before the rear match;
        #the rear pattern is searched for in the reversed tag
        start = re.search(front_clipper, tag).span()[1]
        end = -re.search(rear_clipper, tag[::-1]).span()[1]
        if end == 0: end = len(tag)
        return tag[start:end]

    opened = [] #every handle opened here, so all of them get closed even on error

    def open_output(name):
        handle = open(os.path.join(directory, name), 'w')
        opened.append(handle)
        return handle

    try:
        #output files for reads that sort to a multiplexing tag
        matched_kinds = ['seqtag', 'lintag1', 'lintag2']
        if write_multitags: matched_kinds.append('multitag')
        matched = {}
        for tag in multitags:
            matched[tag] = dict((kind, open_output(tag + '_' + kind + '.txt')) for kind in matched_kinds)

        #output files for reads that do not sort to a multiplexing tag
        unmatched = dict((kind, open_output('unmatched_' + kind + '.txt'))
                for kind in ['seqtag', 'lintag1', 'lintag2', 'multitag'])

        #open files for reading by SeqIO (text mode is spelled differently on Python 2 and 3)
        if sys.version_info[0] < 3:
            gz_mode = "rU"
        else:
            gz_mode = "rt"
        f_handle = gzip.open(os.path.join(directory, f_gzipped_fastqfile), gz_mode)
        opened.append(f_handle)
        r_handle = gzip.open(os.path.join(directory, r_gzipped_fastqfile), gz_mode)
        opened.append(r_handle)
        f_file = SeqIO.parse(f_handle, q)
        r_file = SeqIO.parse(r_handle, q)

        quality_reads = 0
        total_reads = 0
        for f, r in izip(f_file, r_file):
            total_reads = total_reads + 1
            fq = f.letter_annotations["phred_quality"]
            rq = r.letter_annotations["phred_quality"]
            #both lineage tags must have a mean quality above min_qs
            if not (numpy.mean(fq[f_lintag_start:f_lintag_end]) > min_qs
                    and numpy.mean(rq[r_lintag_start:r_lintag_end]) > min_qs):
                continue
            fr = str(f.seq)
            rr = str(r.seq)
            ftag = fr[f_lintag_start:f_lintag_end]
            rtag = rr[r_lintag_start:r_lintag_end]
            #both lineage tags must meet the regular expression filter
            if not (BC.grep(ftag, lintag_grep_filter1) and BC.grep(rtag, lintag_grep_filter2)):
                continue
            quality_reads = quality_reads + 1 #both lintags passed the quality and grep filters

            seqtag = fr[:f_seqtag_end] + rr[:r_seqtag_end]
            m = fr[f_seqtag_end:f_multitag_end] + rr[r_seqtag_end:r_multitag_end] #the concatenated multiplexing tag

            #find the closest matching multitag and its distance
            j = BC.best_match(m, multitags, MAX = max_mismatches)
            if j is not None and j > -1:
                tm = BC.mismatches(m, multitags[j])
            else:
                tm = 1000

            if clip_ends:
                ftag = clip(ftag, lintag1_front_clipper, lintag1_rear_clipper)
                rtag = clip(rtag, lintag2_front_clipper, lintag2_rear_clipper)

            #Route the read to exactly one set of files. Previously the 'else'
            #was attached to 'if write_multitags', so unmatched reads were
            #dropped and matched reads could be written twice.
            if tm < accept_below:
                out = matched[multitags[j]]
            else:
                out = unmatched
            out['lintag1'].write(ftag + '\n')
            out['lintag2'].write(rtag + '\n')
            out['seqtag'].write(seqtag + '\n')
            if 'multitag' in out: out['multitag'].write(m + '\n')

        print ( str(quality_reads) + " out of " + str(total_reads) +" reads passed grep and quality filters")
    finally:
        for handle in opened:
            handle.close()
Пример #14
0
def parse_fastq(directory, f_gzipped_fastqfile, r_gzipped_fastqfile,
		q = "fastq", #the type of fastq file coming off of the sequencer
		f_seqtag_length = 8, #the length of the sequencing tag on the first read (UMI1)
		r_seqtag_length = 8, #the length of the sequencing tag on the second read (UMI2)
		f_multitag_length = 6, #the length of the multiplexing tag on the first read
		r_multitag_length = 6, #the length of the multiplexing tag on the second read
		f_lintag_length = 38, #the length of the lineage tag on the first read (first barcode)
		r_lintag_length = 38,  #the length of the lineage tag on the second read (second barcode)
		f_spacer_length = 43, #distance to first barcode in forward read (ignoring the length of the multitag and the seqtag)
		r_spacer_length = 29, #distance to second barcode in reverse read (ignoring the length of the multitag and the seqtag)
		min_qs = 30, #the minimum average quality score for both lineage tags
		lintag_grep_filter1 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #first barcode
		lintag_grep_filter2 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #second barcode
		clip_ends = 1, #logical of whether or not to clip the front and back ends off of lintag1 and lintag2
		lintag1_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag1 after this sequence
		lintag2_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag2 after this sequence
		lintag1_rear_clipper = '(.ATA|A.TA|AA.A|AAT.)', #only report lintag1 before this sequence; the pattern is matched against the REVERSED lintag, so write it reversed
		lintag2_rear_clipper = '(.ATA|A.TA|AA.A|AAT.)'): #only report lintag2 before this sequence; the pattern is matched against the REVERSED lintag, so write it reversed

	"""
	Parses paired forward (F) and reverse (R) gzipped FastQ files and saves the
	UMIs (sequencing tags), multiplexing tags, and lineage tags (barcodes).

	Each read is sliced at fixed offsets: seqtag, then multitag, then a spacer,
	then the lineage tag. A read pair is kept only when
	  * the mean phred quality over each lineage tag is strictly greater than min_qs, and
	  * BC.grep accepts the forward lineage tag with lintag_grep_filter1 and the
	    reverse lineage tag with lintag_grep_filter2.

	For every kept pair one line is appended to each of four output files, so
	line N of every file refers to the same read pair:
	  directory + '_seqtag.txt'    forward seqtag + reverse seqtag
	  directory + '_multitag.txt'  forward multitag + reverse multitag
	  directory + '_lintag1.txt'   forward lineage tag (clipped if clip_ends)
	  directory + '_lintag2.txt'   reverse lineage tag (clipped if clip_ends)

	All paths are built by plain string concatenation with directory (both the
	inputs and the outputs), so directory must end with a path separator for
	the files to live inside it. The working directory is changed to directory.

	When clip_ends is true and a clipper pattern does not match a lineage tag,
	that end of the tag is left unclipped rather than aborting the run.

	Pairs are read in lockstep; iteration stops at the end of the shorter file.
	Prints a summary of how many read pairs passed the filters. Returns None.
	"""

	from Bio import SeqIO
	import os
	import gzip
	import numpy
	import BC
	from itertools import izip
	os.chdir(directory)
	print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
	print( "Saving the combined forward and reverse sequencing tags as seqtag.txt")
	print( "Saving the combined forward and reverse multiplexing tags  as multitag.txt")
	print( "Saving the first lineage tag as lintag1.txt")
	print( "Saving the second lineage tag as lintag2.txt")


	#assign boundries: (seqtag start, seqtag end, multitag end, spacer end, lintag end)
	f_boundries = (0, f_seqtag_length , f_multitag_length + f_seqtag_length,
			f_multitag_length + f_seqtag_length + f_spacer_length,
			f_multitag_length + f_seqtag_length + f_spacer_length + f_lintag_length)
	r_boundries = (0, r_seqtag_length , r_multitag_length + r_seqtag_length,
			r_multitag_length + r_seqtag_length + r_spacer_length,
			r_multitag_length + r_seqtag_length + r_spacer_length + r_lintag_length)
	#the lineage tag window is used several times per read, so name it once
	f_lin_start, f_lin_end = f_boundries[3], f_boundries[4]
	r_lin_start, r_lin_end = r_boundries[3], r_boundries[4]

	quality_reads = 0
	total_reads = 0
	opened = [] #every handle opened below, so the finally clause can close them all
	try:
		#open files for writing
		seqtag = open(directory + '_seqtag.txt', 'w')
		opened.append(seqtag)
		multitag = open(directory + '_multitag.txt', 'w')
		opened.append(multitag)
		lintag1 = open(directory + '_lintag1.txt', 'w')
		opened.append(lintag1)
		lintag2 = open(directory + '_lintag2.txt', 'w')
		opened.append(lintag2)

		#open files for reading by SeqIO; the gzip handles are kept so that they
		#can be closed explicitly (closing the parser does not close them)
		f_handle = gzip.open(directory + f_gzipped_fastqfile, "rU")
		opened.append(f_handle)
		r_handle = gzip.open(directory + r_gzipped_fastqfile, "rU")
		opened.append(r_handle)
		f_file = SeqIO.parse(f_handle, q)
		r_file = SeqIO.parse(r_handle, q)

		#eliminate low quality reads and reads that don't pass the grep filter, optionally clip off ends of lintags
		for f, r in izip(f_file, r_file):
			fq = f.letter_annotations["phred_quality"]
			rq = r.letter_annotations["phred_quality"]
			total_reads = total_reads + 1
			#checks that the quality scores of forward and reverse lintags are OK
			if not (numpy.mean(fq[f_lin_start:f_lin_end]) > min_qs and numpy.mean(rq[r_lin_start:r_lin_end]) > min_qs):
				continue
			fr = str(f.seq)
			rr = str(r.seq)
			ftag = fr[f_lin_start:f_lin_end]
			rtag = rr[r_lin_start:r_lin_end]
			#checks that both lineage tags meet the regular expression filter
			if not (BC.grep(ftag, lintag_grep_filter1) and BC.grep(rtag, lintag_grep_filter2)):
				continue
			quality_reads = quality_reads + 1
			seqtag.write(fr[f_boundries[0]:f_boundries[1]] + rr[r_boundries[0]:r_boundries[1]] +'\n')
			multitag.write(fr[f_boundries[1]:f_boundries[2]] + rr[r_boundries[1]:r_boundries[2]] +'\n')
			if (clip_ends):
				ftag = _clip_lintag(ftag, lintag1_front_clipper, lintag1_rear_clipper)
				rtag = _clip_lintag(rtag, lintag2_front_clipper, lintag2_rear_clipper)
			lintag1.write(ftag + '\n')
			lintag2.write(rtag + '\n')
	finally:
		#runs on success and on error, so no handle is left open
		for handle in opened:
			handle.close()

	print ( str(quality_reads) + " out of " + str(total_reads) +" reads  passed grep and quality filters")


def _clip_lintag(tag, front_clipper, rear_clipper):
	"""
	Returns tag with its front and rear flanking sequences removed.

	The front is cut just after the first match of front_clipper in tag.
	rear_clipper is searched in the REVERSED tag, and the rear is cut at the
	end of that match, i.e. just before the matched sequence in the original
	orientation. An end whose pattern does not match is left unclipped.
	"""
	import re
	front = re.search(front_clipper, tag)
	rear = re.search(rear_clipper, tag[::-1])
	start = front.span()[1] if front else 0
	#an offset of e from the start of the reversed tag is len(tag) - e in the original
	end = len(tag) - rear.span()[1] if rear else len(tag)
	return tag[start:end]