Exemplo n.º 1
0
def findUnique():
    """Print each sequence followed by the positions of its essential substrings.

    Records come from ``sequenceAnalysis.FastAreader()`` (built with no
    arguments). In a first pass every sequence is handed to one shared
    ``UniqueFinder`` via ``addSequence`` and ``getPowerSet`` is called after each
    addition. The records are then sorted and revisited: for each one the
    header and ``uniqueFinder.sequence`` are printed, then one line per
    occurrence of every substring returned by ``getEssentials()``, ordered by
    position and prefixed with that many dots.
    """
    reader = sequenceAnalysis.FastAreader()
    finder = UniqueFinder()
    records = []

    # Pass 1: register every sequence with the finder.
    for header, sequence in reader.readFasta():
        # header[8:11] leads the tuple so that it drives the sort below.
        # NOTE(review): presumably the amino-acid code of the tRNA - confirm.
        records.append((header[8:11], header, sequence))
        finder.addSequence(sequence)
        finder.getPowerSet()
    records.sort()

    # Pass 2: report each sequence in sorted order.
    for _, header, sequence in records:
        finder.addSequence(sequence)
        print(header)
        print(finder.sequence)

        # One (substring, position) pair per occurrence of each essential.
        hits = [
            (essential, pos)
            for essential in finder.getEssentials()
            for pos in allindices(finder.sequence, essential)
        ]
        hits.sort(key=lambda hit: hit[1])
        for essential, pos in hits:
            print("{0}{1}".format(pos * ".", essential))
Exemplo n.º 2
0
def main():
    """Print sequence length, GC content and relative codon usage.

    All records from ``sequenceAnalysis.FastAreader()`` are accumulated in one
    ``sequenceAnalysis.NucParams`` object. The report contains:

    * the total nucleotide count in Mb,
    * the GC percentage,
    * one line per entry of ``rnaCodonTable`` (sorted by amino acid, then
      codon) giving the codon's share of its amino acid and its raw count.

    Counts that are missing or zero are reported as 0 instead of raising, so
    empty input or an amino acid that never occurs no longer aborts the report.
    """
    # make sure to change this to use stdin
    myReader = sequenceAnalysis.FastAreader()
    myNuc = sequenceAnalysis.NucParams()
    for head, seq in myReader.readFasta():
        myNuc.addSequence(seq)

    # Query the total once and reuse it for both the length and GC lines.
    nucCount = myNuc.nucCount()
    print("\nsequence length: {:.2f} Mb\n".format(nucCount / (10**6)))

    nucComp = myNuc.nucComposition()
    GC = nucComp.get('G', 0) + nucComp.get('C', 0)
    # Guard against empty input: no nucleotides means no GC fraction.
    gcPercent = (GC / nucCount) * 100 if nucCount else 0.0
    print("GC content: {:.1f}%\n".format(gcPercent))

    # Sort codons by amino acid first, then alphabetically by codon.
    sortedCodons = sorted(myNuc.rnaCodonTable.items(),
                          key=operator.itemgetter(1, 0))

    codonComp = myNuc.codonComposition()
    AAcomp = myNuc.aaComposition()
    for nuc, aa in sortedCodons:
        codonTotal = codonComp.get(nuc, 0)
        aaTotal = AAcomp.get(aa, 0)
        # An amino acid that was never seen has no meaningful usage ratio.
        val = codonTotal / aaTotal if aaTotal else 0.0
        print('{:s} : {:s} {:5.1f} ({:6d})'.format(nuc, aa, val * 100,
                                                   codonTotal))
    print('\r')
Exemplo n.º 3
0
def main(myCommandLine=None):
    '''
    Find ORFs in every record of a FASTA file and write them to a text file.

    With no argument, a CommandLine is built from a fixed demonstration
    argument list. With an argument, it is handed to CommandLine and the
    parsed inFile, outFile, start and minGene values are then overwritten
    with hard-coded settings (the section the template marks as "replace the
    code between comments").

    For each record the header is written to the output file, followed by one
    line per ORF (frame, start, stop, length) taken from the six lists
    returned by ORF.ORFfinder, longest first. Finally, if the longestGene
    option is truthy, its value is printed to standard output.
    '''
    if myCommandLine is not None:
        myCommandLine = CommandLine(myCommandLine)
        ###### replace the code between comments.
        parsed = myCommandLine.args
        # input and output file names
        parsed.inFile = 'vulgaris.fasta'
        parsed.outFile = 'vulgarisORFdata-ATG-100.txt'
        # list of start codons
        parsed.start = ['ATG', 'GTG', 'CTG', 'TTG']
        # minimum gene length to include
        parsed.minGene = 100
        ######
    else:
        myCommandLine = CommandLine([
            'vulgaris.fasta', 'vulgarisORFdata-ATG-100.txt', '--longestGene',
            '--start=ATG', '--minGene=100'
        ])

    import sequenceAnalysis as seqTools  # local alias for the helper module

    fastaSource = seqTools.FastAreader(myCommandLine.args.inFile)
    destination = myCommandLine.args.outFile
    rowFormat = '{:+d} {:>5d}..{:>5d} {:>5d}'

    with open(destination, 'w') as ORFData:
        for head, seq in fastaSource.readFasta():
            # file=ORFData sends the line to the output file, not the console
            print(head, file=ORFData)

            # A fresh ORF object per record; ORFfinder returns six lists.
            perFrame = seqTools.ORF().ORFfinder(seq)
            merged = (perFrame[0] + perFrame[1] + perFrame[2] +
                      perFrame[3] + perFrame[4] + perFrame[5])

            # Element 3 of each entry is the length: longest ORFs first.
            for entry in sorted(merged, key=lambda rec: rec[3], reverse=True):
                print(rowFormat.format(entry[0], entry[1], entry[2],
                                       entry[3]),
                      file=ORFData)

    if myCommandLine.args.longestGene:
        print('longestGene is', str(myCommandLine.args.longestGene))
Exemplo n.º 4
0
 def __init__(self):
     """
     Create the two helper objects this instance works with.

     ``self.myReader`` is an ``SA.FastAreader`` and ``self.thisAnalyzer`` is an
     ``SA.NucParams``; both are constructed without arguments, reader first.
     """
     self.myReader, self.thisAnalyzer = SA.FastAreader(), SA.NucParams()
Exemplo n.º 5
0
def main():
    """Compare two genomes by GC content and codon usage, as log2 ratios.

    ``testGenome.fa`` (genome 1) and ``haloVolc1_1-genes.fa`` (genome 2) are
    each accumulated into their own ``sequenceAnalysis.NucParams``. The report
    prints log2 of the GC-content ratio, then one line per entry of
    ``rnaCodonTable`` (sorted by amino acid, then codon) with the log2 ratio
    of relative codon usage and the log2 ratio of raw codon counts.

    Fix: GC1 previously added genome 1's G count to genome 2's C count.

    Raises:
        ZeroDivisionError / ValueError: if a count used as a divisor or as a
            logarithm argument is zero (unchanged from the original).
    """
    # make sure to change this to use stdin
    myReader1 = sequenceAnalysis.FastAreader('testGenome.fa')
    myNuc1 = sequenceAnalysis.NucParams()
    for head, seq in myReader1.readFasta():
        myNuc1.addSequence(seq)
    # make sure to change this to use stdin
    myReader2 = sequenceAnalysis.FastAreader('haloVolc1_1-genes.fa')
    myNuc2 = sequenceAnalysis.NucParams()
    for head, seq in myReader2.readFasta():
        myNuc2.addSequence(seq)

    # find gc content
    nucComp1 = myNuc1.nucComposition()
    nucComp2 = myNuc2.nucComposition()
    nucCount1 = myNuc1.nucCount()
    nucCount2 = myNuc2.nucCount()
    # Both terms of GC1 must come from genome 1.
    GC1 = nucComp1.get('G') + nucComp1.get('C')
    GC2 = nucComp2.get('G') + nucComp2.get('C')
    log2 = math.log(((GC1 / nucCount1) * 100) /
                    ((GC2 / nucCount2) * 100), 2)
    # print gc content
    print("GC content (log2(GC1/GC2)): {:.2f}\n".format(log2))

    # sort codons in alpha order, by Amino Acid
    sortedCodons = sorted(myNuc1.rnaCodonTable.items(),
                          key=operator.itemgetter(1, 0))

    # calculate relative codon usage & frequency per codon and print
    print("codon : amino log2(usage1/usage2) (log2(count1/count2))")
    codonComp1 = myNuc1.codonComposition()
    codonComp2 = myNuc2.codonComposition()
    AAcomp1 = myNuc1.aaComposition()
    AAcomp2 = myNuc2.aaComposition()
    for nuc, aa in sortedCodons:
        # Share of this codon among all codons for the same amino acid.
        usage1 = codonComp1.get(nuc) / AAcomp1.get(aa)
        usage2 = codonComp2.get(nuc) / AAcomp2.get(aa)
        relativeComp = math.log(usage1 / usage2, 2)
        count = math.log(codonComp1.get(nuc) / codonComp2.get(nuc), 2)
        # NOTE(review): the log2 usage ratio is scaled by 100 before printing,
        # which does not match the column header - confirm this is intended.
        print('{:s}    :    {:s}     {:5.2f}          ({:6.2f})'.format(
            nuc, aa, relativeComp * 100, count))
    print('\r')
Exemplo n.º 6
0
def analyze(file):
    """Accumulate every sequence in ``file`` and return its composition tables.

    Args:
        file: passed unchanged to ``sequenceAnalysis.FastAreader``.

    Returns:
        A 4-tuple ``(nucComp, codonComp, aaComp, rnaCodonDict)`` holding the
        results of ``nucComposition()``, ``codonComposition()`` and
        ``aaComposition()`` plus the sorted ``(key, value)`` pairs of the
        analyzer's ``rnaCodonTable``.
    """
    reader = sequenceAnalysis.FastAreader(file)
    counter = sequenceAnalysis.NucParams()

    # Headers are not needed; only the sequences are counted.
    for _, sequence in reader.readFasta():
        counter.addSequence(sequence)

    return (
        counter.nucComposition(),
        counter.codonComposition(),
        counter.aaComposition(),
        sorted(counter.rnaCodonTable.items()),
    )
Exemplo n.º 7
0
def main(myCommandLine=None):
    """
    Find ORFs in every record of the input FASTA file and write them out.

    Args:
        myCommandLine: argument list handed to ``CommandLine``. When None, a
            fixed demonstration argument list (``tass2.fa`` ->
            ``tass2ORFdata-ATG-100.txt``) is used instead.

    The parsed options used here are ``inFile``, ``outFile``, ``longestGene``,
    ``start`` and ``minGene``. For each record an ``OrfFinder`` is built from
    the bases and their reverse complement, and its ``findOrfs``,
    ``findRevOrfs`` and ``writeFramesToFile`` methods are called with the open
    output file.

    The output file is now opened once in write mode inside a ``with`` block,
    so it is truncated as before and always closed, even on error.
    """
    if myCommandLine is None:
        myCommandLine = CommandLine([
            'tass2.fa', 'tass2ORFdata-ATG-100.txt', '--longestGene',
            '--start=ATG', '--minGene=100'
        ])
    else:
        myCommandLine = CommandLine(myCommandLine)

    args = myCommandLine.args

    # Parse inFile and initialize it to be read
    orfReader = sequenceAnalysis.FastAreader(args.inFile)

    # 'w' truncates any previous content; the context manager closes the file.
    with open(args.outFile, 'w') as f:
        # loop through sequences in the file
        # find ORFs and write them to file
        for head, seq in orfReader.readFasta():
            nucParams = sequenceAnalysis.NucParams('')
            nucParams.addSequence(seq)
            nucParams.initializeObjects()
            # Flatten the codon list back into a list of single characters.
            bases = list(''.join(nucParams.codonList))
            reverseBases = getReverseComplement(bases)

            finder = OrfFinder(bases,
                               head,
                               minGene=args.minGene,
                               longestGene=args.longestGene,
                               start=args.start,
                               revSeq=reverseBases)
            finder.findOrfs(f)
            finder.findRevOrfs(f)
            finder.writeFramesToFile(f)
Exemplo n.º 8
0
    def __init__(self):
        """
        Read every record from ``SA.FastAreader()`` and pre-compute its power set.

        Attributes:
            powerSetList (list): one ``self.powerSet(...)`` result per record,
                in reading order.
            uniqueList (list): created empty here; not filled by this method.
            headerSequenceDictionary (dict): maps the 0-based record index to
                ``[header, filtered sequence]``.
        """
        self.powerSetList = []
        self.uniqueList = []
        self.headerSequenceDictionary = {}
        reader = SA.FastAreader()  # constructed without a file name

        for index, (header, sequence) in enumerate(reader.readFasta()):
            # removeCharacters is defined elsewhere; the original comments say
            # it strips dashes and underscores.
            cleaned = self.removeCharacters(sequence)
            self.headerSequenceDictionary[index] = [header, cleaned]
            self.powerSetList.append(self.powerSet(cleaned))
Exemplo n.º 9
0
    def __init__(self):
        """
        Load every record from ``seqAnal.FastAreader()`` and store its power set.

        After construction ``powerSetList`` holds one ``self.powerSet(...)``
        result per record, ``headerSequenceDictionary`` maps the 0-based record
        index to ``[header, filtered sequence]``, and ``uniqueList`` is an
        empty list that this method does not fill.
        """
        self.powerSetList = []
        self.uniqueList = []
        self.headerSequenceDictionary = {}
        reader = seqAnal.FastAreader()  # built with no arguments

        for position, (header, sequence) in enumerate(reader.readFasta()):
            # removeChar is defined elsewhere; per the original comment it
            # keeps only valid characters.
            cleaned = self.removeChar(sequence)
            self.headerSequenceDictionary[position] = [header, cleaned]
            self.powerSetList.append(self.powerSet(cleaned))
Exemplo n.º 10
0
def main(myCommandLine=None):
    """Read a FASTA file and print each ORF's frame, start, stop and length.

    Args:
        myCommandLine: optional argument list handed to ``CommandLine``; when
            None, ``CommandLine()`` is built with no arguments.

    Nothing is printed unless the ``longestGene`` option is truthy. When it
    is, every record's header is printed, followed by the ORFs found by
    ``OrfFinder`` whose length exceeds ``minGene``, longest first.

    Fix: the report used to run only when ``myCommandLine`` was None, so a
    supplied argument list was parsed and then ignored. Both paths now share
    the same processing.
    """
    if myCommandLine is None:
        myCommandLine = CommandLine()
    else:
        myCommandLine = CommandLine(myCommandLine)

    # The report is only produced when the longestGene flag is set.
    if not myCommandLine.args.longestGene:
        return

    minGene = myCommandLine.args.minGene
    fastaFile = sequenceAnalysis.FastAreader()
    for header, sequence in fastaFile.readFasta():
        print(header)
        orfData = OrfFinder(sequence)
        orfData.findOrfs()
        orfData.findRevOrfs()

        # NOTE(review): the comparison is strict, so ORFs of exactly minGene
        # length are dropped - confirm that is the intended cut-off.
        longEnough = [orf for orf in orfData.orfs if orf[3] > minGene]

        # Element 3 of each ORF is its length: report the longest first.
        for frame, start, stop, length in sorted(
                longEnough, key=lambda orf: orf[3], reverse=True):
            print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                frame, start, stop, length))
Exemplo n.º 11
0
def main(inCL=None):
    '''
    Print the ORFs found in every record read by FastAreader().

    inCL is an optional argument list for CommandLine; when it is None,
    CommandLine() is built with no arguments. For each record the ORFs are
    sorted by length (descending) with ties broken by smaller start first,
    the header is printed, and every ORF of at least minGene length is
    printed as frame, start, stop, length.

    With the longestGene option set, an ORF is additionally skipped when its
    anchor coordinate has already been reported for this record. The anchor
    is element 1 for negative frames and element 2 otherwise.
    '''
    # load FastAreader
    import sequenceAnalysis
    from operator import itemgetter
    ORFreader = sequenceAnalysis.FastAreader()
    myCommandLine = CommandLine() if inCL is None else CommandLine(inCL)
    opts = myCommandLine.args

    firstFour = itemgetter(0, 1, 2, 3)
    rowFormat = '{:+d} {:>5d}..{:>5d} {:>5d}'

    # locate ORFs in sequences
    for head, seq in ORFreader.readFasta():
        foundORFs = sequenceAnalysis.ORFfinder(seq, opts.start, opts.stop)
        foundORFs.finder()
        foundORFs.ORFlist.sort(key=lambda orf: (orf[3], -orf[1]),
                               reverse=True)
        print(head)

        seenAnchors = []
        for orf in foundORFs.ORFlist:
            if opts.longestGene:
                # The list is presorted longest-first, so the first ORF seen
                # for an anchor is the longest one sharing it.
                anchor = orf[1] if orf[0] < 0 else orf[2]
                if anchor in seenAnchors or orf[3] < opts.minGene:
                    continue
                seenAnchors.append(anchor)
            elif orf[3] < opts.minGene:
                continue
            print(rowFormat.format(*firstFour(orf)))
Exemplo n.º 12
0
def output(infile, outfile):
    """Write the headers and all ORFs of a FASTA file to ``outfile``.

    Args:
        infile: passed to ``sequenceAnalysis.FastAreader``.
        outfile: path of the text file to create or overwrite.

    Every record's header is written first. Forward ORFs from
    ``OrfFinder.findOrfs()`` and reverse-complement ORFs from
    ``findRevOrfs()`` are gathered into one list across all records, with
    frame and start shifted by one and reverse-strand coordinates mapped back
    onto the top strand. The list is then written longest first, ties broken
    by the smallest (left-most) start.

    Fixes:
        * ``sys.stdout`` used to be reassigned to the file and never
          restored, so any later ``print`` hit a closed file. Output is now
          redirected only inside a ``with`` block, which also closes the file
          on error.
        * The tie-break used to order equal-length ORFs by largest start,
          contradicting the stated "furthest to the left" intent.
        * Loop variables no longer shadow the builtins ``list`` and ``tuple``.
    """
    import contextlib  # local import: only needed for the scoped redirect

    fastaReader = sequenceAnalysis.FastAreader(infile)
    orfsFound = []

    with open(outfile, 'w') as f, contextlib.redirect_stdout(f):
        for header, sequence in fastaReader.readFasta():
            print(header)
            # find forward and reverse comp ORFs in sequence
            finder = OrfFinder(sequence)
            orfList = finder.findOrfs()
            revOrfList = finder.findRevOrfs()
            seqLength = len(sequence)

            # flatten the forward ORFs into single tuples for sorting
            for frameOrfs in orfList:
                for orf in frameOrfs:
                    orfsFound.append(
                        (orf[0] + 1, orf[1] + 1, orf[2], orf[3]))

            # convert reverse complementary ORFs to top strand coordinates
            for frameOrfs in revOrfList:
                for orf in frameOrfs:
                    frame = orf[0] + 1
                    startPos = orf[1] + 1
                    endPos = orf[2]
                    orfsFound.append((-frame, seqLength - endPos + 1,
                                      seqLength - startPos + 1, orf[3]))

        # largest first; among equal lengths, left-most start first
        orfsFound.sort(key=lambda orf: (-orf[3], orf[1]))

        # print out list to file in proper format
        for orf in orfsFound:
            print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(orf[0], orf[1],
                                                       orf[2], orf[3]))
Exemplo n.º 13
0
def main():
    '''Main function for the primer design program.

    Options are read from CommandLine.Command_Line(): the target FASTA file,
    restriction enzymes one and two, the start and stop codons, and a
    verbosity flag. If the target file or either enzyme is None, a message is
    printed and the program exits.

    A primerDesign.primerDesign object is built for each record of the target
    file; the report uses the object built for the last record. The GC
    fraction of the forward and reverse primers is computed with
    sequenceAnalysis.NucParams.

    The report lists, for the forward and then the reverse primer: the primer
    sequence, the added nucleotides with efficiency and hours, the digestion
    buffer and temperature, the melting temperature and the GC content. It is
    written to "PrimerOut.txt" when the verbosity flag is exactly True and
    printed to standard output otherwise.

    Fix: a target file without any FASTA record used to fail with a NameError
    because createdPrimer was never bound. It now prints a message and exits,
    like the other input checks.
    '''
    ###################################################################################################

    #Main method variables
    cl = CommandLine.Command_Line()

    #Gather the restriction enzymes
    restrictionEnzyme1 = cl.args.enzymeOne
    restrictionEnzyme2 = cl.args.enzymeTwo

    #Gather the start and stop codon
    startCodon = cl.args.start
    stopCodon = cl.args.stop

    #Verbosity Boolean
    verb = cl.args.verbosity

    #File Name
    targetFile = cl.args.target

    #Marker Number
    markerNumber = 60
    #Degree Symbol
    degree = "\u00b0"

    ###################################################################################################
    #Check that all required inputs are present; the first missing one ends the program with a message.
    requiredInputs = (
        (targetFile,
         "No Target Sequence inputted.  Please retry with a proper input file in the appropriate location.  See -h for command line help."
         ),
        (restrictionEnzyme1,
         "No Enzyme One inputted.  Please retry with an Enzyme One in the appropriate location.  See -h for command line help."
         ),
        (restrictionEnzyme2,
         "No Enzyme Two inputted.  Please retry with an Enzyme Two in the appropriate location.  See -h for command line help."
         ),
    )
    for value, message in requiredInputs:
        if value is None:
            print(message)
            sys.exit()

    ###################################################################################################

    fastA = sequenceAnalysis.FastAreader(
        targetFile)  #Read from the file collected by CommandLine class

    #Ideally there should be only one fastA record per run; with several, the last one wins.
    createdPrimer = None
    for head, seq in fastA.readFasta():
        createdPrimer = primerDesign.primerDesign(head, seq,
                                                  restrictionEnzyme1,
                                                  restrictionEnzyme2,
                                                  startCodon, stopCodon)

    #Without any record there is nothing to design a primer for.
    if createdPrimer is None:
        print(
            "No FastA record found in the Target Sequence file.  Please retry with a proper input file.  See -h for command line help."
        )
        sys.exit()

    nucForward = sequenceAnalysis.NucParams(str(createdPrimer.forwardPrimer))
    nucReverse = sequenceAnalysis.NucParams(str(createdPrimer.reversePrimer))

    # NOTE(review): these are G+C counts divided by the nucleotide count, yet
    # the report labels them "%" - confirm whether a factor of 100 is missing.
    gcForward = (nucForward.nucComposit["G"] +
                 nucForward.nucComposit["C"]) / nucForward.nucCount()
    gcReverse = (nucReverse.nucComposit["G"] +
                 nucReverse.nucComposit["C"]) / nucReverse.nucCount()

    #Per-enzyme details; indices 0-2 feed the "nucleotides added" line, 3-4 the buffer line.
    enzymeInfo1 = createdPrimer.restrictionEnzymeDict[createdPrimer.enzyme1]
    enzymeInfo2 = createdPrimer.restrictionEnzymeDict[createdPrimer.enzyme2]

    addedLine = "'{0}' nucleotides were added to give {1} efficiency after {2} hours.\n"

    ###################################################################################################
    #Printing Section
    #Print either to an output file or std out depending upon verbosity condition

    if verb is True:  #Verbosity mode output.  If enabled, writes to a file instead of std out.
        with open("PrimerOut.txt", "w") as p:
            p.write("#" * markerNumber + "\n\n")
            p.write(createdPrimer.header + "\n\n")
            p.write("Forward Primer\n")
            p.write(createdPrimer.finalFwdPrimer + "\n")
            p.write("\n")
            p.write(
                addedLine.format(enzymeInfo1[0], enzymeInfo1[1],
                                 enzymeInfo1[2]))
            p.write("Buffer {} for digestion at {} Degrees.\n\n".format(
                enzymeInfo1[3], enzymeInfo1[4]))
            p.write("Melting Temperature = " + str(createdPrimer.tempOfFwd) +
                    degree + "C" + "\n")
            p.write("\n")
            p.write("{0:.4f}".format(gcForward) + " % GC Content\n")
            p.write("\n")
            p.write("Reverse Primer\n")
            p.write(createdPrimer.finalRevPrimer + "\n")
            p.write("\n")
            p.write(
                addedLine.format(enzymeInfo2[0], enzymeInfo2[1],
                                 enzymeInfo2[2]))
            p.write("Buffer {} for digestion at {} Degrees.\n\n".format(
                enzymeInfo2[3], enzymeInfo2[4]))
            p.write("Melting Temperature = " + str(createdPrimer.tempOfRev) +
                    degree + "C" + "\n")
            p.write("\n")
            p.write("{0:.4f}".format(gcReverse) + " % GC Content\n")
            p.write("#" * markerNumber)

    else:  #Print to std out instead of to a file
        # NOTE(review): unlike the file output, the melting temperature lines
        # below omit the degree symbol - kept as in the original.
        print("#" * markerNumber)
        print(createdPrimer.header)
        print()
        print("Forward Primer")
        print(createdPrimer.finalFwdPrimer)
        print()
        print(
            addedLine.format(enzymeInfo1[0], enzymeInfo1[1], enzymeInfo1[2]))
        print("Buffer {} for digestion at {} Degrees.\n".format(
            enzymeInfo1[3], enzymeInfo1[4]))
        print("Melting Temperature = " + str(createdPrimer.tempOfFwd) + "C")
        print()
        print("{0:.4f}".format(gcForward) + " % GC Content\n")
        print()
        print("Reverse Primer")
        print(createdPrimer.finalRevPrimer)
        print()
        print(
            addedLine.format(enzymeInfo2[0], enzymeInfo2[1], enzymeInfo2[2]))
        print("Buffer {} for digestion at {} Degrees.\n".format(
            enzymeInfo2[3], enzymeInfo2[4]))
        print()
        print("Melting Temperature = " + str(createdPrimer.tempOfRev) + "C")
        print()
        print("{0:.4f}".format(gcReverse) + " % GC Content\n")
        print()
        print("#" * markerNumber)
Exemplo n.º 14
0
 def __init__(self):
     """
     Create the FASTA reader and the nucleotide analyzer for this instance.

     ``self.fastaReader`` is a ``sequenceAnalysis.FastAreader`` and
     ``self.analyzer`` is a ``sequenceAnalysis.NucParams``; both are built
     without arguments, reader first.
     """
     self.fastaReader, self.analyzer = (sequenceAnalysis.FastAreader(),
                                        sequenceAnalysis.NucParams())