Exemplo n.º 1
0
class Pyro:
    """Thin wrapper around a ``FastaReader`` opened on an alignment file.

    Attributes set by ``__init__``:
        name: label later passed to ``DF`` by ``make_DF``.
        rename_dups: flag forwarded to ``FastaReader``.
        fasta_filename: path forwarded to ``FastaReader``.
        fasta_reader: the ``FastaReader`` built from the two values above.
        aln_length: alignment length (given explicitly or guessed).
        nseq: number of keys the reader reports.
    """

    def __init__(self, name, fasta_filename, aln_length=None, rename_dups=False):
        """
        If aln_length is not given then it is *guessed* by looking at the
        alignment in <fasta_filename>: the sequence length of the first key
        yielded by the reader is used.
        """
        self.name = name
        self.rename_dups = rename_dups
        self.fasta_filename = fasta_filename
        self.fasta_reader = FastaReader(self.fasta_filename, self.rename_dups)
        if aln_length is None:
            # Builtin next() drives both Python 2 (.next) and Python 3
            # (__next__) iterators; the .next() method exists on Python 2 only.
            key1 = next(self.fasta_reader.iterkeys())
            self.aln_length = len(self.fasta_reader[key1].seq)
        else:
            self.aln_length = aln_length
        self.nseq = len(self.fasta_reader.keys())

    def __getitem__(self, key):
        """Return the reader's record stored under ``key``."""
        return self.fasta_reader[key]

    def keys(self):
        """Return the keys reported by the underlying reader."""
        return self.fasta_reader.keys()

    def make_DF(self):
        """Build a ``DF`` holding, per nucleotide, a count of 1 at every position
        where that nucleotide occurs in each record ("U" is treated as "T").

        Returns the populated ``DF`` created from ``self.name`` and
        ``self.aln_length``.
        """
        df = DF(self.name, self.aln_length)
        for seq_id in self.fasta_reader.iterkeys():
            record = self.fasta_reader[seq_id]
            # Convert once per record: the string does not depend on the
            # nucleotide being searched for.
            seq = record.seq.tostring().replace("U", "T")
            for nt in DF.nucleotides():
                # TODO: make find_all_indices iterative to be mem-efficient
                positions = find_all_indices(seq, nt)
                df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
        return df
Exemplo n.º 2
0
    def inputFile(self, fname):
        """Load a FASTA file and fill the sequence, mutation and header fields.

        Every (header, sequence) pair produced by
        ``FastaReader(fname).readFasta()`` is collected. The first sequence
        becomes ``self.seqt0`` and the last one ``self.seqtf``. After
        ``self.findMutations()`` runs and ``self.possibleMutations`` is set
        from ``findAllPossibleMutations(self.seqt0)``, the header of the last
        record is scanned character by character to fill ``self.uniqueID``
        and ``self.drugsGiven``.

        Raises:
            IndexError: if the reader yields no records.
        """
        self.fname = fname
        self.uniqueID = ''
        self.drugsGiven = []
        reader = FastaReader(fname)

        mutationList = [(header, seq) for header, seq in reader.readFasta()]

        self.seqt0 = mutationList[0][1]
        self.seqtf = mutationList[-1][1]
        # NOTE(review): a comment here used to read "Shaves '>'", but nothing
        # in this method strips a leading '>' -- presumably readFasta() already
        # does; confirm.

        self.findMutations()
        self.possibleMutations = findAllPossibleMutations(self.seqt0)

        # Parse the header of the last record and pull out the relevant fields.
        # (Iterate finalHeader explicitly rather than relying on the loop
        # variable left over from reading the records.)
        finalHeader = mutationList[-1][0]
        readHeader = True
        firstUnderScore = True
        builtStr = ''
        readDrugs = False
        for char in finalHeader:
            if readHeader:
                # Phase 1: the unique ID is everything up to (not including)
                # the second underscore; the first underscore is kept.
                if char == '_':
                    if firstUnderScore:
                        builtStr += char
                        firstUnderScore = False
                    else:
                        readHeader = False
                        self.uniqueID = builtStr
                else:
                    builtStr += char

            elif readDrugs:
                # Phase 3: each '_' ends one drug name.
                # NOTE(review): a final name not followed by '_' is never
                # appended -- confirm the header always ends with '_'.
                if char == '_':
                    self.drugsGiven.append(builtStr)
                    builtStr = ''
                elif builtStr == 'None':
                    # Accumulated name is exactly 'None': stop parsing.
                    break

                else:
                    builtStr += char

            elif builtStr == '__':
                # Phase 2 -> 3: two consecutive underscores were seen; the
                # current character starts the first drug name.
                readDrugs = True
                builtStr = ''
                builtStr += char

            elif char != '_':
                # Phase 2: discard everything that is not part of an
                # underscore run (this also clears the leftover ID text).
                builtStr = ''
            else:
                builtStr += char
Exemplo n.º 3
0
    def load_sg_seq(self, fasta_fn):
        """Store the sequence of every "G"-type edge in ``self.sg_edge_seqs``.

        ``fasta_fn`` is handed to ``FastaReader``; only records whose name is
        the part before ":" of a node on some "G" edge are kept (upper-cased).
        For each "G" edge the ``(seq_id, s, t)`` triple stored first in the
        edge data selects the slice ``s:t`` of that read, or, when ``s`` is
        not smaller than ``t``, the reversed slice ``t:s`` mapped through
        ``RCMAP``.
        """
        # Ids of reads that appear on at least one "G" edge.
        wanted_ids = set()
        for src, dst in self.sg_edges:
            if self.sg_edges[(src, dst)][-1] == "G":
                wanted_ids.update((src.split(":")[0], dst.split(":")[0]))

        # Keep in memory only the sequences of those reads.
        seqs = {
            rec.name: rec.sequence.upper()
            for rec in FastaReader(fasta_fn)
            if rec.name in wanted_ids
        }

        for src, dst in self.sg_edges:
            edge = self.sg_edges[(src, dst)]
            seq_id, start, end = edge[0]
            if edge[-1] != "G":
                continue

            read_seq = seqs[seq_id]
            if start < end:
                edge_seq = read_seq[start:end]
            else:
                # Walk the slice backwards, mapping each base through RCMAP.
                edge_seq = "".join(RCMAP[base] for base in reversed(read_seq[end:start]))
            self.sg_edge_seqs[(src, dst)] = edge_seq
Exemplo n.º 4
0
    def __init__(self, name, fasta_filename, aln_length=None, rename_dups=False):
        """
        Open <fasta_filename> with FastaReader and record basic alignment info.

        If aln_length is not given then it is *guessed* by looking at the
        alignment in <fasta_filename>: the sequence length of the first key
        yielded by the reader is used.
        """
        self.name = name
        self.rename_dups = rename_dups
        self.fasta_filename = fasta_filename
        self.fasta_reader = FastaReader(self.fasta_filename, self.rename_dups)
        if aln_length is None:
            # Builtin next() drives both Python 2 (.next) and Python 3
            # (__next__) iterators; the .next() method exists on Python 2 only.
            key1 = next(self.fasta_reader.iterkeys())
            self.aln_length = len(self.fasta_reader[key1].seq)
        else:
            self.aln_length = aln_length
        # Number of keys the reader reports.
        self.nseq = len(self.fasta_reader.keys())
Exemplo n.º 5
0
def main():
    """Command-line driver: substring count and most frequent k-mer of a FASTA file.

    Command-line arguments:
        sys.argv[1]: path of the FASTA file to read.
        sys.argv[2]: string whose occurrences are counted (problem 1).
        sys.argv[3]: subsequence length, parsed with ``int`` (problem 2).

    Prints one line per problem and writes every subsequence whose count is
    at least 10 to ``Output/substringCountLarger10.csv`` under the current
    working directory. Exits with a usage message when arguments are missing.
    """
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <fasta file> <search string> <subsequence length>")

    fastaPath = os.path.abspath(sys.argv[1])  # absolute path of the input file
    fastaReader = FastaReader()
    fastaReader.readFastaFile(fastaPath)     # read file

    # PROBLEM 1. (Detail Algorithm implemented in Sequence.py (subStringSearch() function))
    searchString = sys.argv[2]
    numOfSubstring = fastaReader.numberOfSubstring(searchString)
    print(f"Problem 1: String, {searchString}, appears {numOfSubstring} times in file {fastaReader.getFileName()}")

    # PROBLEM 2. (Detail Algorithm implemented in FastaReader.py ())
    lengthOfString = int(sys.argv[3])
    highestFreqString, appearTimes, subseqCounter = fastaReader.findHighestOccurrence(lengthOfString)
    print(f"Problem 2: {lengthOfString}-mer subsequence, {highestFreqString}, has highest occurrences, {appearTimes} times, in file {fastaReader.getFileName()}")

    # Output the subsequences counted at least 10 times to
    # Output/substringCountLarger10.csv. Create the directory first so the
    # run does not fail after all the work has been done.
    outputDir = os.path.join(os.path.abspath("./"), "Output")
    os.makedirs(outputDir, exist_ok=True)
    with open(os.path.join(outputDir, "substringCountLarger10.csv"), 'w') as outputFile:
        # column names
        outputFile.write("Subsequence,Counts\n")

        for subseq, counter in subseqCounter.items():
            if counter >= 10:
                outputFile.write(f"{subseq},{counter}\n")
Exemplo n.º 6
0
def sample(chr):
    """Return the length, in codons, of one randomly sampled open reading frame.

    A random start position is drawn, the sequence is scanned forward in
    steps of 3 for the first "ATG", and codons are then counted from that
    "ATG" until a codon present in ``stops`` is met or the scan reaches the
    end of the sequence.

    Returns:
        The number of codons counted, or -1 when no "ATG" is found or a
        codon containing 'N' is encountered.
    """
    L = len(chr)
    # The start is drawn so that 100000 positions remain after it. For
    # shorter sequences the bound is clamped to 0; a negative bound would
    # yield a negative start and slices taken from the end of the sequence.
    pos = int(random.uniform(0, max(0, L - 100000)))
    # Pre-set so the check below is defined even if the scan loop never runs.
    codon = ""
    # NOTE(review): `pos < L - 3` skips the final full codon (pos == L - 3);
    # kept as in the original -- confirm whether that is intended.
    while pos < L - 3:
        codon = chr[pos:pos + 3]
        if codon == "ATG":
            break
        pos += 3
    if codon != "ATG":
        return -1
    codons = 0
    while pos < L - 3:
        codon = chr[pos:pos + 3]
        if 'N' in codon:
            return -1
        if stops.get(codon, False):
            break
        codons += 1
        pos += 3
    return codons

#============================ main() ==============================
# For every sequence in the file, draw NUM_ORFS samples and print the
# non-negative ORF lengths, flushing after each one.
reader = FastaReader(filename)
defline, seq = reader.nextSequence()
while defline:
    for _ in range(NUM_ORFS):
        orf_length = sample(seq)
        if orf_length >= 0:
            print(orf_length, flush=True)
    defline, seq = reader.nextSequence()
reader.close()



Exemplo n.º 7
0
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from GffTranscriptReader import GffTranscriptReader

# Usage check: exactly three arguments are required.
if len(sys.argv) != 4:
    sys.exit(sys.argv[0] + " <in.fasta> <in.gff> <out.fasta>")
(fastaFile, gffFile, outFile) = sys.argv[1:]

# Collect the substrate of every transcript whose ID starts with "ALT".
gffReader = GffTranscriptReader()
transcripts = gffReader.loadGFF(gffFile)
keep = set()
for transcript in transcripts:
    if transcript.getID()[:3] != "ALT":
        continue
    keep.add(transcript.getSubstrate())

# Copy to the output only the FASTA records whose ID is in that set. The
# context manager closes the output file even if reading or writing fails.
reader = FastaReader(fastaFile)
writer = FastaWriter()
with open(outFile, "wt") as fh:
    while True:
        (defline, seq) = reader.nextSequence()
        if not defline:
            break
        (seqID, attr) = FastaReader.parseDefline(defline)
        if seqID not in keep:
            continue
        writer.addToFasta(defline, seq, fh)
print("[done]", file=sys.stderr)

Exemplo n.º 8
0
                        with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii, chr,
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
import ProgramName
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from Rex import Rex

rex = Rex()

#=========================================================================
# main()
#=========================================================================
# Usage check: exactly two arguments are required.
if len(sys.argv) != 3:
    sys.exit(ProgramName.get() + " <in.fasta> <out.fasta>\n")
(infile, outfile) = sys.argv[1:]

# Copy to the output only the records whose defline matches ">chr". The
# context manager closes the output file even if reading or writing fails.
with open(outfile, "wt") as OUT:
    writer = FastaWriter()
    reader = FastaReader(infile)
    while True:
        (defline, seq) = reader.nextSequence()
        if not defline:
            break
        if not rex.find(">chr", defline):
            continue
        writer.addToFasta(defline, seq, OUT)
Exemplo n.º 9
0
            index_opt=None,
            index_outdir = "./bwa_index/")
    """
    Main function of RefMasker that integrate database creation, blast and homology masking
    * Instantiate Blast database and blastn object
    * Perform iterative blasts of query sequences against the subject database and create a list of
    hits.
    """

    # Try to validate a index from an existing one
    try:
        if not index_path:
            raise Exception("No index provided. An index will be generated")

        print("Existing index provided")
        FastaRef = FastaReader(ref1_path, ref2_path, write_merge=False)
        Index = ExistingIndex(bwa_path, index_path)

    # If no index or if an error occured during validation of the existing index = create a new one
    except Exception as E:
        print (E)

        print("Merge References...")
        mkdir(ref_outdir)

        FastaRef = FastaReader([ref1_path,ref2_path], write_merge=True, output="merged.fa")

        print("Generating index...")
        mkdir(db_outdir)
        Index = GenerateIndex(bwa_path, FastaRef.merge_ref, index_opt)
        remove (FastaRef.merge_ref)
Exemplo n.º 10
0
from __future__ import print_function
from FastaReader import FastaReader

# Re-emit every record of tmp.fa under a ">ccs/<index>/0_<length>" header,
# wrapping the sequence at 60 characters per line.
f = FastaReader("tmp.fa")
count = 0
for r in f:
    rlen = len(r.sequence)
    print(">ccs/{}/{}_{}".format(count, 0, rlen))
    # range() exists on both Python 2 and 3; xrange() is Python 2 only.
    for s in range(0, rlen, 60):
        print(r.sequence[s:s + 60])
    count += 1
Exemplo n.º 11
0
from FastaWriter import FastaWriter
from Translation import Translation

#filename="/home/bmajoros/1000G/assembly/combined/HG00096/1.fasta"

#reader=FastaReader("/home/bmajoros/1000G/assembly/combined/HG00096/1.fasta")
#while(True):
#    [defline,seq]=reader.nextSequence()
#    if(not defline): break
#    print("defline="+defline);
#    L=len(seq)
#    print("length="+str(L))

#filename="/home/bmajoros/1000G/assembly/BRCA1-NA19782.fasta";
filename = "/Users/bmajoros/python/test/data/subset.fasta"
# Size reported by FastaReader for the file.
print(FastaReader.getSize(filename))

# Length of the first sequence.
[defline, seq] = FastaReader.firstSequence(filename)
print(len(seq))

#filename="/home/bmajoros/1000G/assembly/test.fasta"
filename = "/Users/bmajoros/python/test/data/subset.fasta"
# Load everything, then print each defline, its parsed ID and attributes.
# Names are chosen so the builtins hash() and id() are not shadowed and the
# inner loop does not overwrite the outer loop's key.
records = FastaReader.readAllAndKeepDefs(filename)
for key in records.keys():
    [defline, seq] = records[key]
    print(defline)
    [seqID, attrs] = FastaReader.parseDefline(defline)
    print("id=" + seqID)
    for attrName, attrValue in attrs.items():
        print(attrName + "=" + attrValue)