def read_te_fasta(self, fasta_file):
    """Read every record of a FASTA file into two parallel, 1-based dicts.

    Args:
        fasta_file: handed unchanged to ``FastaReader``.

    Returns:
        A tuple ``(ids, seqs)``. ``ids[i]`` is the identifier and ``seqs[i]``
        the sequence of the i-th record yielded by the reader, with ``i``
        counting from 1. Both dicts are empty when the reader yields nothing.
    """
    from fastaIO import FastaReader

    seq_dict1 = {}
    seq_dict2 = {}
    handle = FastaReader(fasta_file)
    try:
        for ct, (seq_id, seq) in enumerate(handle, 1):
            seq_dict1[ct] = seq_id
            seq_dict2[ct] = seq
    finally:
        # Release the reader even if a record fails while being read.
        handle.close()
    return seq_dict1, seq_dict2
def getPopGenomeStats(inputFile):
    """Return the mean sequence length and the number of records in a FASTA file.

    Args:
        inputFile: handed unchanged to ``FastaReader``.

    Returns:
        A tuple ``(averagegenomesize, popgenomesamples)``: the average of
        ``len(seq)`` over all records as a float, and the record count.

    Raises:
        ZeroDivisionError: if the reader yields no records.
    """
    # Only the lengths are needed, so the sequences themselves are not kept.
    lengths = [len(seq) for _head, seq in FastaReader(inputFile)]
    popgenomesamples = len(lengths)
    averagegenomesize = float(sum(lengths)) / float(popgenomesamples)
    return averagegenomesize, popgenomesamples
def read_chesis_fasta(self, fasta_file):
    """Read a FASTA file and return its last record.

    Every record is iterated; each one overwrites the previous, so for a
    multi-record file only the final record is reported.

    Args:
        fasta_file: handed unchanged to ``FastaReader``.

    Returns:
        A tuple ``(seq_id, seq_length, chr_seq)`` for the last record, or
        ``("", 0, "")`` when the reader yields nothing.
    """
    from fastaIO import FastaReader

    seq_id = ""
    seq_length = 0
    chr_seq = ""
    handle = FastaReader(fasta_file)
    try:
        for seq_id, seq in handle:
            seq_length = len(seq)
            chr_seq = seq
    finally:
        # Release the reader even if a record fails while being read.
        handle.close()
    return seq_id, seq_length, chr_seq
def load_chasis(inputFile):
    """Load the one and only record of a FASTA file.

    Args:
        inputFile: handed unchanged to ``FastaReader``.

    Returns:
        A tuple ``(header, sequence)`` of the single record, or
        ``(None, None)`` when the reader yields nothing.

    Raises:
        ValueError: as soon as a second record is encountered.
    """
    header = None
    sequence = None
    records_seen = 0
    for header, sequence in FastaReader(inputFile):
        records_seen += 1
        if records_seen >= 2:
            raise ValueError(
                "Chasis file must only contain a single reference genome")
    return header, sequence
end=te.end seq=maskSeq(seq,start,end) # mask the TE with Ns novelrefseq[chr]=seq return novelrefseq def printSequences(seq,outfasta): fw=FastaWriter(outfasta,60) for n,s in seq.items(): fw.write(n,s) fw.close() parser = OptionParser() parser.add_option("--gtf",dest="gtfte",help="A gtf file containing the TE annotation") parser.add_option("--input",dest="fastaref",help="A fasta file containing the reference sequence") parser.add_option("--output",dest="outfasta",help="The output of the fasta sequences"), (options, args) = parser.parse_args() print("Loading refseqs..") refseqs = FastaReader.readFastaHash(options.fastaref) print("Loading gtf..") noveltegtf= GTFTEReader.readall(options.gtfte) print("Masking reference sequence..") novelrefseq=maskTEsinSeq(noveltegtf,refseqs) print("Printing masked reference sequence..") printSequences(novelrefseq,options.outfasta)
return novelrefseq def printSequences(seq, outfasta): fw = FastaWriter(outfasta, 60) for n, s in seq.items(): fw.write(n, s) fw.close() parser = OptionParser() parser.add_option("--gtf", dest="gtfte", help="A gtf file containing the TE annotation") parser.add_option("--input", dest="fastaref", help="A fasta file containing the reference sequence") parser.add_option("--output", dest="outfasta", help="The output of the fasta sequences"), (options, args) = parser.parse_args() print("Loading refseqs..") refseqs = FastaReader.readFastaHash(options.fastaref) print("Loading gtf..") noveltegtf = GTFTEReader.readall(options.gtfte) print("Masking reference sequence..") novelrefseq = maskTEsinSeq(noveltegtf, refseqs) print("Printing masked reference sequence..") printSequences(novelrefseq, options.outfasta)
parser.add_argument("--chassis", type=str, required=False, dest="ref_fasta", default=None, help="the chassis, i.e. the sequence into which TEs will be inserted; a fasta file") parser.add_argument("--te-seqs", type=str, required=False, dest="te_fasta", default=None, help="TE sequences in a fasta file") parser.add_argument("--pgd", type=str, required=True, dest="pgd_definition", default=None, help="the definition of the population genome") parser.add_argument("--output", type=str, required=True, dest="output", default=None, help="the output file; will be multi-fasta file") args = parser.parse_args() # read TE sequences from file; if provided tetuples=[] if args.te_fasta is not None: tmp=FastaReader.readAllTuples(args.te_fasta) tetuples=[t[1] for t in tmp] print "Loading TE sequences; Found {0} in file {1}".format(len(tetuples),args.te_fasta) sc=SequenceContainer(tetuples) # read the PGD; must be provided print "Loading population genome defintion" pgdr=PopGenDefinitionReader(args.pgd_definition,sc) tedeftuples=pgdr.read_transposed() print "Found {0} TE defintions".format(sc.get_count_definitions()) print "Will simulate {0} TE insertion sites within a population having {1} haploid genomes".format(pgdr.insertions, pgdr.popsize) # load chasis from the file; if provided otherwise from the PGD; not both though chasis="" if args.ref_fasta is not None: if pgdr.get_chasis() !="":
# NOTE(review): `parser` and `read_famtoentry` are defined above this view.
(options, args) = parser.parse_args()

# Fixed order in which the TE families are reported.
teorder = [
    "1360", "17.6", "1731", "297", "3S18", "412", "accord", "accord2",
    "aurora-element", "baggins", "Bari1", "Bari2", "blood", "BS", "BS3",
    "BS4", "Burdock", "Circe", "copia", "Cr1a", "diver", "diver2", "Dm88",
    "Doc", "Doc2-element", "Doc3-element", "Doc4-element", "F-element", "FB",
    "flea", "frogger", "Fw2", "Fw3", "G-element", "G2", "G3", "G4", "G5",
    "G5A", "G6", "G7", "GATE", "gtwin", "gypsy", "gypsy10", "gypsy11",
    "gypsy12", "gypsy2", "gypsy3", "gypsy4", "gypsy5", "gypsy6", "gypsy7",
    "gypsy8", "gypsy9", "HB", "Helena", "HeT-A", "HMS-Beagle", "HMS-Beagle2",
    "hobo", "hopper", "hopper2", "I-element", "Idefix", "INE-1", "invader1",
    "invader2", "invader3", "invader4", "invader5", "invader6", "Ivk",
    "jockey", "jockey2", "Juan", "looper1", "Mariner", "mariner2",
    "Max-element", "McClintock", "mdg1", "mdg3", "micropia", "NOF", "opus",
    "Osvaldo", "P-element", "pogo", "Porto1", "Q-element", "Quasimodo",
    "R1-2", "R1A1-element", "R2-element", "roo", "rooA", "rover", "Rt1a",
    "Rt1b", "Rt1c", "S-element", "S2", "springer", "Stalker", "Stalker2",
    "Stalker3", "Stalker4", "Tabor", "TAHRE", "Tc1", "Tc1-2", "Tc3", "Tirant",
    "Tom1", "transib1", "transib2", "transib3", "transib4", "Transpac",
    "X-element", "ZAM",
]

print("Loading refseqs..")
refseqs = FastaReader.readFastaHash(options.teseqs)
f2e = read_famtoentry(options.hier)

# One tab-separated "family<TAB>length" line per family, in `teorder` order.
# A family missing from either lookup raises KeyError, as before.
for fam in teorder:
    entry = f2e[fam]
    seq = refseqs[entry]
    # Parenthesised single-argument print: same output under Python 2,
    # and also valid under Python 3.
    print("{0}\t{1}".format(fam, len(seq)))
a=l.split("\t") entry=a[0] fam=a[2] ord=a[4] fto[fam]=entry return fto parser = OptionParser() parser.add_option("--input",dest="teseqs",help="The TE seqs") parser.add_option("--hier",dest="hier",help="the te hierarchy") (options, args) = parser.parse_args() teorder=["1360","17.6","1731","297","3S18","412","accord","accord2","aurora-element","baggins","Bari1","Bari2","blood","BS","BS3","BS4","Burdock","Circe","copia","Cr1a","diver","diver2","Dm88","Doc","Doc2-element","Doc3-element","Doc4-element", "F-element","FB","flea","frogger","Fw2","Fw3","G-element","G2","G3","G4","G5","G5A","G6","G7","GATE","gtwin","gypsy","gypsy10","gypsy11","gypsy12","gypsy2","gypsy3","gypsy4","gypsy5", "gypsy6","gypsy7","gypsy8","gypsy9","HB","Helena","HeT-A","HMS-Beagle","HMS-Beagle2","hobo","hopper","hopper2","I-element","Idefix","INE-1","invader1","invader2","invader3","invader4", "invader5","invader6","Ivk","jockey","jockey2","Juan","looper1","Mariner","mariner2","Max-element","McClintock","mdg1","mdg3","micropia","NOF","opus","Osvaldo","P-element","pogo", "Porto1","Q-element","Quasimodo","R1-2","R1A1-element","R2-element","roo","rooA","rover","Rt1a","Rt1b","Rt1c","S-element","S2","springer","Stalker","Stalker2","Stalker3","Stalker4", "Tabor","TAHRE","Tc1","Tc1-2","Tc3","Tirant","Tom1","transib1","transib2","transib3","transib4","Transpac","X-element","ZAM"] print("Loading refseqs..") refseqs = FastaReader.readFastaHash(options.teseqs) f2e=read_famtoentry(options.hier) for fam in teorder: entry=f2e[fam] seq=refseqs[entry] l=len(seq) print "{0}\t{1}".format(fam,l)