if args.reference: sys.stderr.write("Parsing reference. This could take a while...\n") try: with open(args.reference + ".fai", "r") as fai: scafs_lengths = [line.split()[:2] for line in fai] except: sys.stderr.write( "WARNING: Could not parse fai file, vcf header will not contain contig entries...\n" ) scafs_lengths = None with gzip.open(args.reference, "r") if args.reference.endswith(".gz") else open( args.reference, "r") as ref: refDict = dict(zip(*genomics.parseFasta(ref.read()))) else: refDict = None ######################################################################################### genoFileReader = genomics.GenoFileReader(genoFile) allNames = genoFileReader.names if not args.samples: namesToUse = allNames else: namesToUse = args.samples.split(",") outFile.write("##fileformat=VCFv4.2\n") if refDict:
# Overview of the statements on the following line (in order of appearance):
#   * a commented-out parse_args call with example arguments, kept for debugging;
#   * inFormat / outFormat are set to "phylip" or "fasta" from args.phylipIn / args.phylipOut;
#   * l holds args.lineLen;
#   * all of stdin is read and parsed with genomics.parseFasta or genomics.parsePhylip
#     depending on inFormat, giving names and seqs;
#   * with args.truncateNames, each name is cut to its first whitespace-delimited token;
#   * regions is built from args.regions via parseRegionText, then extended with one
#     parseRegionList(line.split()) entry per line of args.regionsFile when given;
#   * when at least one region exists, outNames / outSeqs are initialised and a loop
#     over (seqName, start, end, ori) tuples begins.
# NOTE(review): these statements sit on a single physical line that starts with "#",
# so line breaks appear to have been lost -- restore them before running. The body of
# the final "for" loop is not part of this excerpt.
#args = parser.parse_args("-n 5 -t test.trees -o test.topos.txt -w test.weights.B.csv -g A a,b,c -g B d,e,f -g C g,h,i -g D j,k,l".split()) if args.phylipIn: inFormat = "phylip" else: inFormat = "fasta" if args.phylipOut: outFormat = "phylip" else: outFormat = "fasta" l = args.lineLen ######################################################################## allText = sys.stdin.read() if inFormat == "fasta": names, seqs = genomics.parseFasta(allText) else: names, seqs = genomics.parsePhylip(allText) if args.truncateNames: names = [name.split()[0] for name in names] regions = [parseRegionText(r) for r in args.regions] if args.regions else [] if args.regionsFile: with open(args.regionsFile, "r") as rf: for line in rf: regions.append(parseRegionList(line.split())) #only filter and chop sequences if necessary if len(regions) >= 1: outNames = [] outSeqs = [] for seqName,start,end,ori in regions:
if args.genoFile: if args.genoFile[-3:] == ".gz": genoFile = gzip.open(args.genoFile, "w") else: genoFile = open(args.genoFile, "w") else: genoFile = sys.stdout if args.randomPhase: import random ############################# #read sequence file seqString = seqFile.read() #parse if args.format == "fasta": seqNames, seqs = genomics.parseFasta(seqString) multi = False elif args.format == "phylip": #with phylip its possible to have multiple alignments, so we need to check if thats the case pieces = genomics.parsePhylip(seqString) if type(pieces) == tuple: seqNames, seqs = pieces multi = False else: _seqNames_, _seqs_ = zip(*pieces) multi = True if not multi: #if there is a single set of sequences we parse it and output either as contigs or individuals #sequences to keep
args = parser.parse_args()

################################################################################

# Gene annotation: read every line (transparently handling .gz input) and
# hand them to the annotation parser in the requested format.
sys.stderr.write("Parsing annotation\n")

openAnnotation = gzip.open if args.annotation.endswith(".gz") else open
with openAnnotation(args.annotation, "rt") as ann:
    annotationLines = ann.readlines()
    geneData = genomics.parseGenes(annotationLines, fmt=args.format)

# Reference genome: parse the whole FASTA (upper-cased) and index the
# sequences by scaffold name.
sys.stderr.write("Loading reference genome\n")

openRef = gzip.open if args.ref.endswith(".gz") else open
with openRef(args.ref, "rt") as ref:
    scaffolds, _sequences_ = genomics.parseFasta(ref.read(), makeUppercase=True)

sequences = {scaffold: _sequences_[i] for i, scaffold in enumerate(scaffolds)}

# Output: a named file (gzip-compressed when it ends in ".gz") or stdout.
if args.outFile:
    openOut = gzip.open if args.outFile.endswith(".gz") else open
    outFile = openOut(args.outFile, "wt")
else:
    outFile = sys.stdout

# Tab-separated column header for the per-site table.
headerColumns = [
    "scaffold",
    "position",
    "codon_position",
    "substitution_type",
    "degeneracy",
]
outFile.write("\t".join(headerColumns) + "\n")