def main():
    """Build an hsx index for one or more fasta files.

    Arguments are read from sys.argv.  Each fasta file (or stdin, when no
    file is named) is scanned with fasta_sequences(); every sequence is
    assigned to a hash bucket using HsxFile.hash(); then the header, file
    table, file info, hash table and sequence table are emitted, in that
    order, through the module's write helpers (write1/write4/write5/write6,
    writeString, writeZeros).

    Recognized arguments:
      --bucketsize=<n>   average sequences per bucket (default 10)
      --numbuckets=<n>   explicit bucket count (overrides --bucketsize)
      --anonymous        do not store the fasta file's base name
      --skipheader       index the sequence-data offset, not the header offset
      --windows          passed to fasta_sequences() as twoByteLFs
      --bigendian        use the big-endian write helpers
      --oddbuckets       make a computed bucket count odd
      --keepempties      index zero-length sequences instead of dropping them
      --screwup=<what>   deliberately corrupt the output ("hashpad")
      --debug[=<what>]   extra stderr output ("file", "info", "progress")
      --progress[=<n>]   progress messages, optionally every n items
      --help, -h         show usage
      <name>.fa|.fasta   input file(s), at most 255

    Raises:
        AssertionError: for an invalid option value, a bad or repeated file
            name, an unreadable file, a repeated sequence name, or input
            that contains no sequences.
    """
    global write4, write5, write6

    def note(message):
        # diagnostics go to stderr; a plain write() behaves the same on
        # python 2 and python 3, unlike the "print >>" statement
        sys.stderr.write(message + "\n")

    ##########
    # parse the command line
    ##########

    avgBucket = 10
    numBuckets = None
    anonymous = False
    skipHeader = False
    isWindows = False
    fileNames = []
    bigEndian = False
    oddBuckets = False
    keepEmpties = False
    screwup = []  # (so that we can verify that validation works!)
    debug = []
    progress = None

    for arg in sys.argv[1:]:
        val = None
        if "=" in arg:
            (arg, val) = arg.split("=", 1)
            if val == "":
                usage("missing a value in %s=" % arg)

        # errors are raised explicitly (not via "assert") so that the checks
        # are still performed when python runs with -O
        if (arg in ("--help", "-h", "--h", "-help")) and (val is None):
            usage()
        elif (arg == "--bucketsize") and (val is not None):
            try:
                avgBucket = int(val)
                if avgBucket < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid bucket size: %s" % val)
        elif (arg == "--numbuckets") and (val is not None):
            try:
                numBuckets = int(val)
                if numBuckets < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid number of buckets: %s" % val)
        elif (arg == "--secondary") and (val is None):
            raise AssertionError("secondary hash is not implemented yet (sorry)")
        elif (arg == "--anonymous") and (val is None):
            anonymous = True
        elif (arg == "--skipheader") and (val is None):
            skipHeader = True
        elif (arg == "--windows") and (val is None):
            isWindows = True
        elif (arg == "--bigendian") and (val is None):
            bigEndian = True
        elif (arg == "--oddbuckets") and (val is None):
            oddBuckets = True
        elif (arg == "--keepempties") and (val is None):
            keepEmpties = True
        elif (arg == "--screwup") and (val is not None):
            screwup.append(val)
        elif (arg == "--debug") and (val is None):
            debug.append("debug")
        elif (arg == "--debug") and (val is not None):
            debug.append(val)
        elif (arg == "--progress") and (val is None):
            debug.append("progress")
            progress = None
        elif (arg == "--progress") and (val is not None):
            debug.append("progress")
            # the interval is used as a modulus below, so it must be a
            # positive integer
            try:
                progress = int(val)
                if progress < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid progress interval: %s" % val)
        elif arg.startswith("--"):
            usage("unknown argument: %s" % arg)
        elif val is None:
            fileNames.append(arg)
        else:
            usage("unknown argument: %s" % arg)

    # sanity check on file names

    for fileName in fileNames:
        slash = fileName.rfind("/")
        dot = fileName.rfind(".")
        if (dot < 0) or (dot < slash) or (fileName[dot:] not in (".fa", ".fasta")):
            raise AssertionError(
                "bad fasta file name %s (it has to end with .fa or .fasta)"
                % fileName)

    if anonymous and (len(fileNames) > 1):
        raise AssertionError(
            "can't use anonymous when you have multiple fasta files")

    if len(fileNames) > 255:
        raise AssertionError("too many input files (max is 255)")

    # set up big- or little-endian

    if bigEndian:
        write4 = write4_big_endian
        write5 = write5_big_endian
        write6 = write6_big_endian
    else:
        write4 = write4_little_endian
        write5 = write5_little_endian
        write6 = write6_little_endian

    ##########
    # read the fasta file(s)
    ##########

    # an empty file name stands for stdin
    if fileNames == []:
        fileNames.append("")

    showProgress = "progress" in debug
    fileNameToNum = {}
    sequences = []
    nameSeen = {}

    for (fileNum, fileName) in enumerate(fileNames):
        if fileName in fileNameToNum:
            raise AssertionError("can't use the same file twice (%s)" % fileName)
        fileNameToNum[fileName] = fileNum

        if fileName == "":
            f = sys.stdin
        else:
            try:
                f = open(fileName, "rt")
            except IOError:
                raise AssertionError("unable to open %s" % fileName)

        try:
            scanner = fasta_sequences(f, twoByteLFs=isWindows)
            for (seqNum, seqInfo) in enumerate(scanner, 1):
                (name, length, lineNum, headerOffset, dataOffset) = seqInfo
                where = (fileName, lineNum)

                if name in nameSeen:
                    raise AssertionError(
                        "%s is used for two sequences (at %s and %s)"
                        % (name,
                           line_reference(nameSeen[name]),
                           line_reference(where)))
                nameSeen[name] = where

                if length == 0:
                    if keepEmpties:
                        note("WARNING: keeping empty sequence %s (%s)"
                             % (name, line_reference(where)))
                    else:
                        note("WARNING: discarding empty sequence %s (%s)"
                             % (name, line_reference(where)))
                        continue

                if skipHeader:
                    sequences.append((name, length, fileNum, dataOffset))
                else:
                    sequences.append((name, length, fileNum, headerOffset))

                if (progress is not None) and (seqNum % progress == 0):
                    note("read sequence %d (%s)" % (seqNum, name))
        finally:
            # close the file even if scanning fails; never close stdin
            if fileName != "":
                f.close()

        if showProgress:
            if fileName != "":
                note("finished reading %s" % fileName)
            else:
                note("finished reading input file")

    # scan collected sequence info and assign hash values

    numSequences = len(sequences)
    if numSequences == 0:
        raise AssertionError("input file contains no sequences!")

    if numBuckets is None:
        # integer ceiling division; an explicit --numbuckets is used as given
        numBuckets = max(1, (numSequences + avgBucket - 1) // avgBucket)
        if oddBuckets and (numBuckets % 2 == 0):
            numBuckets += 1

    # sorting by bucket number groups each bucket's sequences together
    sequences = sorted(
        (HsxFile.hash(name) % numBuckets, name, length, fileNum, offset)
        for (name, length, fileNum, offset) in sequences)

    if showProgress:
        note("finished computing hashes")

    if "info" in debug:
        for (bucket, name, length, fileNum, offset) in sequences:
            note("%10d==%08X %2d:%08X %s %d"
                 % (HsxFile.hash(name), bucket, fileNum, offset, name, length))

    ##########
    # write the index
    ##########

    # decide how we will write the file names;  fileInfo is in file-number
    # order and holds (extension, base name, offset within the file info)

    fileInfo = []
    fileInfoLength = 0

    for fileName in fileNames:
        fastaName = ""
        fastaExt = "fa"
        if fileName != "":
            dot = fileName.rfind(".")
            fastaExt = fileName[dot + 1:]
            if not anonymous:
                fastaName = fileName[:dot]
        fileInfo.append((fastaExt, fastaName, fileInfoLength))
        fileInfoLength += len(fastaExt) + 1 + len(fastaName) + 1

    # determine header and table sizes

    headerLength = 0x1C
    headerPad = pad_for_16(8 + headerLength)
    headerSize = headerLength + headerPad

    numFiles = len(fileNames)
    fileTableOffset = 0x08 + headerSize
    fileTableLength = numFiles * 4
    fileTablePad = pad_for_16(fileTableLength)
    fileTableSize = fileTableLength + fileTablePad

    fileInfoOffset = fileTableOffset + fileTableSize
    fileInfoPad = pad_for_16(fileInfoLength)
    fileInfoSize = fileInfoLength + fileInfoPad

    hashTableOffset = fileInfoOffset + fileInfoSize
    hashTableLength = (numBuckets + 1) * 5
    hashTablePad = pad_for_16(hashTableLength)
    if "hashpad" in screwup:
        hashTablePad = -1  # deliberately wrong, to exercise validation
    hashTableSize = hashTableLength + hashTablePad

    seqTableOffset = hashTableOffset + hashTableSize

    if "file" in debug:
        note("fileTableOffset = %08X (%08X)" % (fileTableOffset, fileTableSize))
        note("fileInfoOffset = %08X (%08X)" % (fileInfoOffset, fileInfoSize))
        note("hashTableOffset = %08X (%08X)" % (hashTableOffset, hashTableSize))
        note("seqTableOffset = %08X" % seqTableOffset)

    # determine offsets into the sequence table;  bucketStarts holds, for
    # each non-empty bucket in ascending order, the offset of the bucket's
    # first sequence table entry

    bucketStarts = []
    entryOffset = seqTableOffset
    prevBucket = None
    for (bucket, name, length, fileNum, offset) in sequences:
        if bucket != prevBucket:
            bucketStarts.append((bucket, entryOffset))
            prevBucket = bucket
        # 12 bytes for the write5 + write1 + write6 fields written below,
        # plus the name and one extra byte for writeString
        entryOffset += 12 + len(name) + 1
    seqTableEnd = entryOffset  # offset just past the sequence table

    # write header

    write4(HsxFile.magicBig)
    write4(HsxFile.version)
    write4(headerLength)
    write4(numFiles)
    write4(fileTableOffset)
    write4(numBuckets)
    write4(hashTableOffset)
    write4(numSequences)
    write4(seqTableOffset)
    writeZeros(headerPad)

    if showProgress:
        note("finished writing header")

    # write file table and file info

    for (fastaExt, fastaName, infoOffset) in fileInfo:
        write4(fileInfoOffset + infoOffset)
    writeZeros(fileTablePad)

    for (fastaExt, fastaName, infoOffset) in fileInfo:
        writeString(fastaExt)
        writeString(fastaName)
    writeZeros(fileInfoPad)

    if showProgress:
        note("finished writing file table")

    # write hash table;  exactly numBuckets+1 entries are written, matching
    # hashTableLength.  An empty bucket is written as the offset of the next
    # sequence entry with the most significant bit of the 5-byte value set;
    # this includes empty buckets that precede the first occupied bucket.

    msBit5 = 0x80 << (4 * 8)

    def bucket_written(count):
        if (progress is not None) and (count % progress == 0):
            note("wrote hash bucket %d" % count)

    nextBucket = 0  # number of bucket entries written so far
    for (bucket, firstOffset) in bucketStarts:
        while nextBucket < bucket:
            write5(msBit5 + firstOffset)
            nextBucket += 1
            bucket_written(nextBucket)
        write5(firstOffset)
        nextBucket += 1
        bucket_written(nextBucket)

    while nextBucket < numBuckets:
        write5(msBit5 + seqTableEnd)
        nextBucket += 1
        bucket_written(nextBucket)

    # output extra bucket
    write5(msBit5 + seqTableEnd)
    writeZeros(hashTablePad)

    if showProgress:
        note("finished writing hash table")

    # write sequence table

    for (seqNum, seqEntry) in enumerate(sequences, 1):
        (bucket, name, length, fileNum, offset) = seqEntry
        write5(length)     # length of the sequence
        write1(fileNum)    # file number (index into file table)
        write6(offset)     # offset to the sequence data
        writeString(name)  # name of sequence
        if (progress is not None) and (seqNum % progress == 0):
            note("wrote sequence entry %d" % seqNum)

    if showProgress:
        note("finished writing index")
def main():
    """Look up named sequences in an hsx-indexed file and print them.

    Arguments are read from sys.argv.  The first bare argument is the hsx
    file name; every later bare argument is a sequence name.

    Recognized arguments:
      --names=<file>    read sequence names from <file>, one per line
      --nowarn          do not warn about names that are not found
      --progress        echo each found name to stderr
      --debug[=<what>]  debug settings, passed on to HsxFile
      --help, -h        show usage

    Each sequence found is printed to stdout.  A name that HsxFile reports
    as missing (get_sequence() returns None) produces a warning on stderr
    unless --nowarn was given.
    """

    ##########
    # parse the command line
    ##########

    hsxFileName = None
    seqNames = []
    warnOnMissing = True
    showProgress = False
    debug = []

    for arg in sys.argv[1:]:
        val = None
        if "=" in arg:
            (arg, val) = arg.split("=", 1)
            if val == "":
                usage("missing a value in %s=" % arg)

        if (arg in ("--help", "-h", "--h", "-help")) and (val is None):
            usage()
        elif (arg == "--names") and (val is not None):
            # "with" closes the names file even if reading it fails
            with open(val) as f:
                seqNames.extend(line.strip() for line in f)
        elif (arg == "--nowarn") and (val is None):
            warnOnMissing = False
        elif (arg == "--progress") and (val is None):
            showProgress = True
        elif (arg == "--debug") and (val is None):
            debug.append("debug")
        elif (arg == "--debug") and (val is not None):
            debug.append(val)
        elif arg.startswith("--"):
            usage("unknown argument: %s" % arg)
        elif (hsxFileName is None) and (val is None):
            hsxFileName = arg
        elif val is None:
            seqNames.append(arg)
        else:
            usage("unknown argument: %s" % arg)

    if hsxFileName is None:
        usage("you must give me an hsx file!")

    if seqNames == []:
        usage("you must give me some sequence names!")

    ##########
    # fetch the sequences
    ##########

    hsx = HsxFile(hsxFileName, debug=debug)
    try:
        for name in seqNames:
            seq = hsx.get_sequence(name)
            if seq is not None:
                print(seq)
                if showProgress:
                    print(name, file=sys.stderr)
            elif warnOnMissing:
                print("WARNING: %s not found" % name, file=sys.stderr)
    finally:
        # release the hsx file even if a lookup raises
        hsx.close()
def main():
    """Build an hsx index for one or more fasta files.

    Arguments are read from sys.argv.  Each fasta file (or stdin, when no
    file is named) is scanned with fasta_sequences(); every sequence is
    assigned to a hash bucket using HsxFile.hash(); then the header, file
    table, file info, hash table and sequence table are emitted, in that
    order, through the module's write helpers (write1/write4/write5/write6,
    writeString, writeZeros).

    Recognized arguments:
      --bucketsize=<n>   average sequences per bucket (default 10)
      --numbuckets=<n>   explicit bucket count (overrides --bucketsize)
      --anonymous        do not store the fasta file's base name
      --skipheader       index the sequence-data offset, not the header offset
      --windows          passed to fasta_sequences() as twoByteLFs
      --bigendian        use the big-endian write helpers
      --oddbuckets       make a computed bucket count odd
      --keepempties      index zero-length sequences instead of dropping them
      --screwup=<what>   deliberately corrupt the output ("hashpad")
      --debug[=<what>]   extra stderr output ("file", "info", "progress")
      --progress[=<n>]   progress messages, optionally every n items
      <name>.fa|.fasta   input file(s), at most 255

    Raises:
        AssertionError: for an invalid option value, a bad or repeated file
            name, an unreadable file, a repeated sequence name, or input
            that contains no sequences.
    """
    global write4, write5, write6

    def note(message):
        # diagnostics go to stderr; a plain write() behaves the same on
        # python 2 and python 3, unlike the "print >>" statement
        sys.stderr.write(message + "\n")

    ##########
    # parse the command line
    ##########

    avgBucket = 10
    numBuckets = None
    anonymous = False
    skipHeader = False
    isWindows = False
    fileNames = []
    bigEndian = False
    oddBuckets = False
    keepEmpties = False
    screwup = []  # (so that we can verify that validation works!)
    debug = []
    progress = None

    for arg in sys.argv[1:]:
        val = None
        if "=" in arg:
            (arg, val) = arg.split("=", 1)
            if val == "":
                usage("missing a value in %s=" % arg)

        # errors are raised explicitly (not via "assert") so that the checks
        # are still performed when python runs with -O
        if (arg == "--bucketsize") and (val is not None):
            try:
                avgBucket = int(val)
                if avgBucket < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid bucket size: %s" % val)
        elif (arg == "--numbuckets") and (val is not None):
            try:
                numBuckets = int(val)
                if numBuckets < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid number of buckets: %s" % val)
        elif (arg == "--secondary") and (val is None):
            raise AssertionError("secondary hash is not implemented yet (sorry)")
        elif (arg == "--anonymous") and (val is None):
            anonymous = True
        elif (arg == "--skipheader") and (val is None):
            skipHeader = True
        elif (arg == "--windows") and (val is None):
            isWindows = True
        elif (arg == "--bigendian") and (val is None):
            bigEndian = True
        elif (arg == "--oddbuckets") and (val is None):
            oddBuckets = True
        elif (arg == "--keepempties") and (val is None):
            keepEmpties = True
        elif (arg == "--screwup") and (val is not None):
            screwup.append(val)
        elif (arg == "--debug") and (val is None):
            debug.append("debug")
        elif (arg == "--debug") and (val is not None):
            debug.append(val)
        elif (arg == "--progress") and (val is None):
            debug.append("progress")
            progress = None
        elif (arg == "--progress") and (val is not None):
            debug.append("progress")
            # the interval is used as a modulus below, so it must be a
            # positive integer
            try:
                progress = int(val)
                if progress < 1:
                    raise ValueError
            except ValueError:
                raise AssertionError("invalid progress interval: %s" % val)
        elif arg.startswith("--"):
            usage("unknown argument: %s" % arg)
        elif val is None:
            fileNames.append(arg)
        else:
            usage("unknown argument: %s" % arg)

    # sanity check on file names

    for fileName in fileNames:
        slash = fileName.rfind("/")
        dot = fileName.rfind(".")
        if (dot < 0) or (dot < slash) or (fileName[dot:] not in (".fa", ".fasta")):
            raise AssertionError(
                "bad fasta file name %s (it has to end with .fa or .fasta)"
                % fileName)

    if anonymous and (len(fileNames) > 1):
        raise AssertionError(
            "can't use anonymous when you have multiple fasta files")

    if len(fileNames) > 255:
        raise AssertionError("too many input files (max is 255)")

    # set up big- or little-endian

    if bigEndian:
        write4 = write4_big_endian
        write5 = write5_big_endian
        write6 = write6_big_endian
    else:
        write4 = write4_little_endian
        write5 = write5_little_endian
        write6 = write6_little_endian

    ##########
    # read the fasta file(s)
    ##########

    # an empty file name stands for stdin
    if fileNames == []:
        fileNames.append("")

    showProgress = "progress" in debug
    fileNameToNum = {}
    sequences = []
    nameSeen = {}

    for (fileNum, fileName) in enumerate(fileNames):
        if fileName in fileNameToNum:
            raise AssertionError("can't use the same file twice (%s)" % fileName)
        fileNameToNum[fileName] = fileNum

        if fileName == "":
            f = sys.stdin
        else:
            try:
                f = open(fileName, "rt")
            except IOError:
                raise AssertionError("unable to open %s" % fileName)

        try:
            scanner = fasta_sequences(f, twoByteLFs=isWindows)
            for (seqNum, seqInfo) in enumerate(scanner, 1):
                (name, length, lineNum, headerOffset, dataOffset) = seqInfo
                where = (fileName, lineNum)

                if name in nameSeen:
                    raise AssertionError(
                        "%s is used for two sequences (at %s and %s)"
                        % (name,
                           line_reference(nameSeen[name]),
                           line_reference(where)))
                nameSeen[name] = where

                if length == 0:
                    if keepEmpties:
                        note("WARNING: keeping empty sequence %s (%s)"
                             % (name, line_reference(where)))
                    else:
                        note("WARNING: discarding empty sequence %s (%s)"
                             % (name, line_reference(where)))
                        continue

                if skipHeader:
                    sequences.append((name, length, fileNum, dataOffset))
                else:
                    sequences.append((name, length, fileNum, headerOffset))

                if (progress is not None) and (seqNum % progress == 0):
                    note("read sequence %d (%s)" % (seqNum, name))
        finally:
            # close the file even if scanning fails; never close stdin
            if fileName != "":
                f.close()

        if showProgress:
            if fileName != "":
                note("finished reading %s" % fileName)
            else:
                note("finished reading input file")

    # scan collected sequence info and assign hash values

    numSequences = len(sequences)
    if numSequences == 0:
        raise AssertionError("input file contains no sequences!")

    if numBuckets is None:
        # integer ceiling division: gives the same answer whether "/" is
        # integer or true division, and can never produce zero buckets;
        # an explicit --numbuckets is used as given
        numBuckets = max(1, (numSequences + avgBucket - 1) // avgBucket)
        if oddBuckets and (numBuckets % 2 == 0):
            numBuckets += 1

    # sorting by bucket number groups each bucket's sequences together
    sequences = sorted(
        (HsxFile.hash(name) % numBuckets, name, length, fileNum, offset)
        for (name, length, fileNum, offset) in sequences)

    if showProgress:
        note("finished computing hashes")

    if "info" in debug:
        for (bucket, name, length, fileNum, offset) in sequences:
            note("%10d==%08X %2d:%08X %s %d"
                 % (HsxFile.hash(name), bucket, fileNum, offset, name, length))

    ##########
    # write the index
    ##########

    # decide how we will write the file names;  fileInfo is in file-number
    # order and holds (extension, base name, offset within the file info)

    fileInfo = []
    fileInfoLength = 0

    for fileName in fileNames:
        fastaName = ""
        fastaExt = "fa"
        if fileName != "":
            dot = fileName.rfind(".")
            fastaExt = fileName[dot + 1:]
            if not anonymous:
                fastaName = fileName[:dot]
        fileInfo.append((fastaExt, fastaName, fileInfoLength))
        fileInfoLength += len(fastaExt) + 1 + len(fastaName) + 1

    # determine header and table sizes

    headerLength = 0x1C
    headerPad = pad_for_16(8 + headerLength)
    headerSize = headerLength + headerPad

    numFiles = len(fileNames)
    fileTableOffset = 0x08 + headerSize
    fileTableLength = numFiles * 4
    fileTablePad = pad_for_16(fileTableLength)
    fileTableSize = fileTableLength + fileTablePad

    fileInfoOffset = fileTableOffset + fileTableSize
    fileInfoPad = pad_for_16(fileInfoLength)
    fileInfoSize = fileInfoLength + fileInfoPad

    hashTableOffset = fileInfoOffset + fileInfoSize
    hashTableLength = (numBuckets + 1) * 5
    hashTablePad = pad_for_16(hashTableLength)
    if "hashpad" in screwup:
        hashTablePad = -1  # deliberately wrong, to exercise validation
    hashTableSize = hashTableLength + hashTablePad

    seqTableOffset = hashTableOffset + hashTableSize

    if "file" in debug:
        note("fileTableOffset = %08X (%08X)" % (fileTableOffset, fileTableSize))
        note("fileInfoOffset = %08X (%08X)" % (fileInfoOffset, fileInfoSize))
        note("hashTableOffset = %08X (%08X)" % (hashTableOffset, hashTableSize))
        note("seqTableOffset = %08X" % seqTableOffset)

    # determine offsets into the sequence table;  bucketStarts holds, for
    # each non-empty bucket in ascending order, the offset of the bucket's
    # first sequence table entry

    bucketStarts = []
    entryOffset = seqTableOffset
    prevBucket = None
    for (bucket, name, length, fileNum, offset) in sequences:
        if bucket != prevBucket:
            bucketStarts.append((bucket, entryOffset))
            prevBucket = bucket
        # 12 bytes for the write5 + write1 + write6 fields written below,
        # plus the name and one extra byte for writeString
        entryOffset += 12 + len(name) + 1
    seqTableEnd = entryOffset  # offset just past the sequence table

    # write header

    write4(HsxFile.magicBig)
    write4(HsxFile.version)
    write4(headerLength)
    write4(numFiles)
    write4(fileTableOffset)
    write4(numBuckets)
    write4(hashTableOffset)
    write4(numSequences)
    write4(seqTableOffset)
    writeZeros(headerPad)

    if showProgress:
        note("finished writing header")

    # write file table and file info

    for (fastaExt, fastaName, infoOffset) in fileInfo:
        write4(fileInfoOffset + infoOffset)
    writeZeros(fileTablePad)

    for (fastaExt, fastaName, infoOffset) in fileInfo:
        writeString(fastaExt)
        writeString(fastaName)
    writeZeros(fileInfoPad)

    if showProgress:
        note("finished writing file table")

    # write hash table;  exactly numBuckets+1 entries are written, matching
    # hashTableLength.  An empty bucket is written as the offset of the next
    # sequence entry with the most significant bit of the 5-byte value set;
    # this includes empty buckets that precede the first occupied bucket.

    msBit5 = 0x80 << (4 * 8)

    def bucket_written(count):
        if (progress is not None) and (count % progress == 0):
            note("wrote hash bucket %d" % count)

    nextBucket = 0  # number of bucket entries written so far
    for (bucket, firstOffset) in bucketStarts:
        while nextBucket < bucket:
            write5(msBit5 + firstOffset)
            nextBucket += 1
            bucket_written(nextBucket)
        write5(firstOffset)
        nextBucket += 1
        bucket_written(nextBucket)

    while nextBucket < numBuckets:
        write5(msBit5 + seqTableEnd)
        nextBucket += 1
        bucket_written(nextBucket)

    # output extra bucket
    write5(msBit5 + seqTableEnd)
    writeZeros(hashTablePad)

    if showProgress:
        note("finished writing hash table")

    # write sequence table

    for (seqNum, seqEntry) in enumerate(sequences, 1):
        (bucket, name, length, fileNum, offset) = seqEntry
        write5(length)     # length of the sequence
        write1(fileNum)    # file number (index into file table)
        write6(offset)     # offset to the sequence data
        writeString(name)  # name of sequence
        if (progress is not None) and (seqNum % progress == 0):
            note("wrote sequence entry %d" % seqNum)

    if showProgress:
        note("finished writing index")
def main():
    """Look up named sequences in an hsx-indexed file and print them.

    Arguments are read from sys.argv.  The first bare argument is the hsx
    file name; every later bare argument is a sequence name.

    Recognized arguments:
      --names=<file>    read sequence names from <file>, one per line
      --nowarn          do not warn about names that are not found
      --progress        echo each found name to stderr
      --debug[=<what>]  debug settings, passed on to HsxFile

    Each sequence found is written to stdout followed by a newline.  A name
    that HsxFile reports as missing (get_sequence() returns None) produces
    a warning on stderr unless --nowarn was given.
    """

    ##########
    # parse the command line
    ##########

    hsxFileName = None
    seqNames = []
    warnOnMissing = True
    showProgress = False
    debug = []

    for arg in sys.argv[1:]:
        val = None
        if "=" in arg:
            (arg, val) = arg.split("=", 1)
            if val == "":
                usage("missing a value in %s=" % arg)

        if (arg == "--names") and (val is not None):
            # open() instead of the python-2-only file() builtin;  "with"
            # closes the names file even if reading it fails
            with open(val) as f:
                seqNames.extend(line.strip() for line in f)
        elif (arg == "--nowarn") and (val is None):
            warnOnMissing = False
        elif (arg == "--progress") and (val is None):
            showProgress = True
        elif (arg == "--debug") and (val is None):
            debug.append("debug")
        elif (arg == "--debug") and (val is not None):
            debug.append(val)
        elif arg.startswith("--"):
            usage("unknown argument: %s" % arg)
        elif (hsxFileName is None) and (val is None):
            hsxFileName = arg
        elif val is None:
            seqNames.append(arg)
        else:
            usage("unknown argument: %s" % arg)

    if hsxFileName is None:
        usage("you must give me an hsx file!")

    if seqNames == []:
        usage("you must give me some sequence names!")

    ##########
    # fetch the sequences
    ##########

    # explicit stream writes produce the same output as the print statement
    # on python 2 and are also valid syntax on python 3
    hsx = HsxFile(hsxFileName, debug=debug)
    try:
        for name in seqNames:
            seq = hsx.get_sequence(name)
            if seq is not None:
                sys.stdout.write(str(seq) + "\n")
                if showProgress:
                    sys.stderr.write("%s\n" % name)
            elif warnOnMissing:
                sys.stderr.write("WARNING: %s not found\n" % name)
    finally:
        # release the hsx file even if a lookup raises
        hsx.close()