def __init__(self, seqIdToBp, seqIdToPred, seqIdToTruePred, taxonomy, correctLabelThreshold=None):
    """
        Initializes the accuracy object.

        @param seqIdToBp: dictionary or a fasta file
        @param seqIdToPred: dictionary or a prediction file
        @param seqIdToTruePred: dictionary or a true prediction file
        @param taxonomy: database file in the sqlite3 format, or a taxonomy object retrieved from
            a not yet closed Accuracy object
    """
    if isinstance(seqIdToBp, dict):
        self._seqToBp = seqIdToBp
    else:
        assert os.path.isfile(seqIdToBp)
        self._seqToBp = fasta.getSequenceToBpDict(seqIdToBp)

    if isinstance(seqIdToPred, dict):
        self._seqToPred = seqIdToPred
    else:
        assert os.path.isfile(seqIdToPred)
        self._seqToPred = cami.readAssignments(seqIdToPred)

    if isinstance(seqIdToTruePred, dict):
        self._seqToTrue = seqIdToTruePred
    else:
        assert os.path.isfile(seqIdToTruePred)
        self._seqToTrue = cami.readAssignments(seqIdToTruePred)

    if isinstance(taxonomy, _TaxonomyWrapperA):
        self._taxonomy = taxonomy
    else:
        assert os.path.isfile(taxonomy)
        self._taxonomy = _TaxonomyWrapperA(taxonomy)

    # correct the predictions stored in self._seqToPred
    if correctLabelThreshold is not None:
        self._seqToPred = self._correctPredictions(
            self._seqToBp, self._seqToPred, self._seqToTrue, self._taxonomy, correctLabelThreshold)
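# Example usage (a minimal sketch, not part of the original module; the file names, rank list and
# threshold values are hypothetical placeholders - only the constructor, getAccuracyPrint() and
# close() calls are taken from how this class is used in _main below):
#
#     acc = Accuracy('contigs.fna', 'assignments.csv', 'true_labels.csv', 'ncbi_taxonomy.db')
#     print(acc.getAccuracyPrint(['phylum', 'class', 'order'], 0.01, 0.01))
#     acc.close()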
def __init__(self, seqNameToBp, seqNameToPred, seqNameToRefPred, taxonomy, ranksList=None):
    """
        Initializes the main class that computes the confusion matrices.

        @param seqNameToBp: mapping from a sequence name to its length in bp (as int); or a fasta file
        @type seqNameToBp: dict; or a fasta file
        @param seqNameToPred: mapping from a sequence name to a taxonId; or a tab separated prediction file
        @type seqNameToPred: dict; or a tab separated file, first column ~ sequence name, last column ~ taxonId
        @param seqNameToRefPred: mapping from a sequence name to a taxonId; or a tab separated reference file
        @type seqNameToRefPred: dict; or a tab separated file, first column ~ sequence name, last column ~ taxonId
        @param ranksList: list of ranks for which the confusion matrices will be computed (None ~ all default ranks)
        @type ranksList: list of str
        @param taxonomy: database file in the sqlite3 format; or the taxonomy returned by function "getTaxonomy"
    """
    # Check the input options and read in the data (where appropriate)
    self._initFailed = False  # TODO: replace this with exceptions!

    if isinstance(seqNameToBp, dict):
        self._seqNameToBp = seqNameToBp
    elif isinstance(seqNameToBp, str) and os.path.isfile(seqNameToBp):
        self._seqNameToBp = fas.getSequenceToBpDict(seqNameToBp)
    else:
        print("Can't get sequence info from:", seqNameToBp)
        self._initFailed = True
        return

    if isinstance(seqNameToPred, dict):
        self._seqNameToPred = seqNameToPred
    elif isinstance(seqNameToPred, str) and os.path.isfile(seqNameToPred):
        self._seqNameToPred = cami.readAssignments(seqNameToPred)
    else:
        print("Can't get prediction info from:", seqNameToPred)
        self._initFailed = True
        return

    if isinstance(seqNameToRefPred, dict):
        self._seqNameToRefPred = seqNameToRefPred
    elif isinstance(seqNameToRefPred, str) and os.path.isfile(seqNameToRefPred):
        self._seqNameToRefPred = cami.readAssignments(seqNameToRefPred)
    else:
        print("Can't get reference prediction info from:", seqNameToRefPred)
        self._initFailed = True
        return

    if isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapCM(taxonomy)
    elif isinstance(taxonomy, _TaxonomyWrapCM):
        self._taxonomy = taxonomy
    else:
        print("Can't use taxonomy:", taxonomy)
        self._initFailed = True
        return

    if ranksList is None:
        ranksList = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # default ranks
    else:
        allowedRanksSet = set(taxonomy_ncbi.TAXONOMIC_RANKS[1:])  # custom ranks
        for rank in ranksList:
            if rank not in allowedRanksSet:
                print('Rank: "' + str(rank) + '" is not allowed!')
                self._initFailed = True
                return

    rankIdsList = []  # rankIds that will be considered
    for rank in ranksList:
        rankIdsList.append(self._taxonomy.getRankId(rank))
    self._allowedRankIdsSet = set(rankIdsList)

    # get predictions at the individual taxonomic ranks
    # rankId -> (seqId -> taxonIdAtRank)
    self._rankIdToPredMap = {}
    self._rankIdToRefMap = {}
    for rankId in rankIdsList:
        self._rankIdToPredMap[rankId] = {}
        self._rankIdToRefMap[rankId] = {}

    # get predictions at the given ranks
    for seqId, taxonId in self._seqNameToPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToPredMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)

    # get reference predictions at the given ranks
    for seqId, taxonId in self._seqNameToRefPred.iteritems():
        while (taxonId is not None) and (taxonId != 1):
            rankId = self._taxonomy.getRankIdOfTaxonId(taxonId)
            if rankId in self._allowedRankIdsSet:
                self._rankIdToRefMap[rankId][seqId] = taxonId
            taxonId = self._taxonomy.getParent(taxonId)
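# Example usage (a minimal sketch, not part of the original module; the file names, rank list and
# output path are hypothetical placeholders - the constructor, generateConfusionMatrix() and close()
# calls mirror how the class is used in _main below):
#
#     ranks = ['phylum', 'class', 'order']
#     cm = ConfusionMatrix('contigs.fna', 'assignments.csv', 'reference.csv', 'ncbi_taxonomy.db', ranks)
#     for rank in ranks:
#         cm.generateConfusionMatrix(rank, 'output_dir/confusion_matrix')
#     cm.close()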
def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy, minScaffContigCount=None,
             minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True, ignoreScaffPredToRoot=True):
    """
        Initializes the main Consistency class.

        @param contigNameToBp: dictionary that maps contig names to bp (int); or a fasta file that contains the contigs
        @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
            or a prediction file - first column contig name, last column ncbid
        @param scaffToContigList: dictionary that maps scaffold names to lists of contig names;
            or a file - first column scaffold name, second column contig name
        @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
        @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
        @param cladesSet: consider only scaffolds that contain at least one contig from this set
        @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds (as artificial scaffolds)
        @param ignoreScaffPredToRoot: ignore scaffolds that are assigned to the root (uninformative)
    """
    # check the input options
    assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
    assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
    assert cladesSet is None or isinstance(cladesSet, set)
    assert isinstance(considerContigWithNoScaff, bool)
    assert isinstance(ignoreScaffPredToRoot, bool)

    if isinstance(contigNameToBp, dict):
        self._contigNameToBp = contigNameToBp
    elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
        self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
    else:
        print("Can't get contig info from:", contigNameToBp)
        return

    if isinstance(contigNameToNcbid, dict):
        self._contigToPred = contigNameToNcbid
    elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
        self._contigToPred = cami.readAssignments(contigNameToNcbid)
    else:
        print("Can't get prediction info from:", contigNameToNcbid)
        return

    if isinstance(scaffToContigList, dict):
        self._scaffToContigsList = scaffToContigList
    elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
        self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
    else:
        print("Can't get scaffold-contig mapping from:", scaffToContigList)
        return

    if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
        self._taxonomy = taxonomy
    elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
        self._taxonomy = _TaxonomyWrapper(taxonomy)
    else:
        print("Can't use taxonomy:", taxonomy)
        return

    # check the consistency of the data:
    # if a contig that is defined in the mapping doesn't exist (in the fasta file), remove it
    for scaff, contigsList in self._scaffToContigsList.iteritems():
        removeList = []
        for contig in contigsList:
            if contig not in self._contigNameToBp:
                removeList.append(contig)
        for contig in removeList:
            contigsList.remove(contig)

    # if a contig was predicted but no scaffold is assigned to it, assign the
    # contig to an "artificial scaffold"
    if considerContigWithNoScaff:
        scaffContigSet = set()
        for s, l in self._scaffToContigsList.iteritems():
            for c in l:
                scaffContigSet.add(c)
        aloneContigSet = set()
        for c in self._contigToPred:
            if c not in scaffContigSet:
                aloneContigSet.add(c)
        for c in aloneContigSet:
            scaffName = str('scaffold_' + c)  # make up a scaffold name
            assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
            self._scaffToContigsList[scaffName] = [c]

    # filter out scaffolds according to the input constraints
    self._scaffolds = dict()
    for scaffName, contigsList in self._scaffToContigsList.iteritems():
        if minScaffContigCount is not None:
            if len(contigsList) < minScaffContigCount:
                continue
        if minScaffBpLen is not None:
            bpSum = 0
            for contig in contigsList:
                bpSum += self._contigNameToBp[contig]
            if bpSum < minScaffBpLen:
                continue
        if cladesSet is not None:
            passScaff = False
            for contig in contigsList:
                if (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet):
                    passScaff = True
                    break
            if not passScaff:
                continue
        # process the scaffold, but if everything in the scaffold was assigned to the root, ignore it
        s = self._processScaffold(scaffName)
        if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
            self._scaffolds[scaffName] = s
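# Example usage (a minimal sketch, not part of the original module; the file names are hypothetical
# placeholders - the constructor, getGroupedScaffoldsPrint() and close() calls mirror how the class
# is used in _main below):
#
#     cons = Consistency('contigs.fna', 'assignments.csv', 'scaffold_contig_mapping.tsv', 'ncbi_taxonomy.db')
#     print(cons.getGroupedScaffoldsPrint())
#     cons.close()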
def _main():
    # define the command line arguments
    parser = argparse.ArgumentParser(description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument('-b', '--cont-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing labels assigned to contigs.',
                        metavar='assignments.csv', dest='b')

    parser.add_argument('-t', '--cont-true-binning-file', nargs=1, type=file, required=True,
                        help='Binning file containing true labels for the contigs.',
                        metavar='labels.csv', dest='t')

    parser.add_argument('-f', '--cont-contigs-file-listing', nargs=1, type=file, required=False,
                        help='A list of paths of FASTA contigs files.',
                        metavar='fasta_listing.txt', dest='f')

    parser.add_argument('-m', '--cont-scaffold-contig-mapping', nargs=1, type=file, required=False,
                        help='Scaffold-contig mapping, tab separated.',
                        metavar='mapping.csv', dest='m')

    parser.add_argument('-n', '--cont-ncbi-taxonomy', nargs=1, required=False,
                        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
                        metavar='taxonomy_dir', dest='n')

    parser.add_argument('-o', '--cont-output-dir', nargs=1, required=True,
                        help='Output directory.',
                        metavar='output_dir', dest='o')

    parser.add_argument('-j', '--default-job', nargs='+',
                        help='Which task/job should be performed (p ~ precision/recall, s ~ scaffold-contig '
                             'consistency, c ~ confusion tables; all are computed if not specified)',
                        metavar='', dest='j')

    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None

    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]

    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)

    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)

    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)
        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')

    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in case it doesn't exist yet
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    if args.j and len(args.j) > 0 and len(set(args.j).intersection(set(['p', 's', 'c']))) > 0:
        job = set(args.j)

    # compute precision/recall
    if (job is None or 'p' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing precision/recall')

        # precision/recall - without correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath, CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold-contig consistency
    if (job is None or 's' in args.j) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
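# Example invocation (a minimal sketch; the script name and all paths are hypothetical placeholders,
# only the flags come from the argparse definitions above). Note that although the -f help text
# mentions a listing file, the code as written reads -f directly as a single fasta file:
#
#     python evaluation.py \
#         -b assignments.csv -t labels.csv -f contigs.fna \
#         -m mapping.csv -n taxonomy_dir -o output_dir -j p s c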