Example #1
File: workflow.py  Project: alienzj/EukCC
    def write_outfile(self, outfile=None, result=None):
        if outfile is None:
            outfile = os.path.join(self.cfg["outdir"], "eukcc.tsv")
        # write to output file
        k = [
            "completeness",
            "contamination",
            "max_silent_contamination",
            "node",
            "n",
            "ngenomes",
            "cover",
            "nPlacements",
            "taxid",
            "lineage",
            "taxidlineage",
            "file",
        ]
        with open(outfile, "w") as f:
            f.write("{}\n".format("\t".join(k)))
            if result is None:
                logging.warning("No estimates were written")
                exit(11)
            for p in result:
                # insert the file name
                p["file"] = self.cfg["name"]
                # write to file
                f.write("{}\n".format("\t".join([str(p[key]) for key in k])))

        log("Wrote estimates to: {}".format(outfile))
Example #2
    def estimate(self, hits, outfile, placements):
        hit = {}
        logging.info("Estimating scores now")

        if self.cfg["touch"]:
            file.touch(outfile)
            logging.info("Returning as we only touch")
            return

        r = base.readTSV(hits)
        # count how often each profile was hit
        for row in r:
            hit[row["profile"]] = hit.get(row["profile"], 0) + 1

        # profiles hit at least once vs. profiles hit more than once
        singletons = set(hit.keys())
        multitons = {k for k, v in hit.items() if v > 1}

        # now we can estimate completeness and contamination for each placement
        for i in range(len(placements)):
            s = self.readSet(placements[i]["node"])
            # completeness is the overlap of both sets
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)

            # convert to a percentage and round to 2 decimal places
            placements[i]["completeness"] = round(cmpl * 100, 2)
            placements[i]["contamination"] = round(cont * 100, 2)

        log("Finished estimating")

        # write to output file
        k = [
            "completeness",
            "contamination",
            "node",
            "n",
            "ngenomes",
            "cover",
            "nPlacements",
            "taxid",
            "lineage",
            "taxidlineage",
            "file",
        ]
        with open(outfile, "w") as f:
            f.write("{}\n".format("\t".join(k)))
            for p in placements:
                # insert the file name
                p["file"] = self.cfg["name"]
                # write to file
                f.write("{}\n".format("\t".join([str(p[key]) for key in k])))

        log("Wrote estimates to: {}".format(outfile))

        # done
        return True
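To make the set arithmetic in estimate() concrete, here is a toy illustration of the completeness/contamination formula; the profile names and the marker set are invented for the example:

# profiles hit at least once drive completeness,
# profiles hit more than once drive contamination
hit = {"PTHR1": 1, "PTHR2": 3, "PTHR3": 1}
singletons = set(hit)
multitons = {k for k, v in hit.items() if v > 1}

s = {"PTHR1", "PTHR2", "PTHR4"}                      # a node's marker set
print(round(len(singletons & s) / len(s) * 100, 2))  # 66.67 (completeness)
print(round(len(multitons & s) / len(s) * 100, 2))   # 33.33 (contamination)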
Example #3
File: info.py  Project: alienzj/EukCC
    def readInfo(self, name):
        p = os.path.join(self.cfg["db"], "refpkg", name, "CONTENTS.json")
        # exit with an error if we can't find the file
        if not base.exists(p):
            log("Could not find {}".format(p))
            exit(13)
        # read and return the json content
        with open(p) as json_file:
            j = json.load(json_file)
            return j
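For orientation, the CONTENTS.json read here appears to follow the refpkg convention (a "files" mapping pointing logical names at files inside the package), which is what pkgfile() in Example #4 indexes into; a hypothetical minimal example of what readInfo returns:

import json

# hypothetical CONTENTS.json content; keys and file names are invented
j = json.loads('{"files": {"profile": "profile.hmm", "tree": "tree.nwk"}}')
print(j["files"]["profile"])  # -> profile.hmm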
Example #4
File: info.py  Project: alienzj/EukCC
    def pkgfile(self, name, t):
        """
        Get the path of file type t inside a refpkg package.
        """
        info = self.readInfo(name)
        p = os.path.join(self.cfg["db"], "refpkg", name, info["files"][t])
        if base.exists(p):
            return p
        else:
            log("Could not find: {}".format(p))
            exit(12)
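Taken together with readInfo, the lookup can be sketched as a standalone function; the directory layout mirrors the code above, while the arguments in the usage comment are invented:

import json
import os

# standalone sketch of pkgfile(): resolve a logical file key t inside a
# reference package to a path on disk
def pkgfile(db, name, t):
    with open(os.path.join(db, "refpkg", name, "CONTENTS.json")) as fh:
        info = json.load(fh)
    return os.path.join(db, "refpkg", name, info["files"][t])

# usage (hypothetical): pkgfile("/path/to/db", "some_refpkg", "profile")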
Example #5
    def concatHMM(self):
        # create a dir for this
        hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
        file.isdir(hmmdir)
        hmmconcat = os.path.join(hmmdir, "all.hmm")

        if self.cfg["touch"]:
            file.touch(hmmconcat)
            return hmmconcat

        profiles = set()
        for p in self.placements[self.cfg["placementMethod"]]:
            localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
            with open(localpath) as f:
                for line in f:
                    profiles.add(line.strip())
        # turn profiles into a sorted list
        profiles = sorted(profiles)
        # create all paths for all hmms
        hmmerpaths = [os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile)) for profile in profiles]
        # check whether a previous run already concatenated these exact hmms
        canuseprev = False
        profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
        hashpath = os.path.join(hmmdir, "all.hash")
        if file.exists(hashpath):
            prevhash = None  # guard against an empty hash file
            with open(hashpath) as f:
                for line in f:
                    prevhash = line.strip()
                    break
            canuseprev = prevhash == profilehash

        if canuseprev:
            # we can use the existing file, so no need to continue
            log("Using pressed hmms from last run")
            return hmmconcat

        # concatenate
        if len(profiles) == 0:
            logging.error("We have no profiles to evaluate")
            exit(1)

        log("{} hmm profiles need to be used for estimations".format(len(profiles)))
        log("Concatenating hmms, this might take a while (IO limited)")
        hmmconcat = base.concatenate(hmmconcat, hmmerpaths)
        # press
        log("Pressing hmms")
        hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
        hp.run()

        # save the profile hash for the next run
        with open(hashpath, "w") as f:
            f.write(profilehash)

        return hmmconcat
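The caching step can be isolated into a small sketch: hash the sorted profile list and compare it with the hash stored by the previous run. The function name and the profile ids in the usage comment are invented:

import hashlib

def can_reuse(hashpath, profiles):
    # same hash as concatHMM(): sha256 over the joined, sorted profile ids
    profilehash = hashlib.sha256("_".join(sorted(profiles)).encode()).hexdigest()
    try:
        with open(hashpath) as f:
            return f.readline().strip() == profilehash
    except FileNotFoundError:
        return False

# usage (hypothetical): can_reuse("all.hash", {"PTHR10024", "PTHR10003"})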
Example #6
File: workflow.py  Project: alienzj/EukCC
    def estimate(self, hits, outfile, placements):
        hit = {}
        logging.info("Estimating scores now")

        if self.cfg["touch"]:
            file.touch(outfile)
            logging.info("Returning as we only touch")
            return

        r = base.readTSV(hits)
        # count how often each profile was hit
        for row in r:
            hit[row["profile"]] = hit.get(row["profile"], 0) + 1

        # profiles hit at least once vs. profiles hit more than once
        singletons = set(hit.keys())
        multitons = {k for k, v in hit.items() if v > 1}

        # now we can estimate completeness and contamination for each placement
        for i in range(len(placements)):
            s = self.readSet(placements[i]["node"])
            placements[i]["set"] = s
            # completeness is the overlap of both sets
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)

            # convert to a percentage and round to 2 decimal places
            placements[i]["completeness"] = round(cmpl * 100, 2)
            placements[i]["contamination"] = round(cont * 100, 2)

        # compute silent fraction per placement and set
        self.get_silent_contig(self._clean_fasta, hits, placements)

        log("Finished estimating")
        self.write_outfile(outfile, placements)

        # done
        return True
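The delegated writer (write_outfile, Example #1) indexes each placement dict with a fixed column list, so every column must exist as a key; a toy sketch of that row assembly, with invented values:

# each column in k must be a key of the placement dict p
k = ["completeness", "contamination", "node", "file"]
p = {"completeness": 98.5, "contamination": 1.2, "node": "n42", "file": "bin1.fa"}
print("\t".join(str(p[key]) for key in k))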
Example #7
File: EukCC.py  Project: shulp2211/EukCC
parser.add_argument('--quiet',
                    '-q',
                    dest='quiet',
                    action='store_true',
                    default=False,
                    help='silence most output')
parser.add_argument('--debug',
                    '-d',
                    action='store_true',
                    default=False,
                    help='debug and thus ignore safety')
args = parser.parse_args()

###############################################
# starting the analysis
log("Running eukcc for {} bin{}".format(len(args.fasta),
                                        "s" if len(args.fasta) > 1 else ""))

# create the output directory if it does not exist
if not file.isdir(args.outdir):
    exit()

# check if a protein fasta was passed (implied by passing a bed file)
if args.bed is not None:
    # set no glob
    args.noglob = True
    args.isprotein = True
else:
    args.isprotein = False

# check if we can expand glob:
if len(args.fasta) == 1 and not args.noglob: