Пример #1
0
    def get_df(self, window=100):
        """Build a per-sequence summary table and cache a copy in ``self._df``.

        One row is produced per entry of ``self.fasta.names`` with columns:

        * ``GC``: NaN-ignoring mean of the values returned by
          ``tools._base_content(self.filename, window, "GC")`` for that name
          (presumably windowed GC content; confirm against ``tools``).
        * ``length``: taken from ``self.fasta.lengths``.
        * ``name``: the sequence name.
        * ``nread`` / ``covStat``: parsed from the FASTA comments when
          ``self.mode == "canu"``; left at 0 otherwise.
        * ``chromosome``: only added when ``self.bam`` is truthy; the
          ``rname`` of the alignment whose ``qname`` equals the sequence
          name, or NaN when no such alignment exists.

        :param int window: window size forwarded to ``tools._base_content``.
        :return: the summary :class:`pandas.DataFrame`.
        :raises IndexError: in canu mode, if a comment lacks a ``reads`` or
            ``covStat`` token, or a token has no ``=`` separated value.
        """
        print("building GC content")
        data = tools._base_content(self.filename, window, "GC")
        names = self.fasta.names
        lengths = self.fasta.lengths
        GC = [np.nanmean(data[name]) for name in names]
        nreads = [0] * len(GC)
        covStats = [0] * len(GC)
        if self.mode == "canu":
            # Comments are expected to carry whitespace-separated
            # ``reads=<int>`` and ``covStat=<float>`` tokens.
            for i, comment in enumerate(self.fasta.comments):
                tokens = comment.split()
                reads_tag = [t for t in tokens if t.startswith("reads")][0]
                cov_tag = [t for t in tokens if t.startswith("covStat")][0]
                nreads[i] = int(reads_tag.split("=")[1])
                covStats[i] = float(cov_tag.split("=")[1])

        df = pd.DataFrame({
            "GC": GC,
            "length": lengths,
            "name": names,
            "nread": nreads,
            "covStat": covStats
        })

        # deal with the bamfile
        if self.bam:
            bam_df = self.bam.get_df()
            # Keep only rows whose flag is 0 or 16 (in SAM terms: mapped
            # forward/reverse records without any other flag bit set).
            primary = bam_df.query("flag in [0,16]")
            rname_by_query = primary.set_index("qname")["rname"]
            # reindex (unlike .loc with a list of labels) tolerates sequences
            # that have no retained alignment and yields NaN for them instead
            # of raising KeyError.
            df["chromosome"] = list(rname_by_query.reindex(names))

        self._df = df.copy()
        return df
Пример #2
0
 def get_gc(self, window=100):
     """Return one GC value per sequence, scaled by 100.

     For each name in ``self.fasta.names`` (in that order), takes the
     NaN-ignoring mean of the values returned for that name by
     ``tools._base_content(self.filename, window, "GC")`` and multiplies it
     by 100 (presumably turning a fraction into a percentage; confirm
     against ``tools``).

     :param int window: window size forwarded to ``tools._base_content``.
     :return: list of scaled mean values, one per sequence name.
     """
     data = tools._base_content(self.filename, window, "GC")
     return [100 * np.nanmean(data[name]) for name in self.fasta.names]