Example #1
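This helper builds a heatmap profile for each data file by fanning `load_heatmap_data` calls out over a `multiprocessing` pool, falling back to a plain serial loop if anything in the pool raises. The snippet references `datafiles` and `ncpus` without defining them, so they are treated as parameters below (an assumption about the surrounding code).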
import multiprocessing
import sys

from fluff.fluffio import load_heatmap_data


def load_data(featurefile, datafiles, amount_bins, extend_dyn_up, extend_dyn_down,
              rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None, ncpus=4):
    # `datafiles` and `ncpus` were free names in the original snippet; they are
    # assumed here to be a list of data files and a worker count, respectively.
    if guard is None:
        guard = []
    # Calculate the profile data
    data = {}
    regions = []
    print("Loading data")
    try:
        # Load data in parallel: one load_heatmap_data job per data file
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(pool.apply_async(
                load_heatmap_data,
                args=(featurefile, datafile, amount_bins, extend_dyn_up,
                      extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                      dynam, guard)))
        pool.close()
        for job in jobs:
            track, regions, profile, guard = job.get()
            data[track] = profile
    except Exception as e:
        # Any failure in the pool triggers a serial fallback
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in datafiles:
            track, regions, profile, guard = load_heatmap_data(
                featurefile, datafile, amount_bins, extend_dyn_up,
                extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                dynam, guard)
            data[track] = profile
    return data, regions, guard
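A minimal call sketch, assuming the parameterized signature above; the file names are hypothetical placeholders:

data, regions, guard = load_data(
    "peaks.bed",                      # hypothetical feature file (BED)
    ["sample1.bam", "sample2.bam"],   # hypothetical data files
    amount_bins=100,
    extend_dyn_up=5000,
    extend_dyn_down=5000,
    rmdup=True,
    rpkm=False,
    rmrepeats=True,
    fragmentsize=200,
    dynam=False,
    ncpus=4,
)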
Example #2
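This method belongs to a class that provides `self.regions`, `self.data_dir`, and `self.region_type`. It writes the regions to a temporary BED file, scores each BAM over a fixed window with `load_heatmap_data`, normalizes (quantile normalization against a saved reference distribution if one exists, otherwise a log1p transform), and averages the per-BAM scores into a single column named after `title`.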
    def _load_bams(self, bams, title, window=200):
        # Assumes module-level imports: os, re, numpy as np, pandas as pd,
        # qnorm, NamedTemporaryFile (tempfile), scale (sklearn.preprocessing),
        # load_heatmap_data, and a configured logger.
        tmp = pd.DataFrame(index=self.regions)
        with NamedTemporaryFile(mode="w") as f_out:
            # Write the regions ("chrom:start-end") as a temporary BED file
            for region in self.regions:
                print("{}\t{}\t{}".format(*re.split("[:-]", region)),
                      file=f_out)
            f_out.flush()

            # Score every BAM over a single bin of `window` bp per region;
            # load_heatmap_data returns (track, regions, profile, guard)
            for bam in bams:
                result = load_heatmap_data(
                    f_out.name,
                    bam,
                    bins=1,
                    up=window // 2,
                    down=window // 2,
                    rmdup=True,
                    rmrepeats=True,
                )
                tmp[result[0]] = result[2].T[0]

        fname = f"{self.data_dir}/{title}.qnorm.ref.txt.gz"
        if os.path.exists(fname):
            # Quantile normalize against a stored reference distribution,
            # resampling the reference if the number of regions differs
            logger.debug(f"quantile normalization for {title}")
            qnorm_ref = pd.read_table(fname, index_col=0)["qnorm_ref"].values
            if len(self.regions) != len(qnorm_ref):
                qnorm_ref = np.random.choice(qnorm_ref,
                                             size=len(self.regions),
                                             replace=True)
            tmp = qnorm.quantile_normalize(tmp, target=qnorm_ref)
        else:
            tmp = np.log1p(tmp)

        # Average over BAMs; limit memory usage by using float16
        tmp = tmp.mean(1).astype("float16").to_frame(title)

        fname = f"{self.data_dir}/{title}.mean.ref.txt.gz"
        if self.region_type == "reference" and os.path.exists(fname):
            # Express the signal relative to the reference mean, then standardize
            mean_ref = pd.read_table(fname, index_col=0)
            if mean_ref.shape[0] == tmp.shape[0]:
                mean_ref.index = tmp.index
                tmp[f"{title}.relative"] = (
                    tmp[title] - mean_ref.loc[tmp.index]["mean_ref"].values)
                tmp[f"{title}.relative"] = scale(tmp[f"{title}.relative"])
            else:
                logger.debug(
                    f"Regions of {fname} are not the same as input regions.")
                logger.debug("Skipping calculation of relative values.")

        # Scale the absolute signal to a 0-1 range
        tmp[title] = tmp[title] / tmp[title].max()

        return tmp
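Quantile normalization against a precomputed reference makes the coverage values comparable across samples sequenced at different depths, while the log1p fallback at least compresses the dynamic range when no reference is available; casting to float16 after averaging trades precision for memory, which matters when many region sets are held at once.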
Example #3
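`coverage_table` builds a region-by-sample coverage matrix: it validates the input files (indexing BAMs on the fly with pysam), loads one single-bin profile per data file in parallel with the same serial fallback as above, then optionally log-transforms, normalizes, and keeps only the `top` regions by the chosen statistic.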
import logging
import multiprocessing
import os
import sys

import numpy as np
import pandas as pd
import pysam
import qnorm
from sklearn.preprocessing import scale
from tqdm import tqdm

from fluff.fluffio import load_heatmap_data

logger = logging.getLogger(__name__)


def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if x.endswith(".bam") and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file."
                  " Creating an index file for {0}.".format(x))
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel: a single bin of `window` bp per region
        # (positional args map to bins=1, up, down, rmdup, rpkm=False,
        # rmrepeats, fragmentsize=None, dynam=False, guard=None)
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(
                        peakfile,
                        datafile,
                        1,
                        window // 2,
                        window // 2,
                        rmdup,
                        False,
                        rmrepeats,
                        None,
                        False,
                        None,
                    ),
                ))
        pool.close()
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        # Serial fallback if the pool fails
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile,
                datafile,
                1,
                window // 2,
                window // 2,
                rmdup,
                False,
                rmrepeats,
                None,
                False,
                None,
            )
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    elif normalization == "quantile":
        # elif, so that scaling does not also fall through to "No normalization"
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        # Keep the `top` regions ranked by the chosen statistic across samples
        if topmethod == "var":
            idx = df.var(axis=1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(axis=1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(axis=1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError(
                "unknown method {} for selecting regions".format(topmethod))
        df = df.loc[idx]
    return df
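A minimal usage sketch with hypothetical file names, assuming a BED peak file and indexed BAM files:

df = coverage_table(
    "peaks.bed",          # hypothetical BED file of regions
    ["a.bam", "b.bam"],   # hypothetical indexed BAM files
    window=200,
    log_transform=True,
    normalization="quantile",
    top=1000,
    topmethod="var",
)
print(df.head())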