Example #1
    def test_011_axis_pandas(self):
        """
        test pandas axis support
        """
        df = pd.DataFrame({
            "C1": {
                "A": 5.0,
                "B": 2.0,
                "C": 3.0,
                "D": 4.0
            },
            "C2": {
                "A": 4.0,
                "B": 1.0,
                "C": 4.0,
                "D": 2.0
            },
            "C3": {
                "A": 3.0,
                "B": 4.0,
                "C": 6.0,
                "D": 8.0
            },
        })

        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(df.T, axis=0).T,
            qnorm.quantile_normalize(df, axis=1),
        )
        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(df, axis=1),
            qnorm.quantile_normalize(df.T, axis=0).T,
        )
Example #2
 def test_001_pandas(self):
     """
     test pandas support
     """
     df = pd.DataFrame({
         "C1": {
             "A": 5.0,
             "B": 2.0,
             "C": 3.0,
             "D": 4.0
         },
         "C2": {
             "A": 4.0,
             "B": 1.0,
             "C": 4.0,
             "D": 2.0
         },
         "C3": {
             "A": 3.0,
             "B": 4.0,
             "C": 6.0,
             "D": 8.0
         },
     })
     qnorm.quantile_normalize(df)
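
The snippets on this page are method and function excerpts and omit their imports. A minimal self-contained version of the call above, assuming only that pandas and qnorm are installed under their conventional aliases:

import pandas as pd
import qnorm

# Rows are features, columns are samples (qnorm's default, axis=1);
# each column is mapped onto a shared reference distribution.
df = pd.DataFrame({
    "C1": {"A": 5.0, "B": 2.0, "C": 3.0, "D": 4.0},
    "C2": {"A": 4.0, "B": 1.0, "C": 4.0, "D": 2.0},
    "C3": {"A": 3.0, "B": 4.0, "C": 6.0, "D": 8.0},
})
print(qnorm.quantile_normalize(df))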
Example #3
    def test_010_axis_numpy(self):
        """
        test numpy axis support
        """
        arr = np.random.normal(size=(50, 4))

        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(arr.T, axis=0).T,
            qnorm.quantile_normalize(arr, axis=1),
        )
        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(arr, axis=1),
            qnorm.quantile_normalize(arr.T, axis=0).T,
        )
Example #4
    def test_021_from_hdf_largefile(self):
        """
        test whether or not incremental_quantile_normalize works with a larger
        random file
        """
        np.random.seed(42)
        df1 = pd.DataFrame(
            index=range(5000),
            columns=["sample" + str(col) for col in range(100)],
            dtype=int,
        )
        df1[:] = np.random.randint(0, 100, size=df1.shape)
        df1.to_hdf("test_large.hdf",
                   key="qnorm",
                   format="table",
                   data_columns=True)

        qnorm.incremental_quantile_normalize(
            "test_large.hdf",
            "test_large_out.hdf",
            rowchunksize=11,
            colchunksize=11,
        )
        df2 = pd.read_hdf("test_large_out.hdf", index_col=0, header=0)

        np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                       df2.values,
                                       decimal=4)
Example #5
    def test_027_from_parquet_largefile(self):
        """
        test whether or not incremental_quantile_normalize works with a larger
        random file
        """
        np.random.seed(42)
        df1 = pd.DataFrame(
            index=range(5000),
            columns=["sample" + str(col) for col in range(100)],
        )
        df1[:] = np.random.randint(0, 100, size=df1.shape)
        df1 = df1.astype(float)
        df1.to_parquet("test_large.parquet")

        qnorm.incremental_quantile_normalize(
            "test_large.parquet",
            "test_large_out.parquet",
            rowchunksize=11,
            colchunksize=11,
        )
        df2 = pd.read_parquet("test_large_out.parquet")

        np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                       df2.values,
                                       decimal=4)
Example #6
 def test_028(self):
     """
     Test another array, not just wiki example.
     """
     df = pd.DataFrame({
         "C1": {
             "A": 2.0,
             "B": 2.0,
             "C": 2.0,
             "D": 2.0,
             "E": 6.0,
             "F": 1.0,
         },
         "C2": {
             "A": 2.0,
             "B": 2.0,
             "C": 1.0,
             "D": 3.5,
             "E": 5.0,
             "F": 1.0,
         },
     })
     np.testing.assert_almost_equal(
         qnorm.quantile_normalize(df).values,
         np.array([
             [2.0625, 2.0],
             [2.0625, 2.0],
             [2.0625, 1.25],
             [2.0625, 2.75],
             [5.5, 5.5],
             [1.0, 1.25],
         ]),
     )
Example #7
    def test_002_wiki(self):
        """
        test the wiki example
        https://en.wikipedia.org/wiki/Quantile_normalization
        """
        df = pd.DataFrame({
            "C1": {
                "A": 5.0,
                "B": 2.0,
                "C": 3.0,
                "D": 4.0
            },
            "C2": {
                "A": 4.0,
                "B": 1.0,
                "C": 4.0,
                "D": 2.0
            },
            "C3": {
                "A": 3.0,
                "B": 4.0,
                "C": 6.0,
                "D": 8.0
            },
        })

        result = np.array([
            [5.66666667, 5.16666667, 2.0],
            [2.0, 2.0, 3.0],
            [3.0, 5.16666667, 4.66666667],
            [4.66666667, 3.0, 5.66666667],
        ])

        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(df).values, result)
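
For reference, the expected matrix can be reproduced by hand. A sketch of the arithmetic (not qnorm's implementation; scipy's average-rank method appears to match how the tie in C2 is resolved, judging by the expected 5.16666667 values):

import numpy as np
from scipy.stats import rankdata

# Quantile normalization by hand: sort each column, average across
# columns to get one reference value per rank, then hand the
# reference values back out along each column's original ranks.
X = np.array([[5., 4., 3.],
              [2., 1., 4.],
              [3., 4., 6.],
              [4., 2., 8.]])
rank_means = np.sort(X, axis=0).mean(axis=1)  # [2.0, 3.0, 4.667, 5.667]
ranks = rankdata(X, axis=0)                   # the tied 4.0s in C2 get rank 3.5
result = np.interp(ranks, np.arange(1, 5), rank_means)
# rank 3.5 interpolates to (4.667 + 5.667) / 2 = 5.167, matching
# the expected array above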
Example #8
    def test_009_wiki_ncpus(self):
        """
        test that quantile normalization with multiple cpus (ncpus=10)
        reproduces the wiki example result
        """
        df = pd.DataFrame({
            "C1": {
                "A": 5.0,
                "B": 2.0,
                "C": 3.0,
                "D": 4.0
            },
            "C2": {
                "A": 4.0,
                "B": 1.0,
                "C": 4.0,
                "D": 2.0
            },
            "C3": {
                "A": 3.0,
                "B": 4.0,
                "C": 6.0,
                "D": 8.0
            },
        })

        result = np.array([
            [5.66666667, 5.16666667, 2.0],
            [2.0, 2.0, 3.0],
            [3.0, 5.16666667, 4.66666667],
            [4.66666667, 3.0, 5.66666667],
        ])

        np.testing.assert_array_almost_equal(
            qnorm.quantile_normalize(df, ncpus=10).values, result)
Example #9
 def test_005_single(self):
     """
     if the input dtype is float32 (single precision), the output
     dtype should be float32 as well
     """
     arr = np.random.normal(0, 1, size=(20, 3))
     arr = arr.astype(np.float32)
     qnorm_arr = qnorm.quantile_normalize(arr)
     assert qnorm_arr.dtype == np.float32
Example #10
    def test_017_from_hdf(self):
        """
        test the basic incremental_quantile_normalize functionality
        """
        qnorm.incremental_quantile_normalize("test.hdf", "test_out.hdf")
        df1 = pd.read_hdf("test.hdf", index_col=0, header=0)
        df2 = pd.read_hdf("test_out.hdf", index_col=0, header=0)

        np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                       df2.values,
                                       decimal=5)
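
incremental_quantile_normalize processes the input in chunks rather than loading the whole table, which is what the chunk-size tests on this page vary. A minimal usage sketch; both file names are hypothetical, and as in these tests the output format follows the input extension:

import qnorm

# Normalize a table too large to hold in memory, processing it in
# blocks of 1000 rows by 10 columns.
qnorm.incremental_quantile_normalize(
    "expression.csv",
    "expression_qnorm.csv",
    rowchunksize=1000,
    colchunksize=10,
)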
Example #11
 def test_006_target(self):
     """
     test if the target is used instead of the qnorm values
     """
     arr = np.array([np.arange(0, 10), np.arange(0, 10)]).T
     np.random.shuffle(arr)
     target = np.arange(10, 20)
     qnorm_arr = qnorm.quantile_normalize(arr, target=target)
     for val in target:
         assert (val in qnorm_arr[:, 0] and val
                 in qnorm_arr[:, 1]), f"value {val} not in qnorm array"
Example #12
    def _load_bams(self, bams, title, window=200):
        tmp = pd.DataFrame(index=self.regions)
        with NamedTemporaryFile(mode="w") as f_out:

            for region in self.regions:
                print("{}\t{}\t{}".format(*re.split("[:-]", region)),
                      file=f_out)
            f_out.flush()

            for bam in bams:
                result = load_heatmap_data(
                    f_out.name,
                    bam,
                    bins=1,
                    up=window // 2,
                    down=window // 2,
                    rmdup=True,
                    rmrepeats=True,
                )
                tmp[result[0]] = result[2].T[0]

        fname = f"{self.data_dir}/{title}.qnorm.ref.txt.gz"
        if os.path.exists(fname):
            logger.debug(f"quantile normalization for {title}")
            qnorm_ref = pd.read_table(fname, index_col=0)["qnorm_ref"].values
            if len(self.regions) != len(qnorm_ref):
                qnorm_ref = np.random.choice(qnorm_ref,
                                             size=len(self.regions),
                                             replace=True)

            tmp = qnorm.quantile_normalize(tmp, target=qnorm_ref)
        else:
            tmp = np.log1p(tmp)

        # Limit memory usage by using float16
        tmp = tmp.mean(1).astype("float16").to_frame(title)

        fname = f"{self.data_dir}/{title}.mean.ref.txt.gz"
        if self.region_type == "reference" and os.path.exists(fname):
            mean_ref = pd.read_table(fname, index_col=0)
            if mean_ref.shape[0] == tmp.shape[0]:
                mean_ref.index = tmp.index
                tmp[f"{title}.relative"] = (
                    tmp[title] - mean_ref.loc[tmp.index]["mean_ref"].values)
                tmp[f"{title}.relative"] = scale(tmp[f"{title}.relative"])
            else:
                logger.debug(
                    f"Regions of {fname} are not the same as input regions.")
                logger.debug("Skipping calculation of relative values.")

        tmp[title] = tmp[title] / tmp[title].max()

        return tmp
Example #13
 def test_007_target_notsorted(self):
     """
     make sure an unsorted target gets sorted first
     """
     arr = np.array([np.arange(0, 10), np.arange(0, 10)]).T
     np.random.shuffle(arr)
     # take the reverse, which should be sorted by qnorm
     target = np.arange(10, 20)[::-1]
     qnorm_arr = qnorm.quantile_normalize(arr, target=target)
     for val in target:
         assert (val in qnorm_arr[:, 0] and val
                 in qnorm_arr[:, 1]), f"value {val} not in qnorm array"
Example #14
    def test_003_no_change(self):
        """
        no sorting should happen here
        """
        arr = np.empty(shape=(20, 3))
        for col in range(arr.shape[1]):
            vals = np.arange(arr.shape[0])
            np.random.shuffle(vals)
            arr[:, col] = vals

        qnorm_arr = qnorm.quantile_normalize(arr)
        np.testing.assert_array_almost_equal(arr, qnorm_arr)
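
The reason nothing changes here: every column is a permutation of the same values, so the sorted columns are identical and each per-rank reference mean equals the value itself. A quick check of that claim:

import numpy as np

# The reference distribution (row means of the sorted columns) is
# exactly 0..19, so mapping it back by rank reproduces the input.
vals = np.arange(20)
cols = np.stack([np.random.permutation(vals) for _ in range(3)], axis=1)
assert (np.sort(cols, axis=0).mean(axis=1) == vals).all()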
Example #15
    def test_023_from_parquet(self):
        """
        test the basic incremental_quantile_normalize functionality
        """
        qnorm.incremental_quantile_normalize("test.parquet",
                                             "test_out.parquet")
        df1 = pd.read_parquet("test.parquet")
        df2 = pd.read_parquet("test_out.parquet")

        np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                       df2.values,
                                       decimal=5)
Example #16
    def extend(self, outdir: str, data_files: List[str]) -> T:

        if self.schema_version == "0.0.0":
            raise ValueError("dataset does not support custom sources")

        outdir = Path(outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        meanstd = pd.read_table(self.meanstd_file)
        bed = meanstd["index"].str.replace("[:-]", "\t").to_frame()

        logger.info("Processing BAM files")
        with NamedTemporaryFile() as f:
            bed.to_csv(f.name, index=False, header=False)

            # create coverage_table
            df = coverage_table(peakfile=f.name,
                                datafiles=data_files,
                                window=self.window,
                                ncpus=12)
            target = np.load(self.target_file)["target"]
            df = qnorm.quantile_normalize(df, target=target)
            df.index = meanstd["index"]
            df = df.sub(meanstd["mean"].values, axis=0)
            df = df.div(meanstd["std"].values, axis=0)

        genes = _create_gene_table(
            df,
            self.meanstd_file,
            self.gene_file,
            self.gene_mapping,
            genome=self.genome,
            link_file=self.link_file,
        )
        logger.info(f"Writing reference to {outdir}")

        df.reset_index().to_feather(outdir / "enhancers.feather")
        genes.to_csv(outdir / "genes.txt", sep="\t")

        info = {
            "genes": "genes.txt",
            "enhancers": "enhancers.feather",
            "source": self.name,
            "genome": self.genome,
            "schema_version": __schema_version__,
        }

        with open(outdir / "info.yaml", "w") as f:
            yaml.dump(info, f)

        return ScepiaDataset(outdir)
Example #17
    def test_013_from_csv_rowchunk(self):
        """
        test the incremental_quantile_normalize with rowchunks functionality
        """
        df1 = pd.read_csv("test.csv", index_col=0, header=0)

        for rowchunksize in range(1, 10):
            qnorm.incremental_quantile_normalize("test.csv",
                                                 "test_out.csv",
                                                 rowchunksize=rowchunksize)
            df2 = pd.read_csv("test_out.csv", index_col=0, header=0)

            np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                           df2.values,
                                           decimal=5)
Example #18
    def test_019_from_hdf_colchunk(self):
        """
        test the incremental_quantile_normalize with colchunks functionality
        """
        df1 = pd.read_hdf("test.hdf", index_col=0, header=0)

        for colchunksize in range(1, 10):
            qnorm.incremental_quantile_normalize("test.hdf",
                                                 "test_out.hdf",
                                                 colchunksize=colchunksize)
            df2 = pd.read_hdf("test_out.hdf", index_col=0, header=0)

            np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                           df2.values,
                                           decimal=5)
Example #19
def tpm_normalization(
        tpms: pd.DataFrame,
        column_order: list,
        minimum_value: int = None,
) -> pd.DataFrame:
    """filter and order a tpm table, then quantile normalize and log transform"""
    bc = tpms[column_order]                       # filter & order samples
    if minimum_value:
        b4 = bc.shape[0]
        bc = bc[bc.max(axis=1) >= minimum_value]  # filter genes
        aft = b4 - bc.shape[0]
        print(f"Genes with TPM below {minimum_value}: {aft} of {b4} ({round(100*aft/b4,0)}%)")
    bc = quantile_normalize(bc, axis=1)           # normalize
    bc = np.log2(bc+1)                            # transform
    return bc
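
A hypothetical call, assuming numpy, pandas and quantile_normalize (via "from qnorm import quantile_normalize") are in scope, since the excerpt omits its imports: keep three samples in a fixed order and drop genes whose TPM never reaches 1.

import pandas as pd

tpms = pd.DataFrame(
    {"s1": [0.1, 5.0, 3.0], "s2": [0.2, 4.0, 2.0], "s3": [0.0, 6.0, 1.0]},
    index=["geneA", "geneB", "geneC"],
)
# geneA is dropped (max TPM 0.2 < 1); the remaining genes are
# quantile normalized across samples and log2(x + 1) transformed.
log_qn = tpm_normalization(tpms, column_order=["s1", "s2", "s3"], minimum_value=1)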
Example #20
    def test_025_from_parquet_colchunk(self):
        """
        test the incremental_quantile_normalize with colchunks functionality
        """
        df1 = pd.read_parquet("test.parquet")

        for colchunksize in range(1, 10):
            qnorm.incremental_quantile_normalize("test.parquet",
                                                 "test_out.parquet",
                                                 colchunksize=colchunksize)
            df2 = pd.read_parquet("test_out.parquet")

            np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                           df2.values,
                                           decimal=5)
Example #21
    def peaks_merge(coverage_files, bed_output, ncore=1):
        """
        averages all peaks_count outputs
        uses quantile normalization to normalize for read depth
        returns one BED 3+1 file
        """
        ncore = min(4, ncore)
        bed = pd.read_csv(coverage_files[0], header=None, sep="\t")
        if len(coverage_files) > 1:
            for file in coverage_files[1:]:
                scores = pd.read_csv(file, header=None, sep="\t")[3]
                bed = pd.concat([bed, scores], axis=1)

            scores = bed.iloc[:, 3:]
            scores = qnorm.quantile_normalize(scores, axis=1, ncpus=ncore)
            scores = scores.mean(axis=1)

            bed = pd.concat([bed.iloc[:, :3], scores], axis=1)
        bed.to_csv(bed_output, sep="\t", header=False, index=False)
Example #22
    def test_020_from_hdf_colrowchunk(self):
        """
        test the incremental_quantile_normalize with both row and colchunks
        """
        df1 = pd.read_hdf("test.hdf", index_col=0, header=0)

        for colchunksize in range(1, 10):
            for rowchunksize in range(1, 10):
                qnorm.incremental_quantile_normalize(
                    "test.hdf",
                    "test_out.hdf",
                    rowchunksize=rowchunksize,
                    colchunksize=colchunksize,
                )
                df2 = pd.read_hdf("test_out.hdf", index_col=0, header=0)

                np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                               df2.values,
                                               decimal=5)
Example #23
    def test_026_from_parquet_colrowchunk(self):
        """
        test the incremental_quantile_normalize with both row and colchunks
        """
        df1 = pd.read_parquet("test.parquet")

        for colchunksize in range(1, 10):
            for rowchunksize in range(1, 10):
                qnorm.incremental_quantile_normalize(
                    "test.parquet",
                    "test_out.parquet",
                    rowchunksize=rowchunksize,
                    colchunksize=colchunksize,
                )
                df2 = pd.read_parquet("test_out.parquet")

                np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                               df2.values,
                                               decimal=5)
Example #24
def main():
    """Console script for qnorm."""
    parser = argparse.ArgumentParser(
        description="Quantile normalization from the CLI!")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"qnorm: v{qnorm.__version__}",
    )
    parser.add_argument(
        "table", help="input csv/tsv file which will be quantile normalized")
    args = parser.parse_args()

    delimiter = get_delim(args.table)

    df = pd.read_csv(args.table, index_col=0, sep=delimiter, comment="#")
    qnorm_df = qnorm.quantile_normalize(df)

    print(qnorm_df.to_csv(sep=delimiter))
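
The same flow is easy to reproduce programmatically. A minimal sketch of what main() does for a comma-delimited table (the file name is hypothetical; get_delim is the helper the script uses to sniff the separator):

import pandas as pd
import qnorm

# Read a table with its first column as the index, quantile
# normalize, and print CSV back out -- mirroring the CLI body above.
df = pd.read_csv("table.csv", index_col=0, comment="#")
print(qnorm.quantile_normalize(df).to_csv())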
Example #25
    def test_016_from_csv_largefile(self):
        """
        test whether or not incremental_quantile_normalize works with a larger
        random file
        """
        np.random.seed(42)
        df1 = pd.DataFrame(index=range(5000), columns=range(100))
        df1[:] = np.random.randint(0, 100, size=df1.shape)
        df1.to_csv("test_large.csv")

        qnorm.incremental_quantile_normalize(
            "test_large.csv",
            "test_large_out.csv",
            rowchunksize=11,
            colchunksize=11,
        )
        df2 = pd.read_csv("test_large_out.csv", index_col=0, header=0)

        np.testing.assert_almost_equal(qnorm.quantile_normalize(df1),
                                       df2.values,
                                       decimal=4)
Example #26
import pandas as pd
import qnorm


df = pd.read_csv(snakemake.input[0], comment="#", index_col=0, sep="\t")

# cpm normalization
df = df * 1_000_000 / df.sum(axis=0)

# quantile normalize
df_qn = qnorm.quantile_normalize(df)
with open(str(snakemake.output[0]), "w") as f:
    f.write(
        "# The number of reads under each peak, cpm quantile normalized\n"
        + df_qn.to_csv(index_label="loc", index=True, header=True, sep="\t")
    )
Example #27
        if plt_x_ax < 4:
            plt_x_ax = plt_x_ax + 1
        elif plt_x_ax == 4:
            plt_x_ax = 0
            plt_y_ax = plt_y_ax + 1

    fig.savefig(f"{exp_path}/Distribution_graphs_notNorm.pdf")

# #### Quantile normalization on samples per mark

# In[21]:

dict_norm_dfs = {}
for mark in marks:
    df = dict_of_dfs[mark]
    norm_df = qnorm.quantile_normalize(df, axis=1, ncpus=20)
    dict_norm_dfs[mark] = norm_df

# In[22]:

warnings.filterwarnings("ignore")

plt.style.use("seaborn")
sns.set(rc={"figure.figsize": (23, 16)})
fig, axes = plt.subplots(3, 5)

plt_x_ax = 0
plt_y_ax = 0

for mark in marks:
    samples = list(dict_norm_dfs[mark])
Example #28
def normalize(exp: pd.DataFrame, transpose: bool = False) -> pd.DataFrame:
    if transpose:
        exp = exp.transpose()
    exp = pd.DataFrame(np.log2(exp + 1))
    exp = qnorm.quantile_normalize(exp)
    return exp
Example #29
def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if ".bam" in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file."
                  " Creating an index file for {0}.".format(x))
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(
                        peakfile,
                        datafile,
                        1,
                        window // 2,
                        window // 2,
                        rmdup,
                        False,
                        rmrepeats,
                        None,
                        False,
                        None,
                    ),
                ))
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile,
                datafile,
                1,
                window // 2,
                window // 2,
                rmdup,
                False,
                rmrepeats,
                None,
                False,
                None,
            )
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    if normalization == "quantile":
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        if topmethod == "var":
            idx = df.var(1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError(
                "unknown method {} for selecting regions".format(topmethod))
        df = df.loc[idx]
    return df
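
A hypothetical invocation, based only on the signature above: count reads in 200 bp windows around the regions in a BED file for two BAM files and quantile normalize the result.

# peaks.bed, s1.bam and s2.bam are placeholder paths.
df = coverage_table(
    peakfile="peaks.bed",
    datafiles=["s1.bam", "s2.bam"],
    window=200,
    normalization="quantile",
)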
Example #30
 def test_000_numpy(self):
     """
     test numpy support
     """
     arr = np.random.normal(size=(20, 2))
     qnorm.quantile_normalize(arr)