예제 #1
0
def test_compress():
    """Round-trip ``data_df`` through ``output`` while requesting gzip.

    Two filename shapes are exercised:

    1. A plain ``.csv`` target. The written file is read back from the same
       path with ``.gz`` appended.
    2. A ``.csv.bz2`` target, still passing ``compression="gzip"``. The written
       file is read back from the path exactly as given.

    In both cases the frame read back must match ``data_df`` (column/index
    names ignored, reduced float precision).
    """
    write_opts = {"compression": "gzip", "float_format": None}
    compare_opts = {"check_names": False, "check_less_precise": 1}

    # Case 1: ".csv" target, read back from "<target>.gz"
    csv_path = os.path.join(tmpdir, "test_compress.csv")
    output(df=data_df, output_filename=csv_path, **write_opts)
    reloaded = pd.read_csv("{}.gz".format(csv_path))
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)

    # Case 2: the filename already carries a ".bz2" suffix; the file must be
    # readable from that exact path even though gzip was requested.
    bz2_path = os.path.join(tmpdir, "test_compress.csv.bz2")
    output(df=data_df, output_filename=bz2_path, **write_opts)
    reloaded = pd.read_csv(bz2_path)
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)
예제 #2
0
def test_compress_no_csv():
    """Round-trip ``data_df`` through ``output`` for two more filename shapes.

    1. A bare path with no extension and ``compression="gzip"``. The written
       file is read back from that path with ``.csv.gz`` appended.
    2. A ``.tsv.bz2`` path, still passing ``compression="gzip"``. The written
       file is read back from the given path using a tab separator.

    In both cases the frame read back must match ``data_df`` (column/index
    names ignored, reduced float precision).
    """
    write_opts = {"compression": "gzip", "float_format": None}
    compare_opts = {"check_names": False, "check_less_precise": 1}

    # Case 1: naked file stem, expected on disk as "<stem>.csv.gz"
    bare_path = os.path.join(tmpdir, "test_compress")
    output(df=data_df, output_filename=bare_path, **write_opts)
    reloaded = pd.read_csv("{}.csv.gz".format(bare_path))
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)

    # Case 2: tab-separated target, parsed back with sep="\t"
    tsv_path = os.path.join(tmpdir, "test_compress.tsv.bz2")
    output(df=data_df, output_filename=tsv_path, **write_opts)
    reloaded = pd.read_csv(tsv_path, sep="\t")
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)
예제 #3
0
def test_compress_exception():
    """Check that ``output`` raises for an unrecognised ``compression`` value.

    The exception message must contain the text "not supported".
    """
    target = os.path.join(tmpdir, "test_compress_warning.csv.zip")
    bad_call = dict(
        df=data_df,
        output_filename=target,
        compression="not an option",
    )

    with pytest.raises(Exception) as exc_info:
        output(**bad_call)

    message = str(exc_info.value)
    assert "not supported" in message
예제 #4
0
def test_output_none():
    """Write ``data_df`` with ``compression_options=None`` and read it back.

    The file is read from the exact ``.csv`` path passed to ``output`` and
    must match ``data_df`` (column/index names ignored, reduced float
    precision).
    """
    plain_csv = os.path.join(tmpdir, "test_output_none.csv")

    output(
        df=data_df,
        output_filename=plain_csv,
        compression_options=None,
        float_format=None,
    )

    reloaded = pd.read_csv(plain_csv)
    compare_opts = {"check_names": False, "check_less_precise": 1}
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)
예제 #5
0
def test_compress_tsv():
    """Write ``data_df`` tab-separated to a ``.tsv.gz`` path and read it back.

    Uses the module-level ``compression_options`` and an explicit ``sep="\\t"``
    on both the write and the read.
    """
    tab = "\t"
    tsv_path = os.path.join(tmpdir, "test_compress.tsv.gz")

    output(
        df=data_df,
        sep=tab,
        output_filename=tsv_path,
        compression_options=compression_options,
        float_format=None,
    )

    reloaded = pd.read_csv(tsv_path, sep=tab)
    compare_opts = {"check_names": False, "check_less_precise": 1}
    pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)
예제 #6
0
def test_compress_warning():
    """Check that a ``.csv.zip`` target with ``compression="gzip"`` warns once.

    Exactly one warning must be recorded while ``output`` runs, and it must be
    a ``UserWarning``. The written file must still be readable from the given
    path and match ``data_df``.
    """
    with pytest.warns(UserWarning) as caught:
        # Make sure repeated warnings are not suppressed by earlier filters.
        warnings.simplefilter("always")

        zip_path = os.path.join(tmpdir, "test_compress_warning.csv.zip")
        output(
            df=data_df,
            output_filename=zip_path,
            compression="gzip",
            float_format=None,
        )

        # Checked before reading the file back, so only warnings emitted by
        # the write are counted.
        assert len(caught) == 1
        latest = caught[-1]
        assert issubclass(latest.category, UserWarning)

        reloaded = pd.read_csv(zip_path)
        compare_opts = {"check_names": False, "check_less_precise": 1}
        pd.testing.assert_frame_equal(reloaded, data_df, **compare_opts)
예제 #7
0
def test_compress_no_timestamp():
    """Check how write time affects the bytes ``output`` produces.

    * With default options, two writes made two seconds apart must produce
      byte-identical buffers.
    * With the module-level ``compression_options``, two writes made two
      seconds apart must produce different buffers.

    NOTE(review): presumably ``compression_options`` enables a timestamp in
    the compressed stream while the default omits it; confirm against
    ``output``.
    """

    def _write_to_bytes(**extra):
        # Serialise data_df into a fresh in-memory buffer and return its bytes.
        sink = io.BytesIO()
        output(df=data_df, output_filename=sink, float_format=None, **extra)
        return sink.getvalue()

    # Default behaviour: output is independent of when it was written.
    first_default = _write_to_bytes()
    time.sleep(2)  # simulate a different timestamp
    assert first_default == _write_to_bytes()

    # Explicit compression options: output depends on when it was written.
    first_custom = _write_to_bytes(compression_options=compression_options)
    time.sleep(2)  # simulate a different timestamp
    assert first_custom != _write_to_bytes(
        compression_options=compression_options
    )
예제 #8
0
def annotate(
    profiles,
    platemap,
    cell_id="unknown",
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    perturbation_mode="none",
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression=None,
    float_format=None,
):
    """
    Annotate profiles by merging them with platemap (and optional external) metadata

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    cell_id - [default: "unknown"] provide a string to annotate cell id column
              (only used if format_broad_cmap == True and no "Metadata_cell_id"
              column is already present)
    join_on - list of length two indicating which variables to merge profiles and plate
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using. The platemap join column is
              dropped after the merge. The list is only read, never modified.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - boolean if the platemap variables should be recoded
                                  (passed through to `load_platemap`)
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    perturbation_mode - [default: "none"] - either "chemical", "genetic" or "none" and only
                        active if format_broad_cmap == True
    external_metadata - [default: "none"] a pandas DataFrame, or a string indicating a
                        csv file, with additional metadata information. Columns that
                        do not already start with "Metadata_" receive that prefix
                        before merging. A DataFrame passed in is not modified.
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
                          (matched against the column names after the "Metadata_"
                          prefix has been applied)
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 significant digits.

    Return:
    Pandas DataFrame of annotated profiles if output_file == "none"; otherwise the
    annotated profiles are written to output_file and nothing is returned

    Raises:
    AssertionError - if format_broad_cmap is set and perturbation_mode is invalid or
                     "Metadata_broad_sample" is missing, or if the external metadata
                     file does not exist
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    # Keep only wells present in both tables; the platemap join column is
    # redundant after the merge and is dropped.
    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:

        pert_opts = ["none", "chemical", "genetic"]
        assert (perturbation_mode in pert_opts
                ), "perturbation mode must be one of {}".format(pert_opts)

        assert (
            "Metadata_broad_sample" in annotated.columns
        ), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

        annotated = annotated.assign(
            Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
                r"(BRD[-N][A-Z0-9]+)"),
            Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
            Metadata_pert_well=annotated.loc[:, join_on[1]],
            Metadata_pert_id_vendor="",
        )

        if "Metadata_pert_iname" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
                Metadata_pert_name=annotated.Metadata_pert_iname,
            )

        if "Metadata_cell_id" not in annotated.columns:
            annotated = annotated.assign(Metadata_cell_id=cell_id)

        if perturbation_mode == "chemical":
            # A well is a control when its sample is "DMSO" or missing.
            # pd.isna is checked first so that any missing-value marker is
            # recognised (a list membership test only matches the np.nan
            # singleton by identity) and so that comparison is never attempted
            # on a missing value.
            annotated = annotated.assign(Metadata_broad_sample_type=[
                "control" if (pd.isna(x) or x == "DMSO") else "trt"
                for x in annotated.Metadata_broad_sample
            ])

            # Generate Metadata_broad_sample column
            annotated.loc[annotated.Metadata_broad_sample_type == "control",
                          "Metadata_broad_sample"] = "DMSO"
            annotated.loc[annotated.Metadata_broad_sample == "empty",
                          "Metadata_broad_sample_type"] = "empty"

            # Controls carry no compound, so their concentrations are zeroed
            if "Metadata_mmoles_per_liter" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mmoles_per_liter"] = 0

            if "Metadata_solvent" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_pert_vehicle=annotated.Metadata_solvent)
            if "Metadata_mg_per_ml" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mg_per_ml"] = 0

        if perturbation_mode == "genetic":
            if "Metadata_pert_name" in annotated.columns:
                annotated = annotated.assign(Metadata_broad_sample_type=[
                    "control" if x == "EMPTY" else "trt"
                    for x in annotated.Metadata_pert_name
                ])

        if "Metadata_broad_sample_type" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_type=annotated.Metadata_broad_sample_type)
        else:
            annotated = annotated.assign(Metadata_pert_type="",
                                         Metadata_broad_sample_type="")

    # Load external metadata from file if a path (not a DataFrame) was given
    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    # Merge external metadata, if any
    if isinstance(external_metadata, pd.DataFrame):
        # Rename on a copy so a DataFrame supplied by the caller keeps its
        # original column names.
        external_metadata = external_metadata.copy()
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return annotated