Exemplo n.º 1
0
def separate(in_gct, separate_field, row_or_col):
    """ Split a GCT into one GCT per distinct value of a metadata field.

    Args:
        in_gct (GCToo object): GCT to split
        separate_field (string): metadata column whose unique values define
            the groups
        row_or_col (string): "row" to look separate_field up in the row
            metadata, "col" to look it up in the column metadata

    Returns:
        gcts (list of GCToo objects): one slice of in_gct per unique value
        unique_values_in_field (list): the unique values of separate_field,
            in the same order as gcts

    Raises:
        AssertionError: if separate_field is not a column of the selected
            metadata dataframe
        Exception: if row_or_col is neither "row" nor "col"

    """
    # Pick the metadata table to inspect and the slice_gctoo keyword that
    # selects along the matching axis.
    if row_or_col == "row":
        metadata_df = in_gct.row_metadata_df
        bool_kwarg = "row_bool"
        assert separate_field in metadata_df.columns, (
            ("separate_field must be in in_gct.row_metadata_df.columns. " +
             "separate_field: {}, in_gct.row_metadata_df.columns: {}").format(
                 separate_field, metadata_df.columns.values))

    elif row_or_col == "col":
        metadata_df = in_gct.col_metadata_df
        bool_kwarg = "col_bool"
        assert separate_field in metadata_df.columns, (
            ("separate_field must be in in_gct.col_metadata_df.columns. " +
             "separate_field: {}, in_gct.col_metadata_df.columns: {}").format(
                 separate_field, metadata_df.columns.values))

    else:
        raise Exception("row or col must be 'row' or 'col'.")

    field_values = metadata_df.loc[:, separate_field]
    unique_values_in_field = list(field_values.unique())

    # One boolean mask, and therefore one sliced GCT, per unique value
    gcts = [
        slice_gct.slice_gctoo(
            in_gct, **{bool_kwarg: field_values.values == val})
        for val in unique_values_in_field
    ]

    # Make sure each gct is associated with a value from separate_field
    assert len(gcts) == len(unique_values_in_field), (
        "len(gcts): {}, len(unique_values_in_field): {}".format(
            len(gcts), len(unique_values_in_field)))

    return gcts, unique_values_in_field
Exemplo n.º 2
0
def main(args):
    """Replace missing values in a GCT file and write the result.

    Rows whose values are all NaN are removed first. The remaining NaNs are
    then handled according to args.replace_with:
        "zero"   -> replaced with 0
        "median" -> replaced with the median of the row they sit in
        "mean"   -> replaced with the mean of the row they sit in
    Any other value leaves the remaining NaNs untouched.

    Args:
        args: object exposing in_gct_path, replace_with and out_name

    Raises:
        AssertionError: if args.in_gct_path does not exist
    """

    # Import data
    assert os.path.exists(
        args.in_gct_path), ("in_gct_path could not be found: {}").format(
            args.in_gct_path)
    in_gct = parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.slice_gctoo(in_gct, row_bool=bools_of_remaining)

    data_df = in_gct.data_df

    if args.replace_with == "zero":
        data_df.fillna(0, inplace=True)

    elif args.replace_with in ("median", "mean"):
        if args.replace_with == "median":
            row_fill_values = data_df.median(axis=1)
        else:
            row_fill_values = data_df.mean(axis=1)

        # row_fill_values is indexed by row label, so it must be read by
        # position (.iloc). Plain [row_idx] would be a label lookup when the
        # row ids are integers and would pick the wrong row or raise.
        for row_idx in range(data_df.shape[0]):
            data_df.iloc[row_idx, :] = data_df.iloc[row_idx, :].fillna(
                row_fill_values.iloc[row_idx])

    wg.write(in_gct, args.out_name, filler_null="NA")
Exemplo n.º 3
0
 def test_slice_cid_and_col_bool(self):
     """Passing both cid and col_bool must raise an AssertionError."""
     # The two column selectors are mutually exclusive
     conflicting_kwargs = dict(cid=["e", "f", "g"],
                               col_bool=[True, True, False])

     with self.assertRaises(AssertionError) as raised:
         slice_gct.slice_gctoo(IN_GCT, **conflicting_kwargs)

     self.assertIn("cid and col_bool", str(raised.exception))
Exemplo n.º 4
0
    def test_slice_and_exclude_rids(self):
        """rid and exclude_rid combine: excluded ids are dropped from rid."""
        out_gct = slice_gct.slice_gctoo(IN_GCT,
                                        rid=["a", "c", "d"],
                                        exclude_rid=["d"])

        # "d" is requested but also excluded, so the expected result is the
        # rows at positions 0 and 2 of IN_GCT with every column kept.
        # pd.testing is the supported home of assert_frame_equal;
        # pd.util.testing is deprecated and absent from pandas 2.x.
        pd.testing.assert_frame_equal(out_gct.data_df,
                                      IN_GCT.data_df.iloc[[0, 2], :])
        pd.testing.assert_frame_equal(
            out_gct.row_metadata_df, IN_GCT.row_metadata_df.iloc[[0, 2], :])
        pd.testing.assert_frame_equal(out_gct.col_metadata_df,
                                      IN_GCT.col_metadata_df)
Exemplo n.º 5
0
    def test_slice_bools(self):
        """Slicing with row_bool and col_bool masks keeps the True entries."""
        out_gct = slice_gct.slice_gctoo(IN_GCT,
                                        row_bool=[True, False, True, False],
                                        col_bool=[False, False, True])

        # Outputs should be dataframes even if there is only 1 index or column.
        # pd.testing is the supported home of assert_frame_equal;
        # pd.util.testing is deprecated and absent from pandas 2.x.
        pd.testing.assert_frame_equal(
            out_gct.data_df, pd.DataFrame(IN_GCT.data_df.iloc[[0, 2], 2]))
        pd.testing.assert_frame_equal(
            out_gct.row_metadata_df, IN_GCT.row_metadata_df.iloc[[0, 2], :])
        # A single selected column comes back from iloc as a Series, so it is
        # turned back into a one-row frame before comparing.
        pd.testing.assert_frame_equal(
            out_gct.col_metadata_df,
            pd.DataFrame(IN_GCT.col_metadata_df.iloc[2, :]).T)
Exemplo n.º 6
0
	def test_parse(self):
		"""Compare parse_gctx.parse output with in-memory slices of the mini GCToo.

		Covers parsing the whole file, subsetting by string rid/cid, by integer
		rid/cid (via a temporary gctx file), by ridx/cidx, and by mixed
		id/index arguments.
		"""
		# parse whole thing
		mg1 = mini_gctoo_for_testing.make()
		mg2 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx")

		assert_frame_equal(mg1.data_df, mg2.data_df)
		assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
		assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

		# test with string rid/cid
		test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33','LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
		test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
		mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
		mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
			rid = test_rids, cid = test_cids)
		assert_frame_equal(mg3.data_df, mg4.data_df)
		assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
		assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

		# first, make & write out temp version of mini_gctoo with int rids/cids
		new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
		int_indexed_data_df = new_mg.data_df.copy()
		int_indexed_data_df.index = range(0,6)
		int_indexed_data_df.columns = range(10,16)

		int_indexed_row_meta = new_mg.row_metadata_df.copy()
		int_indexed_row_meta.index = range(0,6)

		int_indexed_col_meta = new_mg.col_metadata_df.copy()
		int_indexed_col_meta.index = range(10,16)

		int_indexed_gctoo = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
			col_metadata_df = int_indexed_col_meta)

		write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

		# The temp file must not outlive the test, so everything between
		# writing and removing it runs under try/finally.
		try:
			# test with numeric (repr as string) rid/cid
			mg5 = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
				col_metadata_df = int_indexed_col_meta)
			mg5 = slice_gct.slice_gctoo(mg5, row_bool = [True, False, True, False, True, False],
				col_bool = [True, False, False, True, True, True])

			# name the axes so they can be compared with the parsed frames
			mg5.data_df.index.name = "rid"
			mg5.data_df.columns.name = "cid"

			mg5.row_metadata_df.index.name = "rid"
			mg5.row_metadata_df.columns.name = "rhd"

			mg5.col_metadata_df.index.name = "cid"
			mg5.col_metadata_df.columns.name = "chd"

			mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx", rid = [0, 2, 4],
				cid = [10,13,14,15], convert_neg_666=False)
		finally:
			os.remove("int_indexed_mini_gctoo.gctx")

		assert_frame_equal(mg5.data_df, mg6.data_df)
		assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
		assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

		# test with ridx/cidx
		mg7 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
		mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])

		assert_frame_equal(mg7.data_df, mg8.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

		# test with rid/cidx
		mg9 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cidx = [4])

		assert_frame_equal(mg7.data_df, mg9.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

		# test with ridx/cid
		mg10 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4],
			cid = ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

		assert_frame_equal(mg7.data_df, mg10.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)
Exemplo n.º 7
0
    def test_gctx_parsing(self):
        """Exercise parse() on the mini gctx file with its various options.

        Covers default parsing, convert_neg_666 on and off, subsetting by
        rid/cid and by ridx/cidx, metadata-only parsing, and make_multiindex.
        """
        gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"
        vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

        def check_same_frames(expected, actual):
            # Compare the data and both metadata frames, in that order
            for attr in ("data_df", "row_metadata_df", "col_metadata_df"):
                pandas_testing.assert_frame_equal(getattr(expected, attr),
                                                  getattr(actual, attr))

        # parse in gctx, no other arguments
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse(gctx_path)
        check_same_frames(mg1, mg2)

        # check convert_neg_666 worked correctly
        plate_ids = mg2.col_metadata_df["mfc_plate_id"]
        self.assertTrue(plate_ids.isnull().all())

        # parse w/o convert_neg_666
        mg2_alt = parse(gctx_path, convert_neg_666=False)
        alt_plate_ids = mg2_alt.col_metadata_df["mfc_plate_id"]
        self.assertFalse(alt_plate_ids.isnull().all())

        # parsing w/rids & cids specified
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            vehicle_id,
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse(gctx_path, rid=test_rids, cid=test_cids)
        check_same_frames(mg3, mg4)

        # parsing w/ridx & cidx specified
        mg5 = slice_gct.slice_gctoo(mg1, rid=[vehicle_id], cid=vehicle_id)
        mg6 = parse(gctx_path, ridx=[4], cidx=[4])
        check_same_frames(mg5, mg6)

        # parsing row metadata only
        mg7 = parse(gctx_path, row_meta_only=True)
        pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df)

        # parsing col metadata only
        mg8 = parse(gctx_path, col_meta_only=True)
        pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df)

        # parsing w/multiindex
        mg9 = parse(gctx_path, make_multiindex=True)
        self.assertTrue(mg9.multi_index_df is not None)