Пример #1
0
def data_with_tmp_annotations(ext: MatrixDataType, annotations_fixture=False):
    """Open the pbmc3k dataset and pair it with a local-file annotations object.

    A new temporary directory is created on every call. It is not removed
    here; its path is handed back to the caller.

    Args:
        ext: Selects which pbmc3k file is opened (``MatrixDataType.H5AD`` or
            ``MatrixDataType.CXG``).
        annotations_fixture: When truthy, the ``pbmc3k-annotations.csv``
            fixture is copied to ``test_annotations.csv`` inside the
            temporary directory.

    Returns:
        A ``(data, tmp_dir, annotations)`` tuple: the object returned by
        ``MatrixDataLoader.open``, the temporary directory path, and an
        ``AnnotationsLocalFile`` built on the annotations file path.

    Raises:
        KeyError: If ``ext`` is not one of the two supported data types.
    """
    tmp_dir = tempfile.mkdtemp()
    annotations_file = path.join(tmp_dir, "test_annotations.csv")
    if annotations_fixture:
        shutil.copyfile(f"{FIXTURES_ROOT}/pbmc3k-annotations.csv", annotations_file)

    # Map each supported data type to the matching pbmc3k file.
    dataset_by_type = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: f"{FIXTURES_ROOT}/pbmc3k.cxg",
    }
    locator = DataLocator(dataset_by_type[ext])

    server_settings = {
        "app__flask_secret_key": "secret",
        "single_dataset__obs_names": None,
        "single_dataset__var_names": None,
        "single_dataset__datapath": locator.path,
    }
    dataset_settings = {
        "embeddings__names": ["umap"],
        "presentation__max_categories": 100,
        "diffexp__lfc_cutoff": 0.01,
    }

    app_cfg = AppConfig()
    app_cfg.update_server_config(**server_settings)
    app_cfg.update_default_dataset_config(**dataset_settings)
    app_cfg.complete_config()

    loader = MatrixDataLoader(locator.abspath())
    opened = loader.open(app_cfg)
    local_annotations = AnnotationsLocalFile(None, annotations_file)
    return opened, tmp_dir, local_annotations
Пример #2
0
    def handle_embeddings(self):
        """Validate the embeddings-related settings of this configuration.

        Checks the types of ``embeddings__names`` (list) and
        ``embeddings__enable_reembedding`` (bool). When re-embedding is
        enabled, it additionally verifies that a configured single dataset
        is an H5AD file not opened in backed mode, and that the scanpy
        module can be obtained.

        Raises:
            ConfigurationError: If re-embedding is enabled and the single
                dataset is not H5AD, backed mode is on, or
                ``get_scanpy_module`` raises ``NotImplementedError``.
        """
        self.validate_correct_type_of_configuration_attribute(
            "embeddings__names", list)
        self.validate_correct_type_of_configuration_attribute(
            "embeddings__enable_reembedding", bool)

        server_config = self.app_config.server_config
        if not self.embeddings__enable_reembedding:
            return

        # The file-type and backed-mode checks only apply when a single
        # dataset path is configured.
        if server_config.single_dataset__datapath:
            matrix_data_loader = MatrixDataLoader(
                server_config.single_dataset__datapath,
                app_config=self.app_config)
            if matrix_data_loader.matrix_data_type != MatrixDataType.H5AD:
                raise ConfigurationError(
                    "enable-reembedding is only supported with H5AD files."
                )
            if server_config.adaptor__anndata_adaptor__backed:
                raise ConfigurationError(
                    "enable-reembedding is not supported when run in --backed mode."
                )

        try:
            get_scanpy_module()
        except NotImplementedError as e:
            # TODO: add scanpy to requirements.txt and remove this check once
            # re-embedding is fully supported.
            # Chain explicitly so the underlying failure is reported as the cause.
            raise ConfigurationError(
                "Please install scanpy to enable UMAP re-embedding") from e
Пример #3
0
def data_with_tmp_tiledb_annotations(ext: MatrixDataType):
    """Open the pbmc3k dataset and pair it with a hosted-TileDB annotations object.

    A new temporary directory is created on every call and used as the
    hosted file directory. It is not removed here; its path is handed back
    to the caller.

    Args:
        ext: Selects which pbmc3k file is opened (``MatrixDataType.H5AD`` or
            ``MatrixDataType.CXG``).

    Returns:
        A ``(data, tmp_dir, annotations)`` tuple: the object returned by
        ``MatrixDataLoader.open``, the temporary directory path, and an
        ``AnnotationsHostedTileDB`` built on that directory and a
        ``DbUtils`` instance.

    Raises:
        KeyError: If ``ext`` is not one of the two supported data types.
    """
    tmp_dir = tempfile.mkdtemp()

    # The same database URI is used for the config and for DbUtils below.
    db_uri = "postgresql://*****:*****@localhost:5432"

    dataset_by_type = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/fixtures/pbmc3k.cxg",
    }
    locator = DataLocator(dataset_by_type[ext])

    server_settings = {
        "app__flask_secret_key": "secret",
        "multi_dataset__dataroot": locator.path,
        "authentication__type": "test",
        "authentication__insecure_test_environment": True,
    }
    dataset_settings = {
        "embeddings__names": ["umap"],
        "presentation__max_categories": 100,
        "diffexp__lfc_cutoff": 0.01,
        "user_annotations__type": "hosted_tiledb_array",
        "user_annotations__hosted_tiledb_array__db_uri": db_uri,
        "user_annotations__hosted_tiledb_array__hosted_file_directory": tmp_dir,
    }

    app_cfg = AppConfig()
    app_cfg.update_server_config(**server_settings)
    app_cfg.update_default_dataset_config(**dataset_settings)
    app_cfg.complete_config()

    loader = MatrixDataLoader(locator.abspath())
    opened = loader.open(app_cfg)
    hosted_annotations = AnnotationsHostedTileDB(tmp_dir, DbUtils(db_uri))
    return opened, tmp_dir, hosted_annotations
Пример #4
0
    def handle_single_dataset(self, context):
        """Validate the single-dataset settings and announce the dataset load.

        Type-checks the ``single_dataset__*`` attributes (each must be a
        ``str`` or ``None``). If no data path is configured, nothing else
        happens. Otherwise the matrix data cache manager is created when
        missing, pre-load validation is run on the dataset, a loading
        message is emitted, and the ``about`` value (if set) is checked to
        be an absolute URL.

        Args:
            context: Mapping that provides a ``"messagefn"`` callable; it is
                called with the loading message.

        Raises:
            ConfigurationError: If pre-load validation raises
                ``DatasetAccessError``, or if ``single_dataset__about`` is
                set but has no scheme or no network location.
        """
        optional_str = (str, type(None))
        for attribute in (
            "single_dataset__datapath",
            "single_dataset__title",
            "single_dataset__about",
            "single_dataset__obs_names",
            "single_dataset__var_names",
        ):
            self.validate_correct_type_of_configuration_attribute(attribute, optional_str)

        if self.single_dataset__datapath is None:
            return

        # create the matrix data cache manager:
        if self.matrix_data_cache_manager is None:
            self.matrix_data_cache_manager = MatrixDataCacheManager(max_cached=1, timelimit_s=None)

        # preload this data set
        matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath, app_config=self.app_config)
        try:
            matrix_data_loader.pre_load_validation()
        except DatasetAccessError as e:
            # Chain explicitly so the access error is reported as the cause.
            raise ConfigurationError(str(e)) from e

        file_size = matrix_data_loader.file_size()
        file_basename = basename(self.single_dataset__datapath)
        if file_size > BIG_FILE_SIZE_THRESHOLD:
            context["messagefn"](f"Loading data from {file_basename}, this may take a while...")
        else:
            context["messagefn"](f"Loading data from {file_basename}.")

        def url_check(url):
            """Return True when ``url`` parses with both a scheme and a netloc."""
            try:
                result = urlparse(url)
            except ValueError:
                return False
            return bool(result.scheme and result.netloc)

        if self.single_dataset__about and not url_check(self.single_dataset__about):
            raise ConfigurationError(
                "Must provide an absolute URL for --about. (Example format: http://example.com)"
            )
Пример #5
0
def dataroot_test_index():
    """Build a simple HTML index page listing the datasets of every dataroot.

    The page is meant for testing/debugging purposes. It shows the
    logged-in user and a login/logout link when the configured
    authentication calls for them, followed by a sorted list of links to
    every entry under each dataroot for which ``MatrixDataLoader`` can be
    constructed.

    All dynamic values (user details, URLs, dataset names) are HTML-escaped
    and the dataset ``href`` is quoted, so names containing spaces or markup
    characters cannot break the page.

    Returns:
        The result of ``make_response`` applied to the generated HTML.
    """
    # Local import: keeps this function self-contained with respect to the
    # module's import block.
    from html import escape

    config = current_app.app_config
    server_config = config.server_config

    parts = [
        '<!doctype html><html lang="en">',
        "<head><title>Hosted Cellxgene</title></head>",
        "<body><H1>Welcome to cellxgene</H1>",
    ]

    auth = server_config.auth
    if auth.is_valid_authentication_type():
        authenticated = auth.is_user_authenticated()
        if authenticated:
            user_details = " / ".join(
                escape(str(value))
                for value in (auth.get_user_id(), auth.get_user_name(), auth.get_user_email())
            )
            parts.append(f"<p>Logged in as {user_details}</p>")
        if auth.requires_client_login():
            if authenticated:
                logout_url = escape(str(auth.get_logout_url(None)))
                parts.append(f"<p><a href='{logout_url}'>Logout</a></p>")
            else:
                login_url = escape(str(auth.get_login_url(None)))
                parts.append(f"<p><a href='{login_url}'>Login</a></p>")

    datasets = []
    for dataroot_dict in server_config.multi_dataset__dataroot.values():
        dataroot = dataroot_dict["dataroot"]
        url_dataroot = dataroot_dict["base_url"]
        locator = DataLocator(
            dataroot, region_name=server_config.data_locator__s3__region_name)
        for fname in locator.ls():
            location = path_join(dataroot, fname)
            try:
                MatrixDataLoader(location, app_config=config)
            except DatasetAccessError:
                # skip over invalid datasets
                continue
            datasets.append((url_dataroot, fname))

    parts.append("<br/>Select one of these datasets...<br/>")
    parts.append("<ul>")
    datasets.sort()
    for url_dataroot, dataset in datasets:
        href = escape(f"{url_dataroot}/{dataset}")
        label = escape(str(dataset))
        parts.append(f"<li><a href='{href}'>{label}</a></li>")
    parts.append("</ul>")
    parts.append("</body></html>")

    return make_response("".join(parts))
Пример #6
0
def main():
    """Command-line driver that times differential-expression t-tests.

    Parses the command line, opens the named dataset, builds two boolean
    row masks (group A and group B) and runs the selected diffexp
    algorithm ``--trials`` times, printing the elapsed time of each run.

    Each group is chosen either as a random sample of ``--numA``/``--numB``
    rows or from an ``obs`` ``variable:value`` pair given with
    ``--varA``/``--varB``. The value part may itself contain ``:``; only
    the first colon separates name from value.

    With ``--show``, the schema of the ``X`` array is dumped for a
    ``CxgAdaptor`` and the results of the last trial are printed. If no
    trial ran (``--trials 0``), there is nothing to print.

    Exits with status 1 when a group is not specified, or when the ``cxg``
    algorithm is requested for an adaptor that is not a ``CxgAdaptor``.
    """
    parser = argparse.ArgumentParser("A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na", "--numA", type=int, help="number of rows in group A")
    parser.add_argument("-nb", "--numB", type=int, help="number of rows in group B")
    parser.add_argument("-va", "--varA", help="obs variable:value to use for group A")
    parser.add_argument("-vb", "--varB", help="obs variable:value to use for group B")
    parser.add_argument("-t", "--trials", default=1, type=int, help="number of trials")
    parser.add_argument(
        "-a",
        "--alg",
        choices=("default", "generic", "cxg"),
        default="default",
        help="algorithm to use",
    )
    parser.add_argument("-s", "--show", default=False, action="store_true", help="show the results")
    parser.add_argument(
        "-n",
        "--new-selection",
        default=False,
        action="store_true",
        help="change the selection between each trial",
    )
    parser.add_argument("--seed", default=1, type=int, help="set the random seed")

    args = parser.parse_args()

    app_config = AppConfig()
    app_config.update_server_config(single_dataset__datapath=args.dataset)
    app_config.update_server_config(app__verbose=True)
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    if args.show and isinstance(adaptor, CxgAdaptor):
        adaptor.open_array("X").schema.dump()

    # Seed both generators so the selections are reproducible for a given --seed.
    random.seed(args.seed)
    np.random.seed(args.seed)
    rows = adaptor.get_shape()[0]

    if args.numA:
        filterA = random.sample(range(rows), args.numA)
    elif args.varA:
        # maxsplit=1: the obs value may legitimately contain ":".
        vname, vval = args.varA.split(":", 1)
        filterA = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numA or varA")
        sys.exit(1)

    if args.numB:
        filterB = random.sample(range(rows), args.numB)
    elif args.varB:
        vname, vval = args.varB.split(":", 1)
        filterB = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numB or varB")
        sys.exit(1)

    # Stays None when no trial runs, so the --show step below is skipped
    # instead of hitting an unbound name.
    results = None

    for _ in range(args.trials):
        if args.new_selection:
            # Only randomly sampled groups can be re-drawn between trials.
            if args.numA:
                filterA = random.sample(range(rows), args.numA)
            if args.numB:
                filterB = random.sample(range(rows), args.numB)

        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        # perf_counter is monotonic and intended for measuring intervals.
        t1 = time.perf_counter()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "cxg":
            if not isinstance(adaptor, CxgAdaptor):
                print("cxg only works with CxgAdaptor")
                sys.exit(1)
            results = diffexp_cxg.diffexp_ttest(adaptor, maskA, maskB)

        t2 = time.perf_counter()
        print("TIME=", t2 - t1)

    if args.show and results is not None:
        for res in results:
            print(res)
Пример #7
0
 def load_dataset(self, path, extra_server_config=None, extra_dataset_config=None):
     """Build a config for ``path`` and open the dataset with it.

     Args:
         path: Location of the dataset; passed both to ``app_config`` and
             to ``MatrixDataLoader``.
         extra_server_config: Optional mapping forwarded to ``app_config``
             as ``extra_server_config``. ``None`` means an empty dict.
         extra_dataset_config: Optional mapping forwarded to ``app_config``
             as ``extra_dataset_config``. ``None`` means an empty dict.

     Returns:
         The object returned by ``MatrixDataLoader(path).open(config)``.
     """
     # Use None sentinels rather than ``{}`` defaults: a default dict is
     # created once and shared by every call, so a mutation made during one
     # call would leak into the next.
     if extra_server_config is None:
         extra_server_config = {}
     if extra_dataset_config is None:
         extra_dataset_config = {}
     config = app_config(path, extra_server_config=extra_server_config, extra_dataset_config=extra_dataset_config)
     loader = MatrixDataLoader(path)
     adaptor = loader.open(config)
     return adaptor