Example No. 1
def import_media_randomly(directory):
    """Attach each file in `directory` to a randomly chosen individual."""
    ids = [ind["_id"] for ind in snpdb.find_individuals()]
    total_t = 0.0
    for filename in os.listdir(directory):
        ind_id = random.choice(ids)  # renamed from `id` to avoid shadowing the built-in
        # Context manager guarantees the file is closed even if insert_file raises
        with open(os.path.join(directory, filename), "rb") as f:
            t = _stopwatch(snpdb.insert_file, f, ind_id)
        total_t += t
        print(f"Added file {filename} to individual {ind_id} in {t:.3f} s.")

    time.sleep(61)  # give MongoDB time to settle before reading storage stats
    stats = snpdb.get_db_stats(2**20)  # sizes reported in MiB
    data = stats["dataSize"]
    storage = stats["storageSize"]

    print(f"Total time: {total_t:.3f} s, "
          f"storage: {data:.1f} MiB (raw), {storage:.1f} MiB (compressed)")
Example No. 2
def execute_experiment_all(
    result: dict,
    experiment_id: str,
    compression_method: str,
    nsnps: int,
    nsamples: int,
    N: int = 1,
) -> None:
    """Execute experiment for all file types.

    Warning: this function relies on variables defined outside its scope
    (e.g. snpdb, exps, nsnps_ids, nsamples_ids, data_dir, fastq_dir_1,
    results, results_fname).

    Args:
        result (dict): Dictionary for experiment results. Values are assigned
                       to this dictionary in place ("by reference").
    """
    nsnps_id = nsnps_ids[nsnps]
    nsamples_id = nsamples_ids[nsamples]
    print(f"Starting Experiment {experiment_id} ({nsnps_id} SNPs, "
          f"{nsamples_id} individuals) with N = {N}; "
          f"Compression method: {compression_method}")

    # Filenames
    f_ext: dict = exps[experiment_id]["file_extensions"]
    fqfname: str = fastq_dir_1 + "SH.71992.AP.01.1.fastq"
    imfname: str = data_dir + "out_image.jpg"
    im_res: tuple = (800, 600)
    mfnames: dict = {
        k: data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext[k]["map"]
        for k in f_ext if k in ["0125", "PLINK"]
    }
    pfnames: dict = {
        k: data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext[k]["ped"]
        for k in f_ext if k in ["0125", "PLINK"]
    }
    fnames: dict = {
        k: data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext[k]["ext"]
        for k in f_ext if k in ["FR", "VCF"]
    }
    ifnames: dict = {
        k: data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext[k]["ids"]
        for k in f_ext if k not in ["FastQ", "Media"]
    }

    # Setting up result dictionary
    result["fsize"] = []  # file size
    result["dbsize"] = []  # doc size in db
    result["time"] = []  # insertion time
    result["summarize"] = []  # summarization example and time
    result["individuals_of_snps"] = []
    result["delete_individual"] = []
    result["export"] = []
    result["export_bin"] = []

    # * Performing experiment N times and storing results
    for i in range(N):
        print("i: " + str(i))
        print("Resetting database...")
        reset_db(compression_method=compression_method)
        print("Database reset operation successful.")
        print("Generating input files...")
        t_tmp: float = 0.0
        t_map: float = 0.0
        t_sample: float = 0.0
        t_bin: float = 0.0
        map_size: float = 0.0
        sample_size: float = 0.0
        bin_size: float = 0.0
        # * Generating input files
        # If fewer than 10000 samples, generate a single file;
        # otherwise, generate blocks of up to 10000 samples
        n_blocks: int = int(np.ceil(nsamples / 10000))
        remaining_samples: int = nsamples
        start_sample: int = 1
        imported_map = {k: False for k in fnames}  # for single-file formats
        # Map files
        for k in mfnames:
            generate_random_file(filename=mfnames[k],
                                 file_type=f_ext[k]["map"],
                                 verbose=True,
                                 n=nsnps)
            #  start_from_id=start_map)
        # Importing map files
        for k in mfnames:
            print("Importing double-file format maps:", k)
            t_tmp = time.time()
            snpdb.import_map(
                map_reader=exps[experiment_id]["readers"][k]["map"](
                    mfnames[k]),
                map_name=experiment_id + "_" + nsnps_id + "_" + nsamples_id +
                "_" + k,
                force_create_new=True,
                force_use_existing=False,
                report=False,
            )
            t_tmp = time.time() - t_tmp
            t_map += t_tmp
        for block in range(n_blocks):  # `block`, not `i`: avoid shadowing the outer loop
            print(f"Block: {block}")
            nsamples_block = int(np.minimum(remaining_samples, 10000.0))
            # Generating single-file format files
            for k in fnames:
                # Map/Samples file
                generate_random_file(
                    filename=fnames[k],
                    file_type=f_ext[k]["ext"],
                    verbose=True,
                    n=nsamples_block,
                    map_size=nsnps,
                    start_samples_from_id=start_sample,
                )
                # Id map file
                generate_random_file(
                    filename=ifnames[k],
                    file_type=".ids",
                    verbose=True,
                    n=nsamples_block,
                    first_sample_id=start_sample,
                )
                # Import double-file map if not imported
                if not imported_map[k]:
                    imported_map[k] = True
                    print("Importing single-file format maps:", k)
                    t_tmp = time.time()
                    snpdb.import_map(
                        map_reader=exps[experiment_id]["readers"][k]["map"](
                            fnames[k]),
                        map_name=experiment_id + "_" + nsnps_id + "_" +
                        nsamples_id + "_" + k,
                        force_create_new=True,
                        force_use_existing=False,
                        report=False,
                    )
                    t_tmp = time.time() - t_tmp
                    t_map += t_tmp
            # Generating double-file format sample files
            for k in pfnames:
                # Samples file
                generate_random_file(
                    filename=pfnames[k],
                    file_type=f_ext[k]["ped"],
                    verbose=True,
                    n=nsamples_block,
                    map_size=nsnps,
                    start_from_id=start_sample,
                )
                # Id map file
                generate_random_file(
                    filename=ifnames[k],
                    file_type=".ids",
                    verbose=True,
                    n=nsamples_block,
                    first_sample_id=start_sample,
                )
            start_sample += nsamples_block
            remaining_samples -= nsamples_block

            # Import sample files
            print("Inserting sample files into database...")
            for k in pfnames:
                # Importing sample file
                id_map: dict = {}
                # Linking samples to individuals in the database
                if ifnames[k] is not None:
                    with open(ifnames[k], "r") as f:
                        for line in f:
                            (sample, individual) = line.split()
                            id_map[sample] = individual
                t_tmp = time.time()
                snpdb.import_samples(
                    sample_reader=exps[experiment_id]["readers"][k]["ped"](
                        pfnames[k]),
                    map_name=experiment_id + "_" + nsnps_id + "_" +
                    nsamples_id + "_" + k,
                    id_map=id_map,
                    report=False,
                )
                t_tmp = time.time() - t_tmp
                t_sample += t_tmp
            for k in fnames:
                # Importing sample file
                id_map = {}
                # Linking samples to individuals in the database
                if ifnames[k] is not None:
                    with open(ifnames[k], "r") as f:
                        for line in f:
                            (sample, individual) = line.split()
                            id_map[sample] = individual
                t_tmp = time.time()
                snpdb.import_samples(
                    sample_reader=exps[experiment_id]["readers"][k]["ped"](
                        fnames[k]),
                    map_name=experiment_id + "_" + nsnps_id + "_" +
                    nsamples_id + "_" + k,
                    id_map=id_map,
                    report=False,
                )
                t_tmp = time.time() - t_tmp
                t_sample += t_tmp
        # Generating and Importing FastQ and Media files
        print("Inserting FastQ file into database...")
        with open(fqfname, "rb") as fqf:
            t_tmp = time.time()
            snpdb.insert_file(file=fqf, individual_id=0)
            t_tmp = time.time() - t_tmp
            t_bin += t_tmp
        print("Generating and inserting media file into database...")
        im_arr = np.random.rand(im_res[0], im_res[1], 3) * 255
        im_out = Image.fromarray(im_arr.astype("uint8")).convert("RGB")
        im_out.save(imfname)
        with open(imfname, "rb") as imf:
            t_tmp = time.time()
            snpdb.insert_file(file=imf, individual_id=0)
            t_tmp = time.time() - t_tmp
            t_bin += t_tmp

        # Validating statistics
        print("Validating statistics...")
        snpdb._db.command("validate", snpdb._config["MAPS_COLL"], full=True)
        snpdb._db.command("validate", snpdb._config["MAPSNPS_COLL"], full=True)
        snpdb._db.command("validate", snpdb._config["SNPS_COLL"], full=True)
        snpdb._db.command("validate", snpdb._config["SAMPLES_COLL"], full=True)
        snpdb._db.command("validate",
                          snpdb._config["SNPBLOCKS_COLL"],
                          full=True)
        snpdb._db.command("validate",
                          snpdb._config["INDIVIDUALS_COLL"],
                          full=True)
        snpdb._db.command("validate", "fs.chunks", full=True)
        snpdb._db.command("validate", "fs.files", full=True)

        # Getting dbsizes
        map_size = (
            snpdb._db.command("collstats",
                              snpdb._config["MAPS_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["MAPSNPS_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["SNPS_COLL"])["storageSize"])
        sample_size = (
            snpdb._db.command("collstats",
                              snpdb._config["SAMPLES_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["SNPBLOCKS_COLL"])["storageSize"] +
            snpdb._db.command(
                "collstats", snpdb._config["INDIVIDUALS_COLL"])["storageSize"])
        bin_size = snpdb._db.command("collstats", "fs.chunks")["storageSize"]

        # Appending generated file sizes
        fsize: float = 0.0
        fsize += sum([os.stat(mfnames[k]).st_size for k in mfnames])
        fsize += sum([os.stat(pfnames[k]).st_size * n_blocks for k in pfnames])
        fsize += sum([os.stat(fnames[k]).st_size * n_blocks for k in fnames])
        fsize += os.stat(imfname).st_size
        fsize += os.stat(fqfname).st_size
        result["fsize"].append(fsize)
        # Appending stored document sizes from MongoDB
        result["dbsize"].append(map_size + sample_size + bin_size)
        # Appending insertion times
        result["time"].append(t_map + t_sample + t_bin)

        # Executing additional steps
        print("Executing additional steps...")

        # 2.2 Summarization
        ind = np.random.choice(snpdb.find_individuals())
        t_tmp = time.time()
        summary = snpdb.summarize(ind)
        t_tmp = time.time() - t_tmp
        result["summarize"].append({
            "individual": ind,
            "summary": summary,
            "time": t_tmp
        })

        # 2.3 Exporting the summary to the original formats
        try:
            export: dict = {}
            export["Z125"] = {}
            export["PLINK"] = {}
            ind_map = result["summarize"][-1]["individual"]["samples"][-1][
                "map"]
            samples = [
                sample["id"]
                for sample in result["summarize"][-1]["individual"]["samples"]
                if sample["map"] == ind_map
            ]
            t_tmp = time.time()
            snpdb.export_map(ind_map, writers.Z125MapWriter,
                             data_dir + "ind_export.0125map")
            t_tmp = time.time() - t_tmp
            export["Z125"]["map"] = t_tmp
            t_tmp = time.time()
            snpdb.export_samples(
                samples,
                ind_map,
                writers.Z125SampleWriter,
                data_dir + "ind_export.0125ped",
            )
            t_tmp = time.time() - t_tmp
            export["Z125"]["samples"] = t_tmp
            t_tmp = time.time()
            snpdb.export_map(ind_map, writers.PlinkMapWriter,
                             data_dir + "ind_export.plmap")
            t_tmp = time.time() - t_tmp
            export["PLINK"]["map"] = t_tmp
            t_tmp = time.time()
            snpdb.export_samples(
                samples,
                ind_map,
                writers.PlinkSampleWriter,
                data_dir + "ind_export.plped",
            )
            t_tmp = time.time() - t_tmp
            export["PLINK"]["samples"] = t_tmp
            result["export"].append(export)
        except IndexError as e:
            print("Warning: individual has no map/samples", e)

        # 2.4 Searching for individuals, given a list of SNPs
        snp = np.random.choice(snpdb.find_snp())
        result["individuals_of_snps"].append({})
        result["individuals_of_snps"][-1]
        t_tmp = time.time()
        try:
            inds = snpdb.find_individuals_of_snps(id=snp["i"], )
            result["individuals_of_snps"][-1]["snp"] = snp
            result["individuals_of_snps"][-1]["individuals"] = inds
        except Exception as e:
            print("Warning: couldn't retrieve individuals from database", e)
        t_tmp = time.time() - t_tmp
        result["individuals_of_snps"][-1]["snp"] = t_tmp

        # 2.5 Exporting raw/binary data
        try:
            db_files = snpdb.list_files()
            t_tmp = time.time()
            snpdb.get_files(db_files)
            t_tmp = time.time() - t_tmp
            result["export_bin"].append(t_tmp)
        except Exception as e:
            print(e)

        # 2.6 Removing all of an individual's data
        ind = np.random.choice(snpdb.find_individuals())
        t_tmp = time.time()
        delete_results = snpdb.delete_individuals(id=ind["_id"])
        t_tmp = time.time() - t_tmp
        result["delete_individual"].append({
            "individual":
            ind,
            "deleted_count": [i.deleted_count for i in delete_results],
            "time":
            t_tmp,
        })

        # Writing partial results to file
        with open(results_fname, "w") as f:
            json.dump(
                results,
                f,
                ensure_ascii=True,
                check_circular=True,
                allow_nan=True,
                indent=1,
                sort_keys=True,
            )
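
A sketch of how this driver might be invoked. The experiment id, SNP/sample counts, and compression method below are placeholders; they must match keys that actually exist in the module-level `exps`, `nsnps_ids`, and `nsamples_ids` dictionaries:

# Hypothetical invocation; all literal values are assumptions.
results: dict = {}
experiment_id = "2.3"              # assumed key of `exps`
nsnps, nsamples = 100_000, 10_000  # assumed keys of nsnps_ids / nsamples_ids
results[experiment_id] = {}
execute_experiment_all(
    results[experiment_id],        # filled in place ("by reference")
    experiment_id,
    compression_method="zlib",     # whatever reset_db accepts
    nsnps=nsnps,
    nsamples=nsamples,
    N=3,                           # repeat the experiment three times
)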
Example No. 3
def execute_experiment_two_files(
    result: dict,
    experiment_id: str,
    compression_method: str,
    nsnps: int,
    nsamples: int,
    N: int = 1,
) -> None:
    """Execute experiment for file types with two files (map, sample).

    Warning: this function uses variables defined outside its scope

    Args:
        result (dict): Dictionary for experiment. Values will be assigned
                       to this dictionary "by reference".
    """
    nsnps_id = nsnps_ids[nsnps]
    nsamples_id = nsamples_ids[nsamples]
    print("Starting Experiment " + experiment_id + " (" + nsnps_id +
          " SNPs, " + nsamples_id + " individuals) with N = " + str(N) +
          "; Compression method: " + compression_method)

    # Filenames (concatenation already yields str; the str() wrappers were redundant)
    f_ext: dict = exps[experiment_id]["file_extensions"]
    mfname = data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext["map"]
    pfname = data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext["ped"]
    ifname = data_dir + "out_" + nsnps_id + "_" + nsamples_id + f_ext["ids"]

    # Setting up result dictionary
    result["fsize"] = []  # file size
    result["dbsize"] = []  # doc size in db
    result["time"] = []  # insertion time
    result["summarize"] = []  # summarization example and time
    result["individuals_of_snps"] = []
    result["delete_individual"] = []

    # * Performing experiment N times and storing results
    for i in range(N):
        print("i: " + str(i))
        print("Resetting database...")
        reset_db(compression_method=compression_method)
        print("Database reset operation successful.")
        print("Generating input files...")
        t_map: float = 0.0
        t_sample: float = 0.0
        map_size: float = 0.0
        sample_size: float = 0.0
        # * Generating input files
        # If fewer than 1000 samples, generate a single file;
        # otherwise, generate blocks of up to 1000 samples
        n_blocks: int = int(np.ceil(nsamples / 1000))
        remaining_samples: int = nsamples
        start_sample: int = 1
        # Map file
        generate_random_file(filename=mfname,
                             file_type=f_ext["map"],
                             verbose=True,
                             n=nsnps)
        #  start_from_id=start_map)
        # Importing map file
        t_tmp: float = time.time()
        snpdb.import_map(
            map_reader=exps[experiment_id]["readers"]["map"](mfname),
            map_name=experiment_id + "_" + nsnps_id + "_" + nsamples_id,
            force_create_new=True,
            force_use_existing=False,
            report=False,
        )
        t_tmp = time.time() - t_tmp
        t_map += t_tmp
        # Validating statistics
        snpdb._db.command("validate", snpdb._config["MAPS_COLL"], full=True)
        snpdb._db.command("validate", snpdb._config["MAPSNPS_COLL"], full=True)
        snpdb._db.command("validate", snpdb._config["SNPS_COLL"], full=True)
        # map_size = _MAPS + _MAPSNPS + _SNPS
        map_size = (
            snpdb._db.command("collstats",
                              snpdb._config["MAPS_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["MAPSNPS_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["SNPS_COLL"])["storageSize"])
        print("Imported map file\tTime: " + str(round(t_map, 3)) + "s\tSize:" +
              str(round(map_size / 1024**2, 2)) + "MB")
        for block in range(n_blocks):  # `block`, not `i`: avoid shadowing the outer loop
            print(f"Block: {block}\tCumulative sample time: {t_sample:.3f} s")
            nsamples_block = int(np.minimum(remaining_samples, 1000.0))
            # Samples file
            generate_random_file(
                filename=pfname,
                file_type=f_ext["ped"],
                verbose=True,
                n=nsamples_block,
                map_size=nsnps,
                start_from_id=start_sample,
            )
            # Id map file
            generate_random_file(
                filename=ifname,
                file_type=".ids",
                verbose=True,
                n=nsamples_block,
                first_sample_id=start_sample,
            )
            start_sample += nsamples_block
            remaining_samples -= nsamples_block

            # Importing sample file
            id_map: dict = {}
            # Linking samples to individuals in the database
            if ifname is not None:
                with open(ifname, "r") as f:
                    for line in f:
                        (sample, individual) = line.split()
                        id_map[sample] = individual
            t_tmp = time.time()
            snpdb.import_samples(
                sample_reader=exps[experiment_id]["readers"]["ped"](pfname),
                map_name=experiment_id + "_" + nsnps_id + "_" + nsamples_id,
                id_map=id_map,
                report=False,
            )
            t_tmp = time.time() - t_tmp
            t_sample += t_tmp
        # Validating Statistics
        snpdb._db.command("validate", snpdb._config["SAMPLES_COLL"], full=True)
        snpdb._db.command("validate",
                          snpdb._config["SNPBLOCKS_COLL"],
                          full=True)
        snpdb._db.command("validate",
                          snpdb._config["INDIVIDUALS_COLL"],
                          full=True)
        # sample_size = _SAMPLES + _SNPBLOCKS + _INDS
        sample_size = (
            snpdb._db.command("collstats",
                              snpdb._config["SAMPLES_COLL"])["storageSize"] +
            snpdb._db.command("collstats",
                              snpdb._config["SNPBLOCKS_COLL"])["storageSize"] +
            snpdb._db.command(
                "collstats", snpdb._config["INDIVIDUALS_COLL"])["storageSize"])
        print("Imported samples file\tTime: " + str(round(t_sample, 3)) +
              "s\tSize:" + str(round(sample_size / 1024**2, 2)) + "MB")

        # Appending generated file sizes
        result["fsize"].append(
            float(os.stat(mfname).st_size) +
            float(os.stat(pfname).st_size) * n_blocks)
        # Appending stored document sizes from MongoDB
        result["dbsize"].append(map_size + sample_size)
        # Appending insertion times
        result["time"].append(t_map + t_sample)

        # Executing additional steps
        print("Executing additional steps...")

        # 2.2 Summarization
        ind = np.random.choice(snpdb.find_individuals())
        t_tmp = time.time()
        summary = snpdb.summarize(ind)
        t_tmp = time.time() - t_tmp
        result["summarize"].append({
            "individual": ind,
            "summary": summary,
            "time": t_tmp
        })

        # TODO: 2.3 Export the summary to the original formats

        # 2.4 Searching for individuals, given a list of SNPs
        snp = np.random.choice(snpdb.find_snp())
        result["individuals_of_snps"].append({})
        result["individuals_of_snps"][-1]
        t_tmp = time.time()
        try:
            inds = snpdb.find_individuals_of_snps(id=snp["i"], )
            result["individuals_of_snps"][-1]["snp"] = snp
            result["individuals_of_snps"][-1]["individuals"] = inds
        except Exception as e:
            print("Warning: couldn't retrieve individuals from database", e)
        t_tmp = time.time() - t_tmp
        result["individuals_of_snps"][-1]["time"] = t_tmp

        # TODO: 2.5 Export individuals to the original formats

        # 2.6 Removing all of an individual's data
        ind = np.random.choice(snpdb.find_individuals())
        t_tmp = time.time()
        delete_results = snpdb.delete_individuals(id=ind["_id"])
        t_tmp = time.time() - t_tmp
        result["delete_individual"].append({
            "individual":
            ind,
            "deleted_count": [i.deleted_count for i in delete_results],
            "time":
            t_tmp,
        })

        # Writing partial results to file
        with open(results_fname, "w") as f:
            json.dump(
                results,
                f,
                ensure_ascii=True,
                check_circular=True,
                allow_nan=True,
                indent=1,
                sort_keys=True,
            )
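
Both experiment drivers call a `reset_db` helper that is not shown. A minimal sketch of what it plausibly does, assuming a local MongoDB instance and a `DB_NAME` key in snpdb's config (both assumptions); the WiredTiger `block_compressor` option is how MongoDB selects per-collection compression:

import pymongo

def reset_db(compression_method: str = "snappy") -> None:
    # Hypothetical sketch: drop the experiment database and recreate its
    # collections with the requested WiredTiger block compressor
    # ("snappy", "zlib", and "zstd" are the compressors MongoDB supports).
    client = pymongo.MongoClient()                   # assumed local server
    client.drop_database(snpdb._config["DB_NAME"])   # assumed config key
    db = client[snpdb._config["DB_NAME"]]
    for coll_key in ["MAPS_COLL", "MAPSNPS_COLL", "SNPS_COLL",
                     "SAMPLES_COLL", "SNPBLOCKS_COLL", "INDIVIDUALS_COLL"]:
        db.create_collection(
            snpdb._config[coll_key],
            storageEngine={"wiredTiger": {
                "configString": "block_compressor=" + compression_method}},
        )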
Example No. 4
if args.subcommand == "find-snps":
    # (reconstructed branch: the snippet was truncated here; "find-snps" and
    # the snpdb.find_snps call are inferred from the argument list and the
    # sibling branches below)
    for snp in snpdb.find_snps(
            args.name,
            args.min_chr,
            args.max_chr,
            args.min_pos,
            args.max_pos,
            args.map,
            args.id,
            args.chr,
    ):
        print(snp)
 elif args.subcommand == "find-maps":
     for map in snpdb.find_maps(args.name, args.min_size, args.max_size,
                                args.format):
         print(map)
 elif args.subcommand == "find-individuals":
     for ind in snpdb.find_individuals(None, args.name, args.map,
                                       args.sample):
         print(ind)
 elif args.subcommand == "find-samples":
     for sample in snpdb.find_sample(args.id, args.map):
         print(sample)
 elif args.subcommand == "get-snp-genotype":
     print(snpdb.find_snp_of_sample(args.map, args.sample, args.snp))
 elif args.subcommand == "put-file":
     for fname in args.file:
         with open(fname, "rb") as f:
             snpdb.insert_file(f, args.individual)
 elif args.subcommand == "find-files":
     for file in snpdb.list_files(args.individual, args.name):
         print(file)
 elif args.subcommand == "get-files":
     snpdb.get_files(snpdb.list_files(args.individual, args.name))
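
The dispatch above expects an argparse namespace with per-subcommand attributes. A minimal sketch of a parser that would produce the `args` used by the `find-maps` branch; the subcommand strings come from the code above, but every flag name here is an assumption (the remaining subcommands would be registered the same way):

import argparse

parser = argparse.ArgumentParser(description="snpdb command-line interface")
subparsers = parser.add_subparsers(dest="subcommand")

# Wiring for one branch; argparse maps "--min-size" to args.min_size.
find_maps = subparsers.add_parser("find-maps", help="search maps by name/size")
find_maps.add_argument("--name")
find_maps.add_argument("--min-size", type=int)
find_maps.add_argument("--max-size", type=int)
find_maps.add_argument("--format")

args = parser.parse_args()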