Exemplo n.º 1
0
    def __init__(
        self,
        output_dir="output",
        resources_dir="resources",
        parallelize=False,
        processes=os.cpu_count(),
    ):
        """ Set up the output location, resources, and parallelizer.

        Parameters
        ----------
        output_dir : str
            name / path of output directory; stored as an absolute path
        resources_dir : str
            name / path of resources directory, handed to ``Resources``
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing; the default is the value
            of ``os.cpu_count()`` at the time this method was defined
        """
        self._output_dir = os.path.abspath(output_dir)
        self._resources = Resources(resources_dir=resources_dir)

        # both multiprocessing settings are forwarded unchanged
        parallelizer_options = {"parallelize": parallelize, "processes": processes}
        self._parallelizer = Parallelizer(**parallelizer_options)
Exemplo n.º 2
0

if __name__ == "__main__":
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # results are a list of lists
    rows = p(get_xy_chrom_snp_ratios, tasks)

    # remove None results
    rows = [row for row in rows if row]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
Exemplo n.º 3
0
def main():
    """Load every file of the openSNP datadump and summarize the outcome.

    Steps, as performed below:

    1. Get the datadump filenames from ``r`` and drop names containing
       "readme" or "phenotype".
    2. Run ``load_file`` on each file through a ``Parallelizer``.
    3. Save the results that carry no "msg" key to
       "parse-opensnp-files.csv" in ``OUTPUT_DIR`` and log parsing statistics.
    4. If ``EXTRACT_FILES`` is truthy, write the files that produced a message
       (plus the files whose build was not detected) into one directory per
       message under ``OUTPUT_DIR``.

    The percentage statistics are only logged when their denominator is
    non-zero, so an empty datadump or a run where no file was parsed does not
    raise ``ZeroDivisionError``.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations (seeded so the order is repeatable);
    # lower SAMPLE_SIZE (e.g., to 10) to process only a subset while debugging
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # keep the results without a "msg" key; these become rows of the summary
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; skip a ratio when its denominator is zero
    file_count = len(filenames)
    parsed_count = len(df)
    logger.info(f"{file_count} files in the openSNP datadump")
    if file_count:
        logger.info(
            f"{(parsed_count / file_count):.2%} of openSNP datadump files parsed"
        )
    if parsed_count:
        logger.info(
            f"build detected in {len(df.loc[df.build_detected]) / parsed_count:.2%} of files parsed"
        )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
Exemplo n.º 4
0
def main():
    """Run the ancestry analysis over the openSNP datadump files.

    Builds the reference models from the 1kg data (encoding, dimensionality
    reduction, and two k-nearest-neighbors classifiers), runs ``process_file``
    on every datadump file through a ``Parallelizer``, and saves the truthy
    results to "opensnp_ancestry.csv" in ``OUTPUT_DIR``.
    """
    logging.info("start analysis")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # seeded shuffle of the file indices; the sample currently covers all files
    random.seed(1)
    sample_size = len(filenames)
    samples = random.sample(range(len(filenames)), sample_size)

    # get the 1000 genomes samples
    panel_path = f"{DATA_DIR}/integrated_call_samples_v3.20130502.ALL.panel"
    dfsamples = get_1kg_samples(panel_path)
    logging.info("retreived the 1kg samples")

    # pick the VCF that matches the configured AISNP set
    if aisnp_SET == "kidd et al. 55 aisnps":
        vcf_path = f"{DATA_DIR}/kidd.55aisnp.1kg.vcf"
    else:
        vcf_path = f"{DATA_DIR}/Seldin.128aisnp.1kg.vcf"
    aisnps_1kg = vcf2df(vcf_path, dfsamples)
    logging.info("made the AIsnp DataFrame")

    # encode 1kg data
    X_encoded, encoder = encode_genotypes(aisnps_1kg)
    logging.info("encoded the genotypes")

    # perform dimensionality reduction on the 1kg set
    X_reduced, reducer = dimensionality_reduction(
        X_encoded, algorithm=DIMENSIONALITY_REDUCTION_ALGORITHM
    )
    logging.info("Reduced the dimensionality of the genotypes")

    # both classifiers share the same settings
    knn_options = {"n_neighbors": 9, "weights": "distance", "n_jobs": 1}
    knn_super_pop = KNeighborsClassifier(**knn_options)
    knn_pop = KNeighborsClassifier(**knn_options)

    # fit the knn before adding the user sample
    logging.info("Fitting the superpopulation model")
    knn_super_pop.fit(X_reduced, dfsamples["super population"])
    logging.info("Done!")
    logging.info("Fitting the population model")
    knn_pop.fit(X_reduced, dfsamples["population"])
    logging.info("Done!")

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)

    # every task carries the same reference objects; only the file differs
    shared_inputs = {
        "aisnps_1kg": aisnps_1kg,
        "X_encoded": X_encoded,
        "encoder": encoder,
        "reducer": reducer,
        "knn_super_pop": knn_super_pop,
        "knn_pop": knn_pop,
    }
    tasks = [{"file": filenames[i], **shared_inputs} for i in samples]

    # run tasks; results is a list of dicts
    results = p(process_file, tasks)

    # drop falsy results; the remainder are the rows of the summary
    rows = list(filter(None, results))

    # column groups, concatenated in the order expected in the CSV
    file_columns = [
        "file",
        "source",
        "build",
        "build_detected",
        "chromosomes_summary",
        "snp_count",
    ]
    # presumably the super population labels of the 1kg samples -- TODO confirm
    super_pop_columns = ["AFR", "AMR", "EAS", "EUR", "SAS"]
    # presumably the population labels of the 1kg samples -- TODO confirm
    pop_columns = [
        "ACB",
        "ASW",
        "BEB",
        "CDX",
        "CEU",
        "CHB",
        "CHS",
        "CLM",
        "ESN",
        "FIN",
        "GBR",
        "GIH",
        "GWD",
        "IBS",
        "ITU",
        "JPT",
        "KHV",
        "LWK",
        "MSL",
        "MXL",
        "PEL",
        "PJL",
        "PUR",
        "STU",
        "TSI",
        "YRI",
    ]
    component_columns = ["component1", "component2", "component3"]

    df = pd.DataFrame(
        rows,
        columns=[
            *file_columns,
            *super_pop_columns,
            *pop_columns,
            *component_columns,
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "opensnp_ancestry.csv")

    logging.info("analysis done!")