Exemplo n.º 1
0
def process(args):
    """Import the hard-coded Setaria dataset into the database.

    The import runs in five stages, in this order:

    1. Experiment design: species, population, chromosomes, lines,
       genotype version and traits.
    2. Pipeline design: GWAS, imputation, kinship and population structure
       algorithms.
    3. Experiment collection: phenotypes and variants (genotype files are
       only checked for existence; their insertion is currently disabled).
    4. Pipeline collection: kinship and population structure files.
    5. Results: GWAS runs and GWAS results.

    Many of the names used below are placeholder values.

    When ``args.debug`` is true nothing is inserted: every stage prints the
    call it would have made and uses random integers in place of the IDs the
    database would have returned. A database connection is still opened.

    Args:
        args: Parsed command-line options. Reads ``args.debug``,
            ``args.verbose`` and ``args.working_directory`` (the directory
            holding every input file).

    Raises:
        FileNotFoundError: If a required input file is missing. In debug
            mode only the population structure and GWAS results files are
            checked.
    """

    def _require_file(path):
        """Raise FileNotFoundError (ENOENT, with the path) unless *path* is a file."""
        if not os.path.isfile(path):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    path)

    def _section(title):
        """Print the sub-section banner used by the debug (dry-run) output."""
        print(f'\n------------------------\n{title}\n------------------------')

    debug = args.debug
    cwd = f'{args.working_directory}'

    # =======================================
    # ========= Database Connection =========
    # =======================================
    # Any connection error propagates to the caller unchanged.
    conn = connect()

    # =======================================
    # ========== Experiment Design ==========
    # =======================================
    # What the database needs in order to create an 'experiment' is the
    # following (values shown are from the Maize282 import):
    # Species: maize (Zea mays)
    # Population: Maize282
    # Chromosome: 10 (just the number and a way to generate its unique name)
    # Line: 282set_B73 (B73) -- taken from file if possible
    # Genotype Version: B73 RefGen_v4_AGPv4_Maize282 (the reference genome)
    # Growout, Type, and Location:
    #       Location: code, city, state, country
    #                 "PU", "West Lafayette", "Indiana", "United States"
    #       Type: "field", "phenotyper", etc.
    #       Growout: name, population ID, location ID, year, type
    #                "PU09", maize282popID, PUlocID, 2009, fieldGrowoutTypeID
    # Traits (planned phenotypes/traits to measure)

    # Expected User Input
    # Species
    species_shortname = 'setaria'
    species_binomial = 'Setaria italica'  # Setaria italica OR Setaria viridis  ???
    species_subspecies = None
    species_variety = None
    # Population
    population_name = 'SetariaPopulationName'
    # Chromosome
    # As defined by `awk -F'\t' '!a[$1]++{print NR":"$0}' 2.from12.setaria.maf0.1.maxMissing0.1.allLines.012.pos`
    chromosome_count = 9
    # Line
    # NOTE(tparker): Any chromosome's .indv file can be used, as they are the
    #                same for each. In the future the extraneous copies may be
    #                removed in favour of one line file, much like the
    #                phenotype files.
    lines_filename = Template('${cwd}/${chr}_${shortname}.012.indv')
    # Genotype Version
    # NOTE(tparker): This is possibly just the info about the reference genome
    #                It is likely included with the VCF genotype file (.012).
    genotype_version_assembly_name = 'SetariaGenotypeVersionAssemblyName'
    # NOTE(tparker): Not sure where to find this info or who names it
    genotype_version_annotation_name = 'SetariaAnotationVersionName'
    reference_genome_line_name = 'REF_REF_REF_REF'  # Placeholder
    # Growout, Type, and Location
    # NOTE(tparker): Unknown at this time
    # Traits
    phenotype_filename = Template('${cwd}/${growout}.ph.csv')

    def _chromosome_file(template, chromosome_shortname):
        """Fill a per-chromosome filename template for this dataset."""
        return template.substitute(chr=chromosome_shortname,
                                   cwd=cwd,
                                   shortname=species_shortname)

    # Resolved once and reused by both modes. The template placeholder is
    # ``growout``; substituting any other key raises KeyError.
    phenotype_filepath = phenotype_filename.substitute(growout='phenotyper',
                                                       cwd=cwd)
    # Hard-coded to chr1 until just one file is used for lines.
    first_lines_filepath = _chromosome_file(lines_filename, 'chr1')

    # Model Construction & Insertion
    if not debug:
        # Species
        s = species(species_shortname, species_binomial, species_subspecies,
                    species_variety)
        insert.insert_species(conn, s)
        # The ID is looked up after inserting, for idempotence.
        species_id = find.find_species(conn, species_shortname)
        # Population
        p = population(population_name, species_id)
        insert.insert_population(conn, p)
        population_id = find.find_population(
            conn, population_name)  # For idempotence
        if args.verbose:
            print(f'[Insert]\tPopulation ID\t{population_id}')
        # Chromosome
        insert.insert_all_chromosomes_for_species(conn, chromosome_count,
                                                  species_id)
        # Line
        _require_file(first_lines_filepath)
        insert.insert_lines_from_file(conn, first_lines_filepath,
                                      population_id)
        # Genotype Version
        reference_genome_id = find.find_line(conn, reference_genome_line_name,
                                             population_id)
        gv = genotype_version(genotype_version_assembly_name,
                              genotype_version_annotation_name,
                              reference_genome=reference_genome_id,
                              genotype_version_population=population_id)
        insert.insert_genotype_version(conn, gv)
        # Only needed by the (currently disabled) genotype insertion below.
        genotype_version_id = find.find_genotype_version(
            conn, genotype_version_assembly_name)

        # Growout, Type, and Location
        # NOTE(tparker): Unknown at this time

        # Traits
        # The trait names are the column labels of the phenotype file, with
        # the first column used as the index.
        _require_file(phenotype_filepath)
        traits = list(pd.read_csv(phenotype_filepath, index_col=0))
        insert.insert_traits_from_traitlist(conn, traits)

    # DEBUG
    else:
        print('Experiment Design\n=======================================')
        # Species
        s = species(species_shortname, species_binomial, species_subspecies,
                    species_variety)
        _section('Species')
        print(s)
        species_id = randint(1, 1000)
        print(f'Species ID set to {species_id}')
        # Population
        p = population(population_name, species_id)
        _section('Population')
        print(p)
        population_id = randint(1, 1000)
        print(f'Population ID set to {population_id}')
        # Chromosome
        _section('Chromosome (from file)')
        print(
            f'insert_all_chromosomes_for_species(conn, {chromosome_count}, {species_id})'
        )
        # Line
        _section('Lines (from file)')
        print(
            f'insert_lines_from_file(conn, {first_lines_filepath}, {population_id})'
        )
        # Genotype Version
        # No reference genome is looked up in debug mode; a random line ID
        # stands in for it when building the genotype version.
        reference_genome_id = None
        line_id = randint(1, 1000)
        _section('Line ID (Reference Genome)')
        print(f'Line ID set to {line_id}')
        gv = genotype_version(genotype_version_assembly_name,
                              genotype_version_annotation_name,
                              reference_genome=line_id,
                              genotype_version_population=population_id)
        _section('Genotype Version')
        print(gv)
        genotype_version_id = randint(1, 1000)
        print(f'Genotype Version ID set to {genotype_version_id}')

        # Growout, Type, and Location
        # NOTE(tparker): Unknown at this time

        # Traits
        _section('Traits')
        print('Trait (from file)')
        print(f'list(pd.read_csv({phenotype_filepath}, index_col=0))')
        traits = ['weight', 'height', 'root_angle']
        print(f'insert.insert_traits_from_traitlist(conn, {traits})')
        trait_ids = [randint(1, 1000) for _ in traits]
        print(f'Trait IDs set to {trait_ids}')

    # =====================================
    # ========== Pipeline Design ==========
    # =====================================
    # GWAS Algorithm: "MLMM", "EMMAx", "GAPIT", "FarmCPU"
    # Imputation Method: "impute to major allele"
    # Kinship Algorithm: "loiselle"
    # Population Structure Algorithm: "Eigenstrat"

    # Expected User Input
    # GWAS Algorithm
    gwas_algorithm_name = 'MLMM'  # According to Greg's README
    # Imputation Method
    # Unknown, apparently it was done by someone named Sujan
    imputation_method_name = 'SetariaImputationMethodName'
    # Kinship Algorithm
    # Placeholder, I don't know the exact string that should be used
    kinship_algorithm_name = 'AstleBalding synbreed (placeholder)'
    # Population Structure Algorithm
    population_structure_algorithm_name = 'Eigenstrat'  # This is a guess based on filename

    if not debug:
        # Model Construction & Insertion
        # GWAS Algorithm
        ga = gwas_algorithm(gwas_algorithm_name)
        gwas_algorithm_id = insert.insert_gwas_algorithm(conn, ga)
        # Imputation Method
        im = imputation_method(imputation_method_name)
        imputation_method_id = insert.insert_imputation_method(conn, im)
        # Kinship Algorithm
        ka = kinship_algorithm(kinship_algorithm_name)
        kinship_algorithm_id = insert.insert_kinship_algorithm(conn, ka)
        # Population Structure Algorithm
        psa = population_structure_algorithm(
            population_structure_algorithm_name)
        population_structure_algorithm_id = insert.insert_population_structure_algorithm(
            conn, psa)

    else:
        print('\n\nPipeline Design\n=======================================')
        # Model Construction & Insertion
        # GWAS Algorithm
        _section('GWAS Algorithm')
        print(f'insert_gwas_algorithm(conn, {gwas_algorithm_name})')
        gwas_algorithm_id = randint(1, 1000)
        print(f'GWAS Algorithm ID set to {gwas_algorithm_id}')
        # Imputation Method
        _section('Imputation Method')
        print(
            f'insert.insert_imputation_method(conn, {imputation_method_name})')
        imputation_method_id = randint(1, 1000)
        print(f'Imputation method ID set to {imputation_method_id}')
        # Kinship Algorithm
        _section('Kinship Algorithm')
        print(
            f'insert.insert_kinship_algorithm(conn, {kinship_algorithm_name})')
        kinship_algorithm_id = randint(1, 1000)
        print(f'Kinship algorithm ID set to {kinship_algorithm_id}')
        # Population Structure Algorithm
        _section('Population Structure Algorithm')
        print(
            f'insert.insert_population_structure_algorithm(conn, {population_structure_algorithm_name})'
        )
        population_structure_algorithm_id = randint(1, 1000)
        print(
            f'Population structure algorithm ID set to {population_structure_algorithm_id}'
        )

    # ===========================================
    # ========== Experiment Collection ==========
    # ===========================================
    # Phenotype (external source?)
    #       This needs to be standardized to a .pheno filetype.
    #       For the Maize282 dataset it is the longFormat file
    #       5.mergedWeightNorm.LM.rankAvg.longFormat.csv; the original note
    #       does not say what the Setaria equivalent will be.
    # Genotype (VCF output)
    # Variant (VCF output)

    # Expected User Input
    # Phenotype
    # NOTE(tparker): Defined in the experiment design stage above
    # Genotype
    genotype_filename = Template('${cwd}/${chr}_${shortname}.012')
    # Variants
    variants_filename = Template('${cwd}/${chr}_${shortname}.012.pos')

    if not debug:
        # Model Construction & Insertion
        # Phenotype
        _require_file(phenotype_filepath)
        insert.insert_phenotypes_from_file(conn, phenotype_filepath,
                                           population_id)
        # Genotype
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = find.find_chromosome(conn, chromosome_shortname,
                                                 species_id)
            geno_filename = _chromosome_file(genotype_filename,
                                             chromosome_shortname)
            line_filename = _chromosome_file(lines_filename,
                                             chromosome_shortname)
            _require_file(geno_filename)
            _require_file(line_filename)
            # Genotype insertion is currently disabled; only the input files
            # are validated:
            # genotype_ids = insert.insert_genotypes_from_file(conn, geno_filename, line_filename, chromosome_id, population_id, genotype_version_id)
        # Variants
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = find.find_chromosome(conn, chromosome_shortname,
                                                 species_id)
            variant_filename = _chromosome_file(variants_filename,
                                                chromosome_shortname)
            _require_file(variant_filename)
            insert.insert_variants_from_file(conn, variant_filename,
                                             species_id, chromosome_id)

    else:
        print(
            '\n\nExperiment Collection\n======================================='
        )
        # Model Construction & Insertion
        # Phenotype
        _section('Phenotypes')
        print(
            f'insert.insert_phenotypes_from_file(conn, {phenotype_filepath}, {population_id})'
        )
        # Genotype
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = randint(1, 1000)
            print(f'Chromosome ID set to {chromosome_id}')
            # The templates need chr, cwd and shortname; a missing key makes
            # Template.substitute raise KeyError.
            geno_filename = _chromosome_file(genotype_filename,
                                             chromosome_shortname)
            line_filename = _chromosome_file(lines_filename,
                                             chromosome_shortname)
            print(
                f'insert.insert_genotypes_from_file(conn, {geno_filename}, {line_filename}, {chromosome_id}, {population_id}, {line_id})'
            )
            genotype_ids = [randint(1, 1000) for _ in range(1, 25)]
            print(f'Genotype IDs set to {genotype_ids}')
        # Variants
        for c in range(1, chromosome_count + 1):
            chromosome_shortname = 'chr' + str(c)
            chromosome_id = randint(1, 1000)
            print(f'Chromosome ID set to {chromosome_id}')
            variant_filename = _chromosome_file(variants_filename,
                                                chromosome_shortname)
            print(
                f'insert.insert_variants_from_file(conn, {variant_filename}, {species_id}, {chromosome_id})'
            )

    # =========================================
    # ========== Pipeline Collection ==========
    # =========================================
    # Kinship
    # Setaria Kinship is stored in:
    ## /shares/ibaxter_share/gziegler/SetariaGWASPipeline/data/genotype/6.AstleBalding.synbreed.kinship.rda
    ## Exported the file to CSV using R
    ### load('6.AstleBalding.synbreed.kinship.rda')
    ### write.csv(kinship, '6.AstleBalding.synbreed.kinship.csv')
    # Population Structure

    # Expected User Input
    # Kinship
    # NOTE(tparker): Currently the database just stores the filename.
    #                There is no service to upload the file to database's
    #                host, so there's no single location to find the file
    #                I would like to find out why this is the case and if
    #                it would just be better to store it in the database and
    #                allow the user to export the table themselves as a CSV.
    kinship_filepath = f'{cwd}/6.AstleBalding.synbreed.kinship.csv'
    # Population Structure
    # NOTE(tparker): Same reasoning as the kinship file. There should be a way
    #                for the data to be stored in the database rather than
    #                just a filename.
    population_structure_filepath = f'{cwd}/6.Eigenstrat.population.structure.50PCs.csv'

    if not debug:
        # Model Construction & Insertion
        # Both files are validated before anything is inserted.
        _require_file(kinship_filepath)
        _require_file(population_structure_filepath)
        # Kinship
        k = kinship(kinship_algorithm_id, kinship_filepath)
        kinship_id = insert.insert_kinship(conn, k)
        # Population Structure
        ps = population_structure(population_structure_algorithm_id,
                                  population_structure_filepath)
        population_structure_id = insert.insert_population_structure(conn, ps)
    else:
        print(
            '\n\nPipeline Collection\n=======================================')
        # Model Construction & Insertion
        # Kinship
        k = kinship(kinship_algorithm_id, kinship_filepath)
        _section('Kinship')
        print(f'insert.insert_kinship(conn, {k})')
        kinship_id = randint(1, 1000)
        # Population Structure
        _section('Population Structure Algorithm')
        # Unlike the kinship file, this file must exist even in debug mode.
        _require_file(population_structure_filepath)
        ps = population_structure(population_structure_algorithm_id,
                                  population_structure_filepath)
        print(f'insert.insert_population_structure(conn, {ps})')
        population_structure_id = randint(1, 1000)
        print(f'Population structure ID set to {population_structure_id}')

    # =============================================
    # ================== Results ==================
    # =============================================
    # GWAS Run
    # GWAS Results

    # Expected User Input
    # GWAS Run & results
    gwas_filename = f'{cwd}/placeholder_gwas_results.csv'
    # The following values (0.2, 0.2, and 0.1) were all taken from the Maize282 import
    # NOTE(tparker): Make sure to double check with Greg on what the true values should be
    #                Also, double check the source of the pipeline to see if there is any
    #                indication what the values should be.
    missing_snp_cutoff_value = 0.2
    missing_line_cutoff_value = 0.2
    minor_allele_frequency_cutoff_value = 0.1

    if not debug:
        # Model Construction & Insertion
        # GWAS Run
        # NOTE(tparker): Check with Greg on what the imputation method was used. I believe it was
        #                set by someone named Sujan because imputation was done beforehand
        _require_file(gwas_filename)
        imputation_method_id = find.find_imputation_method(
            conn, imputation_method_name)
        insert.insert_gwas_runs_from_gwas_results_file(
            conn, gwas_filename, gwas_algorithm_id, reference_genome_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            minor_allele_frequency_cutoff_value, imputation_method_id,
            kinship_id, population_structure_id)
        # GWAS Results
        insert.insert_gwas_results_from_file(
            conn, species_id, gwas_filename, gwas_algorithm_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            imputation_method_id, reference_genome_id, kinship_id,
            population_structure_id, minor_allele_frequency_cutoff_value)
    else:
        print('\n\nResults\n=======================================')
        # Model Construction & Insertion
        # GWAS Run
        # NOTE(tparker): Check with Greg on what the imputation method was used. I believe it was
        #                set by someone named Sujan because imputation was done beforehand
        _section('Imputation Method')
        # The imputation method ID was already set in the pipeline design
        # stage. If that stage were run at a different time, the ID would have
        # to be searched for in the database.
        print(f'Imputation method ID set to {imputation_method_id}')
        _section('GWAS Run')
        # The GWAS results file must exist even in debug mode.
        _require_file(gwas_filename)
        print(
            f'insert.insert_gwas_runs_from_gwas_results_file(conn, {gwas_filename}, {gwas_algorithm_id}, {reference_genome_id}, {missing_snp_cutoff_value}, {missing_line_cutoff_value}, {minor_allele_frequency_cutoff_value}, {imputation_method_id}, {kinship_id}, {population_structure_id})'
        )
        gwas_run_ids = [randint(1, 1000) for _ in range(1, 15)]
        print(f'GWAS run IDs set to {gwas_run_ids}')
        # GWAS Results
        _section('GWAS Result')
        print(
            f'insert.insert_gwas_results_from_file(conn,{species_id},{gwas_filename},{gwas_algorithm_id},{missing_snp_cutoff_value},{missing_line_cutoff_value},{imputation_method_id},{reference_genome_id},{kinship_id},{population_structure_id},{minor_allele_frequency_cutoff_value})'
        )
        gwas_result_ids = [randint(1, 1000) for _ in range(1, 15)]
        print(f'GWAS result IDs set to {gwas_result_ids}')
Exemplo n.º 2
0
          (insertedChromosomeIDs, '\t10 (sID: %s)' % maizeSpeciesID))

    # GET LINES FROM SPECIFIED 012.indv FILE AND ADD TO DB
    insertedLineIDs = insert.insert_lines_from_file(
        conn, '../data/chr10_282_agpv4.012.indv', maize282popID)
    print("[ INSERT ]\t%s\t%s\t(pID:  %s)" %
          (insertedLineIDs, '../data/chr10_282_agpv4.012.indv', maize282popID))

    # GET VARIANTS FROM .012.pos FILE AND ADD TO  DB
    # Found the issue, the 'true' database on adriatic houses variants for ALL chromosomes
    # So, to fix that, we gotta loop through each chromosome file and add them
    # NOTE(timp): When this is generalized to more than just Zea mays, there needs to be a
    # variable for the range instead, because the number of chromosomes may differ between species
    for c in range(1, 11):
        chrShortname = 'chr' + str(c)
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        filename = '../data/%s_282_agpv4.012.pos' % chrShortname
        # print("[ FIND ]\t(%s)\t%s" % (chrId, '< chromsome: %s >' % filename))
        insertedVariantIDs = insert.insert_variants_from_file(
            conn, filename, maizeSpeciesID, chrId)
        # print("num inserted variants:")
        # print(len(insertedVariantIDs))

    # ADD ALL GENOTYPES FROM A ONE-CHROMOSOME .012 FILE TO DB
    for c in range(1, 11):
        chrShortname = 'chr' + str(c)
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        genoFilename = '../data/%s_282_agpv4.012' % chrShortname
        linesFilename = '../data/%s_282_agpv4.012.indv' % chrShortname
        # Example input file: chr1_282_agpv4.012.indv
        # 282set_33-16
Exemplo n.º 3
0
def process(args):
    """Import the hard-coded Maize282 (Zea mays) dataset into the database.

    The import runs in a fixed order, printing the IDs returned by the
    ``insert``/``find`` helpers as it goes:

    1. species, population, reference line, genotype version, chromosomes
    2. lines, variants and genotypes from the per-chromosome ``.012*`` files
    3. traits and phenotypes from the long-format phenotype CSV
    4. growout types and growouts
    5. GWAS / imputation / kinship / population-structure algorithms,
       the kinship and population-structure files
    6. GWAS runs and GWAS results

    All file paths are relative (``../data/...``), so the current working
    directory matters.

    Args:
        args: Parsed command-line arguments. Not read by this function.

    Raises:
        Whatever ``connect()`` or the ``insert``/``find`` helpers raise;
        nothing is caught here.
    """
    # The original wrapped this in `try/except: raise`, which is a no-op.
    conn = connect()

    # Expected user input (hard-coded for the Maize282 dataset)
    # Species
    species_shortname = 'maize'
    species_binomial = 'Zea mays'
    species_subspecies = None
    species_variety = None
    # Population
    population_name = 'Maize282'
    # Chromosome
    chromosome_count = 10
    chromosome_shortnames = [
        'chr' + str(c) for c in range(1, chromosome_count + 1)
    ]

    # Input files
    lines_path = '../data/chr10_282_agpv4.012.indv'
    phenotype_path = '../data/5.mergedWeightNorm.LM.rankAvg.longFormat.csv'
    kinship_path = "../data/4.AstleBalding.synbreed.kinship.csv"
    population_structure_path = (
        "../data/4.Eigenstrat.population.structure.10PCs.csv")
    gwas_results_path = '../data/9.mlmmResults.csv'

    # ADD A HARD-CODED SPECIES TO DB USING insert_species()
    # Fix: the object was bound to `s` but used as `mySpecies` below.
    mySpecies = species(species_shortname, species_binomial,
                        species_subspecies, species_variety)
    insertedSpeciesID = insert.insert_species(conn, mySpecies)
    print("[ INSERT ]\t(%s)\t%s" % (insertedSpeciesID, str(mySpecies)))
    maizeSpeciesID = find.find_species(conn, species_shortname)
    print("[ FIND ]\t(%s)\t%s" % (maizeSpeciesID, '< species: maize >'))

    # ADD A HARD-CODED POPULATION TO DB USING insert_population()
    myPopulation = population(population_name, maizeSpeciesID)
    insertedPopulationID = insert.insert_population(conn, myPopulation)
    print("[ INSERT ]\t(%s)\t%s" % (insertedPopulationID, str(myPopulation)))
    # Fix: maize282popID was read below without ever being assigned.
    # NOTE(review): assumes insert_population() returns the population's ID
    # (it is printed as one above) -- confirm, or replace with a lookup.
    maize282popID = insertedPopulationID
    print("[ FIND ]\t(%s)\t%s" % (maize282popID, '< population: Maize282 >'))

    # ADD A HARD-CODED LINE TO DB USING insert_line()
    myLine = line(line_name='282set_B73', line_population=maize282popID)
    insertedLineID = insert.insert_line(conn, myLine)
    print("[ INSERT ]\t(%s)\t%s" % (insertedLineID, str(myLine)))
    B73lineID = find.find_line(conn, '282set_B73', maize282popID)
    print("[ FIND ]\t(%s)\t%s" % (B73lineID, '< line: Maize282 >'))

    # ADD NEW HARD-CODED GENOTYPE_VERSION TO DB
    myGenotypeVersion = genotype_version(
        genotype_version_name='B73 RefGen_v4_AGPv4_Maize282',
        genotype_version=315,
        reference_genome=B73lineID,
        genotype_version_population=maize282popID)
    B73_agpv4_maize282_versionID = insert.insert_genotype_version(
        conn, myGenotypeVersion)
    print("[ INSERT ]\t(%s)\t%s" %
          (B73_agpv4_maize282_versionID, str(myGenotypeVersion)))

    # ADD ALL CHROMOSOMES FOR A SPECIES TO DB
    insertedChromosomeIDs = insert.insert_all_chromosomes_for_species(
        conn, chromosome_count, maizeSpeciesID)
    print("[ INSERT ]\t%s\t%s" %
          (insertedChromosomeIDs,
           '\t%s (sID: %s)' % (chromosome_count, maizeSpeciesID)))

    # GET LINES FROM SPECIFIED 012.indv FILE AND ADD TO DB
    insertedLineIDs = insert.insert_lines_from_file(conn, lines_path,
                                                    maize282popID)
    print("[ INSERT ]\t%s\t%s\t(pID:  %s)" %
          (insertedLineIDs, lines_path, maize282popID))

    # GET VARIANTS FROM .012.pos FILE AND ADD TO DB
    # The database holds variants for ALL chromosomes, so every
    # per-chromosome file has to be loaded. The range is driven by
    # chromosome_count so other species only need a different count.
    for chrShortname in chromosome_shortnames:
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        filename = '../data/%s_282_agpv4.012.pos' % chrShortname
        insert.insert_variants_from_file(conn, filename, maizeSpeciesID,
                                         chrId)

    # ADD ALL GENOTYPES FROM EACH ONE-CHROMOSOME .012 FILE TO DB
    for chrShortname in chromosome_shortnames:
        chrId = find.find_chromosome(conn, chrShortname, maizeSpeciesID)
        genoFilename = '../data/%s_282_agpv4.012' % chrShortname
        linesFilename = '../data/%s_282_agpv4.012.indv' % chrShortname
        # Example input file: chr1_282_agpv4.012.indv
        # 282set_33-16
        # 282set_38-11Goodman-Buckler
        # 282set_4226
        # 282set_4722
        # 282set_A188
        # 282set_A214NGoodman-Buckler
        # 282set_A239
        # 282set_A441-5
        # 282set_A554
        # ...
        # This is a list of all the lines that have been genotyped.
        # AFAIK, this is 1:1 with the rows of each file, so row 1 of .indv
        # names the line whose genotype is row 1 of .012
        insert.insert_genotypes_from_file(conn, genoFilename, linesFilename,
                                          chrId, maize282popID, B73lineID)

    # PARSE TRAITS FROM PHENOTYPE FILE AND ADD TO DB
    # The trait names are the CSV's column labels (first column is the index).
    phenotypeRawData = pd.read_csv(phenotype_path, index_col=0)
    traits = list(phenotypeRawData)
    insert.insert_traits_from_traitlist(conn, traits)

    # PARSE PHENOTYPES FROM FILE AND ADD TO DB
    # Example input file: 5.mergedWeightNorm.LM.rankAvg.longFormat.csv
    # Pedigree                      weight_FL06   weight_MO06   weight_NC06 ...
    # 282set_33-16                  299.8285      NA            247.08025
    # 282set_38-11Goodman-Buckler   NA            157.62175     183.5531625
    # 282set_4226                   NA            NA            266.214
    # 282set_4722                   155.593625    130.501625    98.497
    # 282set_A188                   252.62675     255.4635      213.556125
    # 282set_A214NGoodman-Buckler   NA            NA            202.21075
    # 282set_A239                   NA            225.50125     217.842
    # ...
    # It is a line-for-line listing of all the traits by year.
    # This WILL be changed out for using phenotype (.ph) files instead.
    insert.insert_phenotypes_from_file(conn, phenotype_path, maize282popID)

    # ADD NEW HARD-CODED GROWOUT_TYPES TO DB
    for growoutTypeName in ("greenhouse", "phenotyper", "field"):
        insert.insert_growout_type(conn, growout_type(growoutTypeName))

    # LOOK UP ID OF A HARD-CODED GROWOUT_TYPE USING find_growout_type()
    fieldGrowoutTypeID = find.find_growout_type(conn, 'field')
    print("[ FIND ]\t(%s)\t%s" %
          (fieldGrowoutTypeID, '< growout_type: field >'))

    # ADD NEW HARD-CODED GROWOUTS TO DB
    # NOTE(review): the location IDs (PUlocID, NYlocID, FLlocID, PRlocID,
    # NClocID, SAlocID, MOlocID) are not assigned anywhere in this function.
    # Unless they exist at module level, the location insert/lookup step is
    # missing and this section raises NameError -- confirm and restore it.
    growouts = [
        growout("PU09", maize282popID, PUlocID, 2009, fieldGrowoutTypeID),
        growout("NY06", maize282popID, NYlocID, 2006, fieldGrowoutTypeID),
        growout("NY10", maize282popID, NYlocID, 2010, fieldGrowoutTypeID),
        growout("FL06", maize282popID, FLlocID, 2006, fieldGrowoutTypeID),
        growout("PR06", maize282popID, PRlocID, 2006, fieldGrowoutTypeID),
        growout("NC06", maize282popID, NClocID, 2006, fieldGrowoutTypeID),
        growout("PU10", maize282popID, PUlocID, 2010, fieldGrowoutTypeID),
        growout("SA06", maize282popID, SAlocID, 2006, fieldGrowoutTypeID),
        growout("MO06", maize282popID, MOlocID, 2006, fieldGrowoutTypeID),
    ]
    insertedGrowoutIDs = []
    # Fix: the loop variable used to be called `growout`, which made that
    # name local to the whole function and broke the constructor calls above
    # with UnboundLocalError.
    for newGrowout in growouts:
        print("-------------\t%s" % str(newGrowout))
        insertedGrowoutIDs.append(insert.insert_growout(conn, newGrowout))
    # Fix: this used to print insertedGenotypeIDs instead of the growout IDs.
    print("[ INSERT ]\t%s\t(new growout)" % (insertedGrowoutIDs))

    # ADD NEW HARD-CODED GWAS_ALGORITHMS TO DB
    gwasAlgorithms = [
        gwas_algorithm(name) for name in ("MLMM", "EMMAx", "GAPIT", "FarmCPU")
    ]
    newGWASalgorithmIDs = []
    for algorithm in gwasAlgorithms:
        newGWASalgorithmIDs.append(
            insert.insert_gwas_algorithm(conn, algorithm))
    print("[ INSERT ]\t%s\t(new gwas algorithm IDs)" % (newGWASalgorithmIDs))

    # ADD NEW HARD-CODED IMPUTATION_METHODS TO DB
    newImputationMethods = [
        imputation_method(name)
        for name in ("impute to major allele", "impute to minor allele",
                     "impute to average allele", "IMPUTE", "BEAGLE")
    ]
    for im in newImputationMethods:
        insert.insert_imputation_method(conn, im)

    # ADD NEW HARD-CODED KINSHIP_ALGORITHMS TO DB
    kinshipAlgorithms = [
        kinship_algorithm(name)
        for name in ("loiselle", "van raden", "Synbreed_realizedAB")
    ]
    newKinshipAlgorithmIDs = []
    for algorithm in kinshipAlgorithms:
        newKinshipAlgorithmIDs.append(
            insert.insert_kinship_algorithm(conn, algorithm))
    print("[ INSERT ]\t%s\t(new kinship algorithm IDs)" %
          (newKinshipAlgorithmIDs))
    # LOOK UP ID OF A HARD-CODED KINSHIP_ALGORITHM USING find_kinship_algorithm()
    VanRadenID = find.find_kinship_algorithm(conn, "van raden")
    print("Van Raden kinship alg ID:")
    print(VanRadenID)

    # ADD NEW HARD-CODED KINSHIP TO DB
    newKinship = kinship(VanRadenID, kinship_path)
    newKinshipID = insert.insert_kinship(conn, newKinship)
    print("New kinship ID:")
    print(newKinshipID)

    # ADD NEW HARD-CODED POPULATION_STRUCTURE_ALGORITHMS TO DB
    newPopulationStructures = [
        population_structure_algorithm(name)
        for name in ("Eigenstrat", "STRUCTURE", "FastSTRUCTURE")
    ]
    for ps in newPopulationStructures:
        insert.insert_population_structure_algorithm(conn, ps)

    # LOOK UP ID OF A HARD-CODED POPULATION_STRUCTURE_ALGORITHM USING find_population_structure_algorithm()
    EigenstratID = find.find_population_structure_algorithm(conn, "Eigenstrat")
    print("Eigenstrat pop str alg ID:")
    print(EigenstratID)

    # ADD NEW HARD-CODED POPULATION_STRUCTURE TO DB
    # Example input file: 4.Eingenstrat.population.structure.10PCs.csv
    # Line                            V1            V2             V3 ...
    # 282set_4226                   -0.002298602  -0.029693879   0.008527265
    # 282set_4722                   -0.003785163  -0.083527265  -0.059586105
    # 282set_33-16                   0.000222197  -0.035755785   0.017007817
    # 282set_38-11Goodman-Buckler   -0.026698262  -0.053115302  -0.01159794
    # 282set_A188                    0.002520617  -0.041387288  -0.011656126
    # 282set_A239                   -0.024217977  -0.038008255   0.033222018
    # ...
    # The number of columns is one more than the number of PCs in filename
    newPopulationStructure = population_structure(EigenstratID,
                                                  population_structure_path)
    newPopulationStructureID = insert.insert_population_structure(
        conn, newPopulationStructure)
    print("New population structure ID:")
    print(newPopulationStructureID)

    # LOOK UP ID OF A HARD-CODED GWAS_ALGORITHM
    MLMMalgorithmID = find.find_gwas_algorithm(conn, "MLMM")
    print("MLMM algorithm ID:")
    print(MLMMalgorithmID)

    # LOOK UP ID OF A HARD-CODED GENOTYPE_VERSION
    B73_agpv4_maize282_versionID = find.find_genotype_version(
        conn, "B73 RefGen_v4_AGPv4_Maize282")
    print("B73 agpv4 maize282 genotype version: ")
    print(B73_agpv4_maize282_versionID)

    # LOOK UP ID OF A HARD-CODED IMPUTATION_METHOD
    majorAlleleImputationID = find.find_imputation_method(
        conn, "impute to major allele")
    print("major allele imputation ID: ")
    print(majorAlleleImputationID)

    # LOOK UP ID OF A HARD-CODED KINSHIP
    # NOTE(timp): I could not find this file, but I found an R data file
    #             (.rda) that may contain the information. Although, the
    #             data may not be in the correct format. The temporary file
    #             is the one with 'export' in its name.
    #             An earlier version looked it up under
    #             /opt/BaxDB/file_storage/kinship_files/ instead.
    kinshipID = find.find_kinship(conn, kinship_path)
    print("kinshipID: ")
    print(kinshipID)

    # LOOK UP ID OF A HARD-CODED POPULATION_STRUCTURE
    populationStructureID = find.find_population_structure(
        conn, population_structure_path)
    print("population structure ID: ")
    print(populationStructureID)

    # PARSE GWAS_RUNS FROM FILE AND ADD TO DB
    # NOTE(timp): Could not find file or possible equivalent
    insertedGwasRunIDs = insert.insert_gwas_runs_from_gwas_results_file(
        conn, gwas_results_path, MLMMalgorithmID,
        B73_agpv4_maize282_versionID, 0.2, 0.2, 0.1, majorAlleleImputationID,
        kinshipID, populationStructureID)
    print("Inserted gwas_run IDs:")
    print(insertedGwasRunIDs)

    # PARSE GWAS_RESULTS FROM FILE AND ADD TO DB
    # NOTE(timp): Could not find file or possible equivalent
    insertedGwasResultIDs = insert.insert_gwas_results_from_file(
        conn, maizeSpeciesID, gwas_results_path, MLMMalgorithmID,
        0.2, 0.2, majorAlleleImputationID, B73_agpv4_maize282_versionID,
        kinshipID, populationStructureID, 0.1)
    print("Inserted gwas result IDs: ")
    print(insertedGwasResultIDs)
Exemplo n.º 4
0
def process(args):
    """Imports hardcoded values for Setaria database. Many items are placeholder values."""
    # =======================================
    # ========= Database Connection =========
    # =======================================
    try:
        conn = connect(args)
    except:
        raise

    # Input file preprocessing and validation
    try:
        if not os.path.isfile(args.filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    args.filename)
        else:
            with open(args.filename) as f:
                dp = json.load(f)  # data parameters

        # Verify that all necessary values were provided, assuming a complete dataset
            expected_fields = [
                "species_shortname", "species_binomial_name",
                "species_subspecies", "species_variety", "population_name",
                "number_of_chromosomes", "genotype_version_assembly_name",
                "genotype_version_annotation_name",
                "reference_genome_line_name", "gwas_algorithm_name",
                "imputation_method_name", "kinship_algortihm_name",
                "population_structure_algorithm_name", "kinship_filename",
                "population_structure_filename", "gwas_run_filename",
                "gwas_results_filename", "missing_SNP_cutoff_value",
                "missing_line_cutoff_value",
                "minor_allele_frequency_cutoff_value", "phenotype_filename"
            ]

        missing_keys = []
        for k in expected_fields:
            if k not in dp:
                missing_keys.append(k)
        if missing_keys:
            raise KeyError(
                f'The following keys are required. Please include them in your JSON configuration: {missing_keys}'
            )

        # Check for all required fields
        required_fields = [
            "species_shortname", "species_binomial_name", "population_name",
            "number_of_chromosomes", "genotype_version_assembly_name",
            "genotype_version_annotation_name", "reference_genome_line_name",
            "gwas_algorithm_name", "imputation_method_name",
            "kinship_algortihm_name", "population_structure_algorithm_name",
            "kinship_filename", "population_structure_filename",
            "gwas_run_filename", "gwas_results_filename",
            "missing_SNP_cutoff_value", "missing_line_cutoff_value",
            "minor_allele_frequency_cutoff_value", "phenotype_filename"
        ]

        empty_fields = []
        for rf in required_fields:
            if not dp[rf]:
                empty_fields.append(rf)
        if empty_fields:
            raise KeyError(
                f'The following keys must be defined. Empty strings are not permitted. Please modify your JSON configuration: {empty_fields}'
            )

        logging.info(
            'Configuration file is valid. Verifying that all files exist.')

        # Track all the files to check for existance
        locations = []
        filepath_template = Template('${cwd}/${filename}')

        # Verify that all files exist
        # Lines
        lines_filename = Template('${chr}_${shortname}.012.indv')
        # Genotype
        genotype_filename = Template('${chr}_${shortname}.012')
        # Variants
        variants_filename = Template('${chr}_${shortname}.012.pos')

        for c in range(1, dp['number_of_chromosomes'] + 1):
            chr_shortname = 'chr' + str(c)
            lines_filepath = lines_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))
            genotype_filepath = genotype_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))
            variants_filepath = variants_filename.substitute(
                dict(cwd=args.working_directory,
                     shortname=dp['species_shortname'],
                     chr=chr_shortname))

            locations.append(
                dict(cwd=args.working_directory,
                     filetype='line',
                     filename=lines_filepath))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='genotype',
                     filename=genotype_filepath))
            locations.append(
                dict(cwd=args.working_directory,
                     filetype='variant',
                     filename=variants_filepath))

        # Go through all the single files that are not named based off of a chromsome
        # Construct the file descriptor dictionaries, and then loop through and test each file's existance
        # phenotype_filename = Template('${cwd}/${growout}.ph.csv') # Welp, this is another instance of pheno file issue
        locations.append(
            dict(cwd=args.working_directory,
                 filetype='kinship',
                 filename=dp['kinship_filename']))
        locations.append(
            dict(cwd=args.working_directory,
                 filetype='population_structure',
                 filename=dp['population_structure_filename']))

        # Since there can be more than one file for the phenotypes, results, and run
        # For each array in the configuration file, add it to the list of paths to
        # verify as existing

        for configuration_entry in dp:
            if isinstance(dp[configuration_entry], list):
                for filename in dp[configuration_entry]:
                    locations.append(
                        dict(cwd=args.working_directory,
                             filetype=configuration_entry,
                             filename=filename))
            else:
                # For any of the entries that CAN be a list, add their single values to
                # the file list
                if configuration_entry in [
                        'phenotype_filename', 'gwas_run_filename',
                        'gwas_results_filename'
                ]:
                    locations.append(
                        dict(cwd=args.working_directory,
                             filetype=configuration_entry,
                             filename=dp[configuration_entry]))

        logging.debug("File locations\n======================")
        logging.debug(pformat(locations))

        for file_descriptor in locations:
            file_path = filepath_template.substitute(file_descriptor)
            if not os.path.isfile(file_path):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), file_path)

        logging.info(f'Found all files. Validating file contents.')

        # Validate the contents of each file
        for file_descriptor in locations:
            ft = file_descriptor['filetype']
            fp = filepath_template.substitute(file_descriptor)
            if ft == 'line':
                validate_line(conn, args, fp)
            elif ft == 'variant':
                validate_variant(conn, args, fp)
            elif ft == 'genotype':
                validate_genotype(conn, args, fp)
            elif ft == 'kinship':
                validate_kinship(conn, args, fp)
            elif ft == 'population_structure':
                validate_population_structure(conn, args, fp)
            elif ft == 'phenotype_filename':
                validate_phenotype(conn, args, fp)
            elif ft == 'gwas_run_filename':
                validate_runs(conn, args, fp)
            elif ft == 'gwas_results_filename':
                validate_results(conn, args, fp)
            else:
                logging.debug(f"Calling validation on unknown file: {fp}")
    except:
        raise

    logging.info(f'Input files appear to be valid. Proceeding with import.')

    # =======================================
    # ========== Experiment Design ==========
    # =======================================
    # What the database needs in order to create an 'experiment' is the follow
    # Species: maize (Zea mays)
    # Population: Maize282
    # Chromosome: 10 (just the number and a way to generate its unique name)
    # Line: 282set_B73 (B73) -- taken from file if possible
    # Genotype Version: B73 RefGen_v4_AGPv4_Maize282 (the reference genome)
    # Growout, Type, and Location:
    #       Location: code, city, state, country
    #                 "PU", "West Lafayette", "Indiana", "United States"
    #       Type: "field", "phenotyper", etc.
    #       Growout: name, population ID, location ID, year, type
    #                "PU09", maize282popID, PUlocID, 2009, fieldGrowoutTypeID
    # Traits (planned phenotypes/traits to measure)

    # Expected User Input
    # Species
    species_shortname = dp['species_shortname']
    species_binomial = dp['species_binomial_name']
    species_subspecies = dp['species_subspecies']
    species_variety = dp['species_variety']
    # Population
    population_name = dp['population_name']
    # Chromosome
    chromosome_count = dp['number_of_chromosomes']
    # Line
    # NOTE(tparker): Can use any chromosome, as they are the same for each.
    # In the future, this the extraneous copies of the lines may be removed
    # and there will be one specific line file, much like the phenotype files
    lines_filename = Template('${cwd}/${chr}_${shortname}.012.indv')
    # Genotype Version
    # NOTE(tparker): This is possibly just the info about the reference genome
    #                It is likely included with the VCF genotype file (.012).
    genotype_version_assembly_name = dp['genotype_version_assembly_name']
    genotype_version_annotation_name = dp['genotype_version_annotation_name']
    reference_genome_line_name = dp['reference_genome_line_name']
    # Growout, Type, and Location
    # NOTE(tparker): Unknown at this time
    ## Location
    ## Type
    ## Growout
    #
    # Traits
    # Allow for more than on phenotype files
    if isinstance(dp["phenotype_filename"], list):
        phenotype_filenames = [
            f'{args.working_directory}/{filename}'
            for filename in dp['phenotype_filename']
        ]
    else:
        phenotype_filenames = [
            f'{args.working_directory}/{dp["phenotype_filename"]}'
        ]

    # Model Construction & Insertion
    # Species
    s = species(species_shortname, species_binomial, species_subspecies,
                species_variety)
    species_id = insert.insert_species(conn, args, s)
    logging.debug(f'[Insert]\tSpecies ID\t{species_id}, {s}')
    # Population
    p = population(population_name, species_id)
    population_id = insert.insert_population(conn, args, p)
    logging.debug(f'[Insert]\tPopulation ID\t{population_id}: {p}')
    # Chromosome
    chromosome_ids = insert.insert_all_chromosomes_for_species(
        conn, args, chromosome_count, species_id)
    logging.debug(f'[Insert]\tChromosome IDs\t{chromosome_ids}')
    # Line
    working_filepath = lines_filename.substitute(
        dict(chr="chr1",
             cwd=f"{args.working_directory}",
             shortname=species_shortname))
    try:
        if not os.path.isfile(working_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    working_filepath)
    except:
        raise

    line_ids = insert.insert_lines_from_file(
        conn, args, working_filepath, population_id
    )  # hard-coded substitue until just one file is used for lines
    logging.debug(f'[Insert]\tLine IDs\t{line_ids}')
    # Genotype Version
    reference_genome_id = find.find_line(conn, args,
                                         reference_genome_line_name,
                                         population_id)
    logging.debug(
        f'[Insert]\tReference Genome ID\t{reference_genome_id}, ({reference_genome_line_name}, {population_id})'
    )
    gv = genotype_version(genotype_version_name=genotype_version_assembly_name,
                          genotype_version=genotype_version_annotation_name,
                          reference_genome=reference_genome_id,
                          genotype_version_population=population_id)
    genotype_version_id = insert.insert_genotype_version(conn, args, gv)
    logging.debug(f'[Insert]\tGenome Version ID\t{genotype_version_id}')
    if genotype_version_id is None:
        raise Exception(
            f'Genotype version is None for parameters: {pformat(gv)}')

    # Growout, Type, and Location
    # NOTE(tparker): Unknown at this time
    ## Location
    ## Type
    ## Growout

    # Traits
    # Go through all the phenotype files available for the dataset and insert
    # the recorded traits for each.
    for phenotype_filepath in phenotype_filenames:
        try:
            if not os.path.isfile(phenotype_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        phenotype_filepath)
        except:
            raise
        traits = list(pd.read_csv(phenotype_filepath, index_col=0))
        trait_ids = insert.insert_traits_from_traitlist(
            conn, args, traits, phenotype_filepath)
        logging.debug(
            f'[Insert]\tTrait IDs for {phenotype_filepath}\t{trait_ids}')

    # # =====================================
    # # ========== Pipeline Design ==========
    # # =====================================
    # # GWAS Algorithm: "MLMM", "EMMAx", "GAPIT", "FarmCPU"
    # # Imputation Method: "impute to major allele"
    # # Kinship Algorithm: "loiselle"
    # # Population Structure Algorithm: "Eigenstrat"

    # Expected User Input
    # GWAS Algorithm
    gwas_algorithm_name = dp[
        'gwas_algorithm_name']  # According to Greg's README
    # Imputation Method
    imputation_method_name = dp[
        'imputation_method_name']  # Unknown, apparently it was done by someone named Sujan
    # Kinship Algorithm
    kinship_algorithm_name = dp[
        'kinship_algortihm_name']  # Placeholder, I don't know the exact string that should be used
    # Population Structure Algorithm
    population_structure_algorithm_name = dp[
        'population_structure_algorithm_name']  # This is a guess based on filename

    # Model Construction & Insertion
    # GWAS Algorithm
    ga = gwas_algorithm(gwas_algorithm_name)
    gwas_algorithm_id = insert.insert_gwas_algorithm(conn, args, ga)
    # Imputation Method
    im = imputation_method(imputation_method_name)
    imputation_method_id = insert.insert_imputation_method(conn, args, im)
    # Kinship Algorithm
    ka = kinship_algorithm(kinship_algorithm_name)
    kinship_algorithm_id = insert.insert_kinship_algorithm(conn, args, ka)
    # Population Structure Algorithm
    psa = population_structure_algorithm(population_structure_algorithm_name)
    population_structure_algorithm_id = insert.insert_population_structure_algorithm(
        conn, args, psa)

    # ===========================================
    # ========== Experiment Collection ==========
    # ===========================================
    # Phenotype (external source?)
    #       This needs to be standardized to a .pheno filetype.
    #       For now, it is the longFormat for the Maize282 datset
    #       5.mergedWeightNorm.LM.rankAvg.longFormat.csv, but for Setaria will be
    # Genotype (VCF output)
    # Variant (VCF output)

    # Expected User Input
    # Phenotype
    # NOTE(tparker): Define in earlier stage
    # Genotype
    genotype_filename = Template('${cwd}/${chr}_${shortname}.012')
    # Variants
    variants_filename = Template('${cwd}/${chr}_${shortname}.012.pos')

    # Model Construction & Insertion
    # Phenotype
    for phenotype_filepath in phenotype_filenames:
        try:
            if not os.path.isfile(phenotype_filepath):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        phenotype_filepath)
        except:
            raise
        phenotype_ids = insert.insert_phenotypes_from_file(
            conn, args, phenotype_filepath, population_id, phenotype_filepath)
        logging.debug(
            f'[Insert]\tPhenotype IDs for {phenotype_filepath}\t{phenotype_ids}'
        )

    # Genotype
    for c in range(1, chromosome_count + 1):
        chromosome_shortname = 'chr' + str(c)
        chromosome_id = find.find_chromosome(conn, args, chromosome_shortname,
                                             species_id)
        geno_filename = genotype_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        line_filename = lines_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        try:
            if not os.path.isfile(geno_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        geno_filename)
            if not os.path.isfile(line_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        line_filename)
        except:
            raise
        genotype_ids = insert.insert_genotypes_from_file(
            conn=conn,
            args=args,
            genotypeFile=geno_filename,
            lineFile=line_filename,
            chromosomeID=chromosome_id,
            populationID=population_id,
            genotype_versionID=genotype_version_id)
    # Variants
    for c in range(1, chromosome_count + 1):
        chromosome_shortname = 'chr' + str(c)
        chromosome_id = find.find_chromosome(conn, args, chromosome_shortname,
                                             species_id)
        variant_filename = variants_filename.substitute(
            dict(chr=chromosome_shortname,
                 cwd=f'{args.working_directory}',
                 shortname=species_shortname))
        try:
            if not os.path.isfile(variant_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        variant_filename)
        except:
            raise
        # insert.insert_variants_from_file(conn,
        #                                  args,
        #                                  variant_filename,
        #                                  species_id,
        #                                  chromosome_id)

        # NOTE(tparker): Changed variant insertion to the async version
        insert.insert_variants_from_file_async(conn, args, variant_filename,
                                               species_id, chromosome_id)

    # =========================================
    # ========== Pipeline Collection ==========
    # =========================================
    # Kinship
    # Setaria Kinship is stored in:
    ## /shares/ibaxter_share/gziegler/SetariaGWASPipeline/data/genotype/6.AstleBalding.synbreed.kinship.rda
    ## Exported the file to CSV using R
    ### load('6.AstleBalding.synbreed.kinship.rda')
    ### write.csv(kinship, '6.AstleBalding.synbreed.kinship.csv')
    # Population Structure

    # Expected User Input
    # Kinship
    # NOTE(tparker): Currently the database just stores the filename.
    #                There is no service to upload the file to database's
    #                host, so there's no single location to find the file
    #                I would like to find out why this is the case and if
    #                it would just be better to store it in the database and
    #                allow the user to export the table themselves as a CSV.
    kinship_filepath = f'{args.working_directory}/{dp["kinship_filename"]}'
    # Population Structure
    # NOTE(tparker): Same reasoning as the kinship file. There should be a way
    #                for the data to be stored in the database, not a
    population_structure_filepath = f'{args.working_directory}/{dp["population_structure_filename"]}'

    # Model Construction & Insertion
    # Kinship
    try:
        if not os.path.isfile(kinship_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    kinship_filepath)
        if not os.path.isfile(population_structure_filepath):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    population_structure_filepath)
    except:
        raise
    k = kinship(kinship_algorithm_id, kinship_filepath)
    kinship_id = insert.insert_kinship(conn, args, k)
    # Population Structure
    ps = population_structure(population_structure_algorithm_id,
                              population_structure_filepath)
    population_structure_id = insert.insert_population_structure(
        conn, args, ps)

    # =============================================
    # ================== Results ==================
    # =============================================
    # GWAS Run
    # GWAS Results

    # Expected User Input
    # GWAS Run & results
    if isinstance(dp['gwas_results_filename'], list):
        gwas_filenames = [
            f'{args.working_directory}/{filename}'
            for filename in dp['gwas_results_filename']
        ]  # allows for more than one gwas results/run file
    else:
        gwas_filenames = [
            f'{args.working_directory}/{dp["gwas_results_filename"]}'
        ]
    # The following values (0.2, 0.2, and 0.1) were all taken from the Maize282 import
    # NOTE(tparker): Make sure to double check with Greg on what the true values should be
    #                Also, double check the source of the pipeline to see if there is any
    #                indication what the values should be.
    missing_snp_cutoff_value = dp['missing_SNP_cutoff_value']
    missing_line_cutoff_value = dp['missing_line_cutoff_value']
    minor_allele_frequency_cutoff_value = dp[
        'minor_allele_frequency_cutoff_value']

    # Model Construction & Insertion
    # GWAS Run
    # NOTE(tparker): Check with Greg on which imputation method was used. I believe it was
    #                set by someone named Sujan because imputation was done beforehand
    for gwas_filename in gwas_filenames:
        try:
            if not os.path.isfile(gwas_filename):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        gwas_filename)
        except:
            raise
        imputation_method_id = find.find_imputation_method(
            conn, args, imputation_method_name)
        gwas_run_ids = insert.insert_gwas_runs_from_gwas_results_file(
            conn, args, gwas_filename, gwas_algorithm_id, genotype_version_id,
            missing_snp_cutoff_value, missing_line_cutoff_value,
            minor_allele_frequency_cutoff_value, imputation_method_id,
            kinship_id, population_structure_id)
        # GWAS Results
        gwas_result_ids = insert.insert_gwas_results_from_file(
            conn=conn,
            args=args,
            speciesID=species_id,
            gwas_results_file=gwas_filename,
            gwas_algorithm_ID=gwas_algorithm_id,
            missing_snp_cutoff_value=missing_snp_cutoff_value,
            missing_line_cutoff_value=missing_line_cutoff_value,
            imputationMethodID=imputation_method_id,
            genotypeVersionID=genotype_version_id,
            kinshipID=kinship_id,
            populationStructureID=population_structure_id,
            minor_allele_frequency_cutoff_value=
            minor_allele_frequency_cutoff_value)
Exemplo n.º 5
0
# Candidate column names for a SNP's base-pair position, in order of preference.
# NOTE(review): the GWAS results schema is not defined in this module; confirm
# these spellings against the pipeline's actual output before relying on them.
_BASEPAIR_COLUMNS = ('pos', 'bp', 'basepair')


def _parse_pcs(pcs):
  """Convert one raw 'PCs' cell into a list of ints, or None when it is missing.

  Args:
    pcs: cell value as produced by pandas -- a colon-delimited string such as
      "1:2:3", a bare number, or a missing value (NaN/None)

  Returns:
    list of int or None: parsed principal components, None for a missing cell

  Raises:
    ValueError: if a string cell contains a non-integer component
  """
  if isinstance(pcs, str):
    return [int(x) for x in pcs.split(":")]
  if pd.isna(pcs):
    return None
  # A column holding only single PCs is parsed by pandas as int/float rather
  # than str; treat the bare number as a one-element list instead of leaving
  # the result undefined.
  return [int(pcs)]


def _optional_cell(row, columns, name):
  """Return row[name] when the input file has that column, otherwise None."""
  return row[name] if name in columns else None


def insert_gwas_results_from_file(conn,
                                  args,
                                  speciesID,
                                  gwas_results_file,
                                  gwas_algorithm_ID,
                                  missing_snp_cutoff_value,
                                  missing_line_cutoff_value,
                                  imputationMethodID,
                                  genotypeVersionID,
                                  kinshipID,
                                  populationStructureID,
                                  minor_allele_frequency_cutoff_value):
  """Inserts a collection of GWAS results from a file into database

  Reads ``gwas_results_file`` as CSV and, for every row, looks up the trait,
  the matching GWAS run and the chromosome, builds a ``gwas_result`` and
  inserts it with ``insert_gwas_result``.

  Columns ``trait`` and ``chr`` are required. Columns ``nSNPs``, ``nLines``,
  ``PCs``, ``model``, ``modelAddedPval``, ``pval``, ``nullPval``, ``cofactor``
  and ``order`` are optional; an absent column is passed on as ``None``.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    speciesID (int): :ref:`species_id <species_class>`
    gwas_results_file (str): absolute path to input file
    gwas_algorithm_ID (int): :ref:`gwas_algorithm_id <gwas_algorithm_class>`
    missing_snp_cutoff_value (numeric):
    missing_line_cutoff_value (numeric):
    imputationMethodID (int): :ref:`imputation_method_id <imputation_method_class>`
    genotypeVersionID (int): :ref:`genotype_version_id <genotype_version_class>`
    kinshipID (int): :ref:`kinship_id <kinship_class>`
    populationStructureID (int): :ref:`population_structure_id <population_structure_class>`
    minor_allele_frequency_cutoff_value (numeric):

  Returns:
    list of int: GWAS Result IDs

  Raises:
    KeyError: if the file lacks the ``trait`` or ``chr`` column
    ValueError: if a ``PCs`` string contains a non-integer component
  """
  new_gwas_result_IDs = []
  df = pd.read_csv(gwas_results_file)
  columns = df.columns

  # Resolve the position column once; it does not change between rows.
  basepair_column = next(
      (name for name in _BASEPAIR_COLUMNS if name in columns), None)
  if basepair_column is None:
    # Previously the chromosome number was always stored as the base pair.
    # Keep that as a last resort so existing inputs still import, but make
    # the problem visible.
    logging.warning(
        "No base-pair position column (%s) found in %s; "
        "falling back to the 'chr' column",
        ", ".join(_BASEPAIR_COLUMNS), gwas_results_file)
    basepair_column = 'chr'

  has_pcs = 'PCs' in columns

  for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="GWAS Results"):
    traitID = find.find_trait(conn, args, row['trait'])
    gwas_run_ID = find.find_gwas_run(
        conn=conn,
        args=args,
        gwas_algorithm=gwas_algorithm_ID,
        missing_snp_cutoff_value=missing_snp_cutoff_value,
        missing_line_cutoff_value=missing_line_cutoff_value,
        gwas_run_imputation_method=imputationMethodID,
        gwas_run_trait=traitID,
        # nSNPs / nLines may be absent from the file; pass None in that case
        nsnps=_optional_cell(row, columns, 'nSNPs'),
        nlines=_optional_cell(row, columns, 'nLines'),
        gwas_run_genotype_version=genotypeVersionID,
        gwas_run_kinship=kinshipID,
        gwas_run_population_structure=populationStructureID,
        minor_allele_frequency_cutoff_value=minor_allele_frequency_cutoff_value)

    logging.debug("Found run ID: %s", gwas_run_ID)

    chromosome = "chr" + str(row['chr'])
    chromosomeID = find.find_chromosome(conn, args, chromosome, speciesID)
    basepair = row[basepair_column]

    # Recomputed for every row so a value can never leak from a previous row.
    pcs_list = _parse_pcs(row['PCs']) if has_pcs else None

    # Assume the data producer did not provide the data
    # NOTE(tparker): I want to replace this with more flexible code that does
    # not restrict the user's spelling of the column names
    new_gwas_result = gwas_result(
        gwas_result_chromosome=chromosomeID,
        basepair=basepair,
        gwas_result_gwas_run=gwas_run_ID,
        pval=_optional_cell(row, columns, 'pval'),
        cofactor=_optional_cell(row, columns, 'cofactor'),
        _order=_optional_cell(row, columns, 'order'),
        null_pval=_optional_cell(row, columns, 'nullPval'),
        model_added_pval=_optional_cell(row, columns, 'modelAddedPval'),
        model=_optional_cell(row, columns, 'model'),
        pcs=pcs_list)

    new_gwas_result_ID = insert_gwas_result(conn, args, new_gwas_result)
    new_gwas_result_IDs.append(new_gwas_result_ID)
  return new_gwas_result_IDs