Exemplo n.º 1
0
def textfile_chain(ctx, files, columns, formatflag, samples):
    """Output phenotypes in customizable text format."""
    # NOTE(review): the docstring is kept as the original one-liner because it
    # looks like it is surfaced as CLI help text - confirm before expanding it.
    #
    # ctx        -- context object whose ``obj`` dict carries the shared state
    #               used here ('phenovars', 'samples', 'constructor', 'pheno',
    #               'to_be_deleted').
    # files      -- sequence of file objects; each one is wrapped in a
    #               csv.DictReader. files[0] must exist.
    # columns    -- list of column names merged into ctx.obj['phenovars'].
    # formatflag -- requested output format; only 'csv' is accepted.
    # samples    -- list of sample ids merged into ctx.obj['samples']; skipped
    #               when empty/falsy.
    #
    # Returns the ``processor`` callable defined at the bottom.
    # Raises AssertionError when ``formatflag`` is anything but 'csv'.

    # Explicit raise instead of an ``assert`` statement: asserts are removed
    # when Python runs with -O, which would silently disable this check.
    # AssertionError is kept so callers see the same exception type as before.
    if formatflag != 'csv':
        raise AssertionError(
            "Sorry, textfile currently only supports csv output. "
            "This will change in future versions.")

    # dict.fromkeys() drops duplicates while keeping first-seen order.
    ctx.obj['phenovars'] = list(
        dict.fromkeys(ctx.obj.get('phenovars', []) + columns))
    if samples:
        ctx.obj['samples'] = list(
            dict.fromkeys(ctx.obj.get('samples', []) + samples))

    from pkpheno import TextFile
    # A constructor already stored in the context wins over TextFile.
    ctx.obj['constructor'] = ctx.obj.get('constructor', TextFile)

    def load(fileobj):
        # Build one phenotype object from a single input file.
        return ctx.obj['constructor'](csv.DictReader(fileobj),
                                      phenovars=ctx.obj['phenovars'],
                                      samples=ctx.obj.get('samples'))

    # The first file seeds the result; every later file is folded in with
    # combine_first().
    ctx.obj['pheno'] = load(files[0])
    for fileobj in files[1:]:
        pheno_new = load(fileobj)
        ctx.obj['pheno'] = ctx.obj['pheno'].combine_first(pheno_new)

    def processor(pheno):
        # Drop columns flagged for deletion, convert, write and hand back.
        doomed = ctx.obj.get('to_be_deleted')
        if doomed:
            pheno = pheno.drop(doomed, axis='columns')
        pheno = pheno.to_textfile()
        pheno.write()
        return pheno

    return processor
Exemplo n.º 2
0
def snptest(files, covariates, phenotypes, samples):
    """Output phenotypes in sample format for use with Snptest.

A properly formatted snptest sample (*.sam) file must contain the columns 'ID_1', 'ID_2', 'missing' and 'sex' in that
order, followed by any covariate columns and finally by any phenotype columns. If no columns with those exact names are
present in the input, then the program first tries to guess which input columns might contain the missing information.
The guessing uses common synonyms, eg mapping 'gender' to 'sex'. It also maps across supported formats mapping 'IID'
(Plink) to 'ID_1'. If the guessing fails, it then tries to fill in the missing information, which will produce
functional files for columns 'ID_2' (using 'ID_1' values) and 'missing' (using missing values).

Furthermore a snptest sample file must have phenotypes and covariates explicitly stated as such. Since this information
cannot be inferred from the data alone, the user should provide this using the '--covariates' and '--phenotypes'
options documented below.

\b
For the official docs on the sample format please refer to:
https://www.well.ox.ac.uk/~gav/qctool_v2/documentation/sample_file_formats.html

\b
Unofficial, but good (Scroll down):
https://jmarchini.org/file-formats/
"""
    # files      -- sequence of file objects, each read through csv.DictReader;
    #               files[0] must exist.
    # covariates -- forwarded to Snptest as ``covariates``.
    # phenotypes -- forwarded to Snptest as ``phenovars``.
    # samples    -- forwarded to Snptest as ``samples``.
    # Returns nothing; the merged result is emitted through its write() method.
    import pkpheno

    def load(fileobj):
        # One Snptest object per input file, all built with the same options.
        return pkpheno.Snptest(
            csv.DictReader(fileobj),
            covariates=covariates,
            phenovars=phenotypes,
            samples=samples,
        )

    # Seed with the first file, then fold each remaining file in.
    merged = load(files[0])
    for extra_file in files[1:]:
        incoming = load(extra_file)
        merged = merged.combine_first(incoming)
    merged.write()
Exemplo n.º 3
0
def rvtest_chain(ctx, files, columns, samples):
    """UNTESTED; Output phenotypes in psam-like format for RVtest.

RVtest phenotype files are very similar to the psam format. They are essentially plink2 files with a few caveats. The
names 'fatid' and 'matid' are used for paternal and maternal ids and 'sex' is encoded 0=males,1=females.

\b
For more on RVtest phenotype files, please refer to:
http://zhanxw.github.io/rvtests/#phenotype-file
"""
    # ctx     -- context object whose ``obj`` dict holds the shared state
    #            ('phenovars', 'samples', 'constructor', 'pheno',
    #            'to_be_deleted').
    # files   -- sequence of file objects, each read through csv.DictReader;
    #            files[0] must exist.
    # columns -- list of column names merged into ctx.obj['phenovars'].
    # samples -- list of sample ids merged into ctx.obj['samples']; skipped
    #            when empty/falsy.
    # Returns the ``processor`` callable defined at the bottom.
    from pkpheno.pkpheno import RVtest

    state = ctx.obj

    # dict.fromkeys() removes duplicates while preserving first-seen order.
    known_columns = state.get('phenovars', [])
    state['phenovars'] = list(dict.fromkeys(known_columns + columns))
    if samples:
        known_samples = state.get('samples', [])
        state['samples'] = list(dict.fromkeys(known_samples + samples))

    # A constructor already stored in the context wins over RVtest.
    state.setdefault('constructor', RVtest)

    def load(fileobj):
        # Build one phenotype object from a single input file.
        return state['constructor'](
            csv.DictReader(fileobj),
            phenovars=state['phenovars'],
            samples=state.get('samples'),
        )

    # Seed with the first file, then fold each remaining file in.
    state['pheno'] = load(files[0])
    for extra_file in files[1:]:
        incoming = load(extra_file)
        state['pheno'] = state['pheno'].combine_first(incoming)

    def processor(pheno):
        # Drop columns flagged for deletion, convert, write and hand back.
        doomed = ctx.obj.get('to_be_deleted')
        if doomed:
            pheno = pheno.drop(doomed, axis='columns')
        converted = pheno.to_rvtest()
        converted.write()
        return converted

    return processor
Exemplo n.º 4
0
def plink_chain(ctx, files, columns, fam, samples):
    """Output phenotypes in psam/fam format for use with Plink.

A properly formatted psam file has in addition to the phenotype columns one or more of the following recognizable
columns: 'IID' (individual ID; required), 'FID', 'SID', 'PAT', 'MAT' and 'SEX'. If no columns with those exact names
are present in the input, then the program tries to guess which input columns might contain the missing information.
The guessing uses common synonyms, eg mapping 'gender' to 'SEX'. It also maps across supported formats mapping 'ID_1'
(snptest) to 'IID'.

For more on the psam format please refer to:
https://www.cog-genomics.org/plink/2.0/formats#psam

A properly formatted fam file (plink1.9) has *no* header, but expects the following six columns in exact order: 'FID',
'IID', 'PAT', 'MAT', 'SEX', and one final phenotype column. The plink1.9 fam format only supports one phenotype. To work
with more than one phenotype in plink1.9 the psam files prepared by this program are designed to be readable as an
alternate phenotype file in plink1.9.

\b
For more on alternate phenotype files in plink1.9, please refer to:
https://www.cog-genomics.org/plink/1.9/input#pheno

\b
For more on the fam format please refer to:
https://www.cog-genomics.org/plink/1.9/formats#fam
"""
    # ctx     -- context object whose ``obj`` dict holds the shared state
    #            ('phenovars', 'samples', 'constructor', 'pheno', 'fam',
    #            'to_be_deleted').
    # files   -- sequence of file objects, each read through csv.DictReader;
    #            files[0] must exist.
    # columns -- list of phenotype column names; mutually exclusive with fam.
    # fam     -- single phenotype column; when given it replaces ``columns``
    #            and the output is written without a header.
    # samples -- list of sample ids; skipped when empty/falsy.
    # Returns the ``processor`` callable defined at the bottom.
    # Raises AssertionError when both ``columns`` and ``fam`` are given.

    # Validate before importing or touching ctx.obj. This is an explicit raise
    # rather than an ``assert`` statement because asserts are removed under
    # ``python -O``; AssertionError is kept so the exception type is unchanged.
    if columns and fam:
        raise AssertionError(
            "'--columns' and '--fam' are mutually exclusive; "
            "please only specify one of them.")

    from pkpheno.pkpheno import Psam

    if fam:
        columns = [fam]
        ctx.obj['fam'] = True

    # dict.fromkeys() drops duplicates while keeping first-seen order.
    ctx.obj['phenovars'] = list(
        dict.fromkeys(ctx.obj.get('phenovars', []) + columns))
    if samples:
        previous = ctx.obj.get('samples')
        # With no earlier samples the new ones are stored as given.
        ctx.obj['samples'] = (list(dict.fromkeys(previous + samples))
                              if previous else samples)

    # A constructor already stored in the context wins over Psam.
    ctx.obj['constructor'] = ctx.obj.get('constructor', Psam)

    def load(fileobj):
        # Build one phenotype object from a single input file.
        return ctx.obj['constructor'](csv.DictReader(fileobj),
                                      phenovars=ctx.obj['phenovars'],
                                      samples=ctx.obj.get('samples'))

    # The first file seeds the result; every later file is folded in with
    # combine_first().
    ctx.obj['pheno'] = load(files[0])
    for fileobj in files[1:]:
        pheno_new = load(fileobj)
        ctx.obj['pheno'] = ctx.obj['pheno'].combine_first(pheno_new)

    def processor(pheno):
        # Drop columns flagged for deletion, convert, write and hand back.
        doomed = ctx.obj.get('to_be_deleted')
        if doomed:
            pheno = pheno.drop(doomed, axis='columns')
        pheno = pheno.to_psam()
        # No header line when the 'fam' flag was recorded in the context.
        pheno.write(header=not ctx.obj.get('fam'))
        return pheno

    return processor
Exemplo n.º 5
0
def snptest_chain(ctx, files, covariates, phenotypes, samples):
    """Output phenotypes in sample format for use with Snptest.

A properly formatted snptest sample (*.sam) file must contain the columns 'ID_1', 'ID_2', 'missing' and 'sex' in that
order, followed by any covariate columns and finally by any phenotype columns. If no columns with those exact names are
present in the input, then the program first tries to guess which input columns might contain the missing information.
The guessing uses common synonyms, eg mapping 'gender' to 'sex'. It also maps across supported formats mapping 'IID'
(Plink) to 'ID_1'. If the guessing fails, it then tries to fill in the missing information, which will produce
functional files for columns 'ID_2' (using 'ID_1' values) and 'missing' (using missing values).

Furthermore a snptest sample file must have phenotypes and covariates explicitly stated as such. Since this information
cannot be inferred from the data alone, the user should provide this using the '--covariates' and '--phenotypes'
options documented below.

\b
For the official docs on the sample format please refer to:
https://www.well.ox.ac.uk/~gav/qctool_v2/documentation/sample_file_formats.html

\b
Unofficial, but good (Scroll down):
https://jmarchini.org/file-formats/
"""
    # ctx        -- context object whose ``obj`` dict holds the shared state
    #               ('phenovars', 'samples', 'constructor', 'pheno',
    #               'to_be_deleted').
    # files      -- sequence of file objects, each read through
    #               csv.DictReader; files[0] must exist.
    # covariates -- list of column names; merged into ctx.obj['phenovars'] and
    #               later passed to to_snptest().
    # phenotypes -- list of column names merged into ctx.obj['phenovars'].
    # samples    -- list of sample ids merged into ctx.obj['samples']; skipped
    #               when empty/falsy.
    # Returns the ``processor`` callable defined at the bottom.
    from pkpheno.pkpheno import Snptest

    state = ctx.obj

    # Covariates come before phenotypes; dict.fromkeys() removes duplicates
    # while preserving first-seen order.
    requested = state.get('phenovars', []) + covariates + phenotypes
    state['phenovars'] = list(dict.fromkeys(requested))
    if samples:
        known_samples = state.get('samples', [])
        state['samples'] = list(dict.fromkeys(known_samples + samples))

    # A constructor already stored in the context wins over Snptest.
    state.setdefault('constructor', Snptest)

    def load(fileobj):
        # Build one phenotype object from a single input file.
        return state['constructor'](
            csv.DictReader(fileobj),
            phenovars=state['phenovars'],
            samples=state.get('samples'),
        )

    # Seed with the first file, then fold each remaining file in.
    state['pheno'] = load(files[0])
    for extra_file in files[1:]:
        incoming = load(extra_file)
        state['pheno'] = state['pheno'].combine_first(incoming)

    def processor(pheno):
        # Drop columns flagged for deletion, convert, write and hand back.
        doomed = ctx.obj.get('to_be_deleted')
        if doomed:
            pheno = pheno.drop(doomed, axis='columns')
        converted = pheno.to_snptest(covariates=covariates)
        converted.write()
        return converted

    return processor
Exemplo n.º 6
0
def main(files, columns, samplefiles):
    """NOT IMPLEMENTED YET. Output CSV file suitable for PEP.

Portable Encapsulated Projects (PEP for short) is a community effort to facilitate the portability, reusability and
durability of sample metadata.

\b
For more on the PEP community effort, please refer to:
http://pep.databio.org/en/latest/
"""
    # files       -- sequence of file objects, each read through
    #                csv.DictReader; files[0] must exist.
    # columns     -- forwarded to Pheno.PEP as ``phenovars``.
    # samplefiles -- currently only echoed; not used to build the output.
    # Returns nothing; the merged result is emitted through its write() method.
    import pkpheno as Pheno

    click.echo(samplefiles)

    # The first file seeds the result; every later file is folded in with
    # combine_first().
    pheno = Pheno.PEP(csv.DictReader(files[0]), phenovars=columns)
    for fileobj in files[1:]:
        pheno_new = Pheno.PEP(csv.DictReader(fileobj), phenovars=columns)
        pheno = pheno.combine_first(pheno_new)
    pheno.write()
Exemplo n.º 7
0
def rvtest(files, columns, samples):
    """UNTESTED; Output phenotypes in psam-like format for RVtest.

RVtest phenotype files are very similar to the psam format. They are essentially plink2 files with a few caveats. The
names 'fatid' and 'matid' are used for paternal and maternal ids and 'sex' is encoded 0=males,1=females.

\b
For more on RVtest phenotype files, please refer to:
http://zhanxw.github.io/rvtests/#phenotype-file
"""
    # files   -- sequence of file objects, each read through csv.DictReader;
    #            files[0] must exist.
    # columns -- forwarded to Pheno.RVtest as ``phenovars``.
    # samples -- forwarded to Pheno.RVtest as ``samples``.
    # Returns nothing; the merged result is emitted through its write() method.
    import pkpheno as Pheno

    def load(fileobj):
        # One RVtest object per input file, all built with the same options.
        return Pheno.RVtest(csv.DictReader(fileobj),
                            phenovars=columns,
                            samples=samples)

    # The first file seeds the result; every later file is folded in with
    # combine_first().
    pheno = load(files[0])
    for fileobj in files[1:]:
        pheno_new = load(fileobj)
        pheno = pheno.combine_first(pheno_new)
    pheno.write()
Exemplo n.º 8
0
def plink(files, columns, fam, samples):
    """Output phenotypes in psam/fam format for use with Plink.

A properly formatted psam file has in addition to the phenotype columns one or more of the following recognizable
columns: 'IID' (individual ID; required), 'FID', 'SID', 'PAT', 'MAT' and 'SEX'. If no columns with those exact names
are present in the input, then the program tries to guess which input columns might contain the missing information.
The guessing uses common synonyms, eg mapping 'gender' to 'SEX'. It also maps across supported formats mapping 'ID_1'
(snptest) to 'IID'.

For more on the psam format please refer to:
https://www.cog-genomics.org/plink/2.0/formats#psam

A properly formatted fam file (plink1.9) has *no* header, but expects the following six columns in exact order: 'FID',
'IID', 'PAT', 'MAT', 'SEX', and one final phenotype column. The plink1.9 fam format only supports one phenotype. To work
with more than one phenotype in plink1.9 the psam files prepared by this program are designed to be readable as an
alternate phenotype file in plink1.9.

\b
For more on alternate phenotype files in plink1.9, please refer to:
https://www.cog-genomics.org/plink/1.9/input#pheno

\b
For more on the fam format please refer to:
https://www.cog-genomics.org/plink/1.9/formats#fam
"""
    # files   -- sequence of file objects, each read through csv.DictReader;
    #            files[0] must exist.
    # columns -- phenotype column names; mutually exclusive with fam.
    # fam     -- single phenotype column; when given it replaces ``columns``
    #            and the output is written without a header.
    # samples -- forwarded to Pheno.Psam as ``samples``.
    # Returns nothing; the merged result is emitted through its write() method.
    # Raises AssertionError when both ``columns`` and ``fam`` are given.

    # Validate before importing anything. This is an explicit raise rather
    # than an ``assert`` statement because asserts are removed under
    # ``python -O``; AssertionError is kept so the exception type is unchanged.
    if columns and fam:
        raise AssertionError(
            "'--columns' and '--fam' are mutually exclusive; "
            "please only specify one of them.")

    import pkpheno as Pheno

    if fam:
        columns = [fam]

    def load(fileobj):
        # One Psam object per input file, all built with the same options.
        return Pheno.Psam(csv.DictReader(fileobj),
                          phenovars=columns,
                          samples=samples)

    # The first file seeds the result; every later file is folded in with
    # combine_first().
    pheno = load(files[0])
    for fileobj in files[1:]:
        pheno_new = load(fileobj)
        pheno = pheno.combine_first(pheno_new)
    # No header line when a fam column was requested.
    pheno.write(header=not fam)