Пример #1
0
def qc_sd(args):
    """
    %prog sdtest input.matrix output.matrix

    run quality control on segregation distortions in each SNP.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and is kept verbatim.
    #
    # Workflow: read the genotype matrix one chromosome at a time, count the
    # two homozygous genotypes per marker, chi-square test those counts
    # against the ratio expected for the population type, and keep markers
    # that have more than 5 calls of each homozygote and a p-value at or
    # above --sig_cutoff.
    parser = OptionParser(qc_sd.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("-o", "--output", help=SUPPRESS_HELP)
    parser.add_option('--population', default='RIL',
                      choices=('RIL', 'F2', 'BCFn'),
                      help="population type")
    parser.add_option('--sig_cutoff', default=1e-2, type='float',
                      help="set the chi square test cutoff. 0(less strigent) to 1(more strigent)")
    fmt_group = OptionGroup(parser, "format options")
    parser.add_option_group(fmt_group)
    fmt_group.add_option('--homo1', default="A",
                         help='character for homozygous genotype')
    fmt_group.add_option("--homo2", default='B',
                         help="character for alternative homozygous genotype")
    fmt_group.add_option('--hete', default='X',
                         help='character for heterozygous genotype')
    fmt_group.add_option('--missing', default='-',
                         help='character for missing value')
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    # The hidden -i/-o options take precedence over the positional names.
    positional_in, positional_out = args
    inputmatrix = opts.input or positional_in
    outputmatrix = opts.output or positional_out

    if opts.sig_cutoff <= 0 or opts.sig_cutoff >= 1:
        eprint('the cutoff chi square test should be smaller than 1 and larger than 0')
        sys.exit(1)

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix, delim_whitespace=True,
                             index_col=[0, 1], iterator=True)

    genotype_chars = [opts.homo1, opts.homo2, opts.hete, opts.missing]
    genotype_codes = [0, 2, 1, 9]
    kept_frames = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        chrom_df = map_reader.get_chunk(chr_nums[chrom])
        coded = chrom_df.replace(genotype_chars, genotype_codes)

        # Observed homozygote counts per marker (row).
        n_homo1 = (coded == 0).sum(axis=1)
        n_homo2 = (coded == 2).sum(axis=1)
        n_homo = n_homo1 + n_homo2

        # Expected counts: 3:1 for BCFn, 1:1 for the other population types.
        if opts.population == 'BCFn':
            expect1, expect2 = n_homo * 0.75, n_homo * 0.25
        else:
            expect1, expect2 = n_homo * 0.5, n_homo * 0.5

        counts = pd.DataFrame({'ob0': n_homo1, 'ob2': n_homo2,
                               'exp0': expect1, 'exp2': expect2})
        enough_calls = ((counts['ob0'] > 5) & (counts['ob2'] > 5)).values
        test = chisquare([counts['ob0'], counts['ob2']],
                         [counts['exp0'], counts['exp2']])
        not_distorted = test.pvalue >= opts.sig_cutoff
        kept_frames.append(chrom_df.loc[(enough_calls & not_distorted), :])

    df1 = pd.concat(kept_frames)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
Пример #2
0
def bin(args):
    """
    %prog bin corrected.matrix output.matrix

    compress markers byy merging consecutive markers with same genotypes
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # Workflow: read the matrix chromosome by chromosome, let bin_markers()
    # pick the markers to keep for every chromosome holding more than one
    # marker, write the kept markers to the output matrix, and write the
    # representative-marker / merged-markers table to --logfile when any
    # binning happened.
    parser = OptionParser(bin.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("-o", "--output", help=SUPPRESS_HELP)
    parser.add_option(
        '--diff_num', default=0, type='int',
        help=('number of different genotypes between two consecutive markers less than or equal to this value will be merged. '
              '        missing values will not be counted.'))
    parser.add_option(
        '--missing', default='-',
        help='character for missing value in genotype matrix file')
    parser.add_option(
        "--logfile", default='GC.bin.log',
        help="specify the file saving running info")
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    # The hidden -i/-o options take precedence over the positional names.
    positional_in, positional_out = args
    inputmatrix = opts.input or positional_in
    outputmatrix = opts.output or positional_out

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outputmatrix))
        sys.exit(1)

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix, delim_whitespace=True,
                             index_col=[0, 1], iterator=True)
    kept_frames, bin_tables = [], []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        chrom_df = map_reader.get_chunk(chr_nums[chrom])

        # A chromosome with a single marker has nothing to merge.
        if chrom_df.shape[0] == 1:
            kept_frames.append(chrom_df)
            continue

        represent_idx, block_idx, results = bin_markers(
            chrom_df.loc[chrom],
            diff=opts.diff_num,
            missing_value=opts.missing)
        kept_frames.append(chrom_df.loc[(chrom, results), :])
        if represent_idx:
            bin_tables.append(pd.DataFrame({
                'chr': chrom,
                'representative_marker': represent_idx,
                'markers': block_idx,
            }))

    df1 = pd.concat(kept_frames)
    df1.to_csv(outputmatrix, sep='\t', index=True)

    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before compression.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after compression.'.format(after_snp_num, pct))

    if bin_tables:
        pd.concat(bin_tables).to_csv(opts.logfile, sep='\t', index=False)
        print('Check {} for binning details.'.format(opts.logfile))
Пример #3
0
 def dispatch(self, globals):
     """Run the sub-command named by ``sys.argv[1]``.

     ``globals`` is a mapping from action name to a callable; the selected
     callable is invoked with the remaining command-line arguments
     (``sys.argv[2:]``).

     When no action is given, or the action is not in
     ``self.valid_actions``, the help text is printed and the process exits
     with status 1. For an unknown action an error line and the closest
     matching valid actions are written first.
     """
     from difflib import get_close_matches
     meta = 'ACTION'
     if len(sys.argv) == 1:
         self.print_help()
         # print_help() is not visible here and presumably exits on its own;
         # exit explicitly so we never fall through to sys.argv[1].
         sys.exit(1)
     action = sys.argv[1]
     if action not in self.valid_actions:
         eprint("[error] %s not a valid %s\n" % (action, meta))
         alt = get_close_matches(action, self.valid_actions)
         # eprint takes only the message everywhere else in this file;
         # passing sys.stderr as an extra positional argument was a leftover.
         eprint("Did you mean one of these?\n\t%s\n" % (", ".join(alt)))
         self.print_help()
         # Same guard as above: never look up an invalid action.
         sys.exit(1)
     globals[action](sys.argv[2:])
Пример #4
0
def mstmap2allmaps(args):
    """
    %prog mstmap2allmaps mstmap_fn allmaps_fn

    convert mstmap results to the file format Allmaps required
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads the MSTmap result file block by block (one block per
    # "group " header, via read_block) and writes one tab-separated line per
    # marker: scaffold, position, linkage group name, genetic position.
    # Marker names are expected to look like `<scaffold>_<position>`; the
    # text after the last underscore is taken as the position.
    # Linkage groups whose size does not exceed --min_markers are skipped and
    # reported on stdout.
    p = OptionParser(mstmap2allmaps.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--min_markers',
                 default=10,
                 type='int',
                 help='set the cutoff of marker numbers in a linkage group')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # The hidden -i/-o options take precedence over the positional names.
    mst_in, allmp_out = args
    inputmstmap = opts.input or mst_in
    outputallmp = opts.output or allmp_out

    if Path(outputallmp).exists():
        # Spelling fixed so the message matches the other sub-commands.
        eprint('ERROR: Filename collision. The future output file `{}` exists'.
               format(outputallmp))
        sys.exit(1)

    # Open the input first so a missing input does not leave behind an
    # output file; both handles are closed even if parsing fails.
    with open(inputmstmap) as fp, open(outputallmp, 'w') as fout:
        fout.write('Scaffold_ID\tScaffold_Position\tLG\tGenetic_Position\n')
        for header, seq in read_block(fp, "group "):
            lg_name = header.split()[-1]
            seq = list(seq)
            # Each block carries 5 non-marker lines in addition to markers.
            seq_len = len(seq) - 5
            if seq_len <= opts.min_markers:
                # Only small groups are reported as omitted (the original
                # printed this line for every group).
                print('markers in {} is less than {}, omit...'.format(
                    lg_name, opts.min_markers))
                continue
            for s in seq:
                # Skip blank lines and ';' comment lines inside the block.
                if s.strip() == '' or s[0] == ';':
                    continue
                marker, genetic_pos = s.split()
                scaffold, _, pos = marker.rpartition('_')
                fout.write('{}\t{}\t{}\t{}\n'.format(scaffold, pos, lg_name,
                                                     genetic_pos))
Пример #5
0
def dpp(args):
    '''
    %prog training_data_dir label_fn model_results_dir

    Run dpp regression model
    '''
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # For each of the --lr_n learning rates this creates a model directory
    # and writes a `<model>.slurm` job script that runs
    # `python -m schnablelab.CNN.dpp_<problem_type>`; nothing is submitted.
    parser = OptionParser(dpp.__doc__)
    parser.add_option('--problem_type', default='classification',
                      choices=('classification', 'regression'),
                      help='specify your problem type')
    parser.add_option('--tensorboard', default='infer',
                      help='tensorboard dir name')
    parser.add_option('--epoch', default=500,
                      help='number of epoches. set to 500 for leaf couting problem')
    parser.add_option('--split_ratio', default=0.2,
                      help='the ratio of training dataset used for testing')
    parser.add_option('--lr_n', default=1, type='int',
                      help='train model with differnt learning rates. if n=1: set lr to 0.001. if n>1: try differnt lr from 1e-2 to 1e-5 n times')
    parser.set_slurm_opts(gpu=True)
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    training_dir, label_fn, model_name = args
    if opts.tensorboard == 'infer':
        tb_dir_name = 'tensorboard_{}'.format(model_name)
    else:
        tb_dir_name = opts.tensorboard

    out_fns = fns(model_name, tb_dir=tb_dir_name, n=opts.lr_n)
    for idx in range(opts.lr_n):
        model_dir = out_fns.model_name[idx]

        # Creating the directory doubles as the collision check.
        try:
            os.mkdir(model_dir)
        except FileExistsError:
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(model_dir))
            sys.exit(1)

        cmd = 'python -m schnablelab.CNN.dpp_%s %s %s %s %s %s %s %s' % (
            opts.problem_type, training_dir, label_fn, model_dir,
            out_fns.tb_dirs[idx], opts.epoch, opts.split_ratio,
            out_fns.lrs[idx])
        header = Slurm_gpu_header % (opts.time, opts.memory,
                                     model_dir, model_dir, model_dir)
        script = ''.join(
            [header, 'module load anaconda\nsource activate MCY\n', cmd])

        with open('%s.slurm' % model_dir, 'w') as job_file:
            job_file.write(script)
        print('slurm file %s.slurm has been created, now you can sbatch your job file.' % model_dir)
Пример #6
0
def sortPos(args):
    """
    %prog sortPos input.map output.sorted.map

    sort markers based on position 
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads a whitespace-delimited matrix, sorts the rows by its first two
    # columns and writes the result tab-separated to the output file.
    # Exits with status 1 (after an error message) if the output file
    # already exists.
    p = OptionParser(sortPos.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args

    if Path(outmap).exists():
        # The original formatted the undefined name `outputmatrix` here,
        # which raised NameError instead of reporting the collision.
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outmap))
        sys.exit(1)
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(inmap, sep=r'\s+')
    idx_col = list(df.columns[0:2])
    df.sort_values(idx_col).to_csv(outmap, sep='\t', index=False)
    print('Done! Check %s file.' % outmap)
Пример #7
0
def format(args):
    """
    %prog format corrected.matrix 

    convert corrected genotype matrix file to other formats(mstmap, joinmap, r/qtl) for the genetic mapping software.
    Example:
    `python -m schnablelab.imputation.GC format test.map --mstmap --mstmap_pop_type RIL2`
    will generate `test.mstmap` for MSTmap use.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # One output file is produced per requested format, named after the
    # input file with its last extension replaced by `.mstmap`,
    # `.joinmap.xlsx` or `.rqtl.csv`. An existing output file aborts the run.
    parser = OptionParser(format.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("--mstmap", default=False, action="store_true",
                      help='convert to MSTmap format')
    parser.add_option("--rqtl", default=False, action="store_true",
                      help='convert to R/qtl format')
    parser.add_option("--joinmap", default=False, action="store_true",
                      help='convert to JoinMap format')

    input_group = OptionGroup(parser, "format options for input matrix file")
    parser.add_option_group(input_group)
    input_group.add_option('--homo1', default="A", choices=('a', 'A'),
                           help='character for homozygous genotype')
    input_group.add_option("--homo2", default='B', choices=('b', 'B'),
                           help="character for alternative homozygous genotype")
    input_group.add_option('--hete', default='X', choices=('h', 'H', 'X'),
                           help='character for heterozygous genotype')
    input_group.add_option('--missing', default='-', choices=('-', 'U'),
                           help='character for missing value')

    mst_group = OptionGroup(parser, "parameters for MSTmap")
    parser.add_option_group(mst_group)
    mst_group.add_option(
        '--mstmap_pop_type',
        help=('Possible values are DH and RILd, where d is any natural number. '
              '                For example, RIL6 means a RIL population at generation 6. '
              '                You should use RIL2 for F2. Use DH for BC1, DH and Hap.'))
    mst_group.add_option(
        "--population_name", default='LinkageGroup',
        help="ives a name for the mapping population. It can be any string of letters (a-z, A-Z) or digits (0-9)")
    mst_group.add_option(
        '--distance_function', default='kosambi',
        choices=('kosambi', 'haldane'),
        help="choose Kosambi's and Haldane's distance functions")
    mst_group.add_option(
        '--cut_off_p_value', default=0.000001,
        help='specifies the threshold to be used for clustering the markers into LGs')
    mst_group.add_option(
        '--no_map_dist', default=15,
        help='check mstmap manual for details')
    mst_group.add_option(
        '--no_map_size', default=5,
        help='check mstmap manual for details')
    mst_group.add_option(
        '--missing_threshold', default=0.4,
        help='any marker with more than this value will be removed completely without being mapped')
    mst_group.add_option(
        '--estimation_before_clustering', default='no',
        choices=('yes', 'no'),
        help='if yes, MSTmap will try to estimate missing data before clustering the markers into linkage groups')
    mst_group.add_option(
        '--detect_bad_data', default='yes', choices=('yes', 'no'),
        help='if yes turn on the error detection feature in MSTmap')
    mst_group.add_option(
        '--objective_function', default='COUNT', choices=('COUNT', 'ML'),
        help='specifies the objective function')

    jm_group = OptionGroup(parser, "parameters for JoinMap and R/qtl")
    parser.add_option_group(jm_group)
    jm_group.add_option(
        '--pop_type', default='RIL', choices=('RIL', 'F2'),
        help='specify mapping population type. Contact me if you need supports for other population types')

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    # The hidden -i option takes precedence over the positional name.
    positional_in, = args
    inputmatrix = opts.input or positional_in

    if not (opts.rqtl or opts.joinmap or opts.mstmap):
        eprint("ERROR: add at least one output format option.")
        sys.exit(1)

    def claim_output(suffix):
        # Build the output name from the input name and refuse to overwrite.
        target = inputmatrix.rsplit(".", 1)[0] + suffix
        if Path(target).exists():
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(target))
            sys.exit(1)
        return target

    def recode(frame, targets):
        # Replace the input genotype characters that differ from the
        # characters the target format wants (homo1, homo2, hete, missing).
        sources = (opts.homo1, opts.homo2, opts.hete, opts.missing)
        changes = [(old, new) for old, new in zip(sources, targets)
                   if old != new]
        if changes:
            frame = frame.replace([old for old, _ in changes],
                                  [new for _, new in changes])
        return frame

    def marker_names(frame):
        # `<first column>_<second column>` for every row.
        return (frame.iloc[:, 0].astype('str') + '_'
                + frame.iloc[:, 1].astype('str'))

    if opts.mstmap:
        pop_type = opts.mstmap_pop_type
        if not pop_type:
            eprint("ERROR: please choose population type for mstmap format.")
            sys.exit(1)
        if not pop_type.startswith('RIL') and pop_type != 'DH':
            eprint('ERROR: only RILd and DH supported in MSTmap')
            sys.exit(1)

        opf = claim_output('.mstmap')

        df = pd.read_csv(inputmatrix, delim_whitespace=True)
        cols = ['locus_name'] + list(df.columns[2:])
        df['locus_name'] = marker_names(df)
        df = df[cols]
        print(df.head())
        snp_num, sm_num = df.shape[0], df.shape[1] - 1

        # Header first, then the genotype table appended below it.
        with open(opf, 'w') as header_out:
            header_out.write(mst_header.format(
                opts.mstmap_pop_type, opts.population_name,
                opts.distance_function, opts.cut_off_p_value,
                opts.no_map_dist, opts.no_map_size, opts.missing_threshold,
                opts.estimation_before_clustering, opts.detect_bad_data,
                opts.objective_function, snp_num, sm_num))
        df.to_csv(opf, sep='\t', index=False, mode='a')
        print('Done, check file {}!'.format(opf))

    if opts.joinmap:
        opf = claim_output('.joinmap.xlsx')

        df = recode(pd.read_csv(inputmatrix, delim_whitespace=True),
                    ('a', 'b', 'h', '-'))
        cols = ['locus_name', 'Classification'] + list(df.columns[2:])
        df['locus_name'] = marker_names(df)
        df['Classification'] = '(a,h,b)'
        df = df[cols]
        df.to_excel(opf)
        print(
            'Done! Now you can load the genotype data into the JoinMap project from the MS-Excel spreadsheet {} to a dataset node.'
            .format(opf))

    if opts.rqtl:
        opf = claim_output('.rqtl.csv')

        df = recode(pd.read_csv(inputmatrix, delim_whitespace=True),
                    ('A', 'B', 'H', '-'))
        cols = ['id'] + list(df.columns[2:])
        df['id'] = marker_names(df)
        df = df[cols]

        # Prepend a row of 1s (index -1 shifted to 0 and sorted to the top)
        # and blank out its `id` cell.
        df.loc[-1] = 1
        df.index = df.index + 1
        df = df.sort_index()
        df.iloc[0, 0] = np.nan
        df.to_csv(opf, index=False, na_rep='')
        print('Done, check file {}!'.format(opf))
Пример #8
0
def qc_missing(args):
    """
    %prog filtermissing input.matrix output.matrix

    run quality control of the missing genotypes in the input.matrix before starting the correction.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Step 1: read the matrix chromosome by chromosome and keep markers whose
    #         missing rate is at most --cutoff_snp.
    # Step 2 (only with --rm_bad_samples): among the kept markers, keep
    #         samples whose missing rate is at most --cutoff_sample.
    # The filtered matrix is written tab-separated to the output file.
    p = OptionParser(qc_missing.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option(
        '--cutoff_snp',
        default=0.5,
        type='float',
        help="SNP with missing rate higher than this value will be removed")
    p.add_option(
        '--rm_bad_samples',
        default=False,
        action="store_true",
        help=
        'remove bad samples after controlling the SNPs with high missing rate')
    p.add_option(
        '--cutoff_sample',
        type='float',
        help=
        "sample missing rate higher than this value will be removed after controlling the SNP missing rate"
    )
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', help='character for missing value')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Compare against None: a cutoff of 0 is a valid (strictest) setting and
    # must not be mistaken for "option not given".
    if opts.rm_bad_samples and opts.cutoff_sample is None:
        eprint(
            'missing value cutoff for --cutoff_sample option must be specified when --rm_bad_samples added.'
        )
        sys.exit(1)

    # The hidden -i/-o options take precedence over the positional names.
    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    map_reader = pd.read_csv(inputmatrix,
                             sep=r'\s+',
                             index_col=[0, 1],
                             iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        df_chr_tmp_num = df_chr_tmp.replace(
            [opts.homo1, opts.homo2, opts.hete, opts.missing], [0, 2, 1, 9])
        sample_num = df_chr_tmp_num.shape[1]
        # Per-marker missing rate, computed for all rows at once.
        missing_rate = (df_chr_tmp_num == 9).sum(axis=1) / sample_num
        good_rates = missing_rate <= opts.cutoff_snp
        good_snp = df_chr_tmp.loc[good_rates, :]
        Good_SNPs.append(good_snp)
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num, before_sm_num = df1.shape
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(
        after_snp_num, pct))

    if opts.rm_bad_samples:
        print('start quality control on samples')
        # Per-sample missing rate over the markers that survived step 1.
        sample_missing_rate = (df1 == opts.missing).sum(axis=0) / after_snp_num
        good_samples = sample_missing_rate <= opts.cutoff_sample
        df2 = df1.loc[:, good_samples]
        after_sm_num = df2.shape[1]
        pct_sm = after_sm_num / float(before_sm_num) * 100
        print('{} samples before quality control.'.format(before_sm_num))
        # This line reports samples; the original wording said "markers".
        print('{}({:.1f}%) samples left after the quality control.'.format(
            after_sm_num, pct_sm))
        df2.to_csv(outputmatrix, sep='\t', index=True)
    else:
        df1.to_csv(outputmatrix, sep='\t', index=True)
Пример #9
0
def correct(args):
    """
    %prog correct config.txt input.matrix 

    Correct wrong genotype calls and impute missing values in biparental populations
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # For every chromosome and every sample column, the genotype sequence is
    # recoded to numbers and passed through CorrectOO repeatedly (at most
    # --itertimes rounds) until the number of corrected calls grows by no
    # more than 1% between rounds. Columns with no more markers than the
    # configured window size are left untouched. The corrected matrix is
    # written to `<input>.corrected.map` (or `<--opp>.map`); with --debug a
    # second file shows `corrected(original)` for every changed cell.
    p = OptionParser(correct.__doc__)
    p.add_option("-c", "--configfile", help=SUPPRESS_HELP)
    p.add_option("-m", "--matrixfile", help=SUPPRESS_HELP)
    p.add_option('--itertimes',
                 default=7,
                 type='int',
                 help='maximum correction times to reach the stablized status')
    q = OptionGroup(p, "output options")
    p.add_option_group(q)
    q.add_option('--opp',
                 default="'infer'",
                 help='specify the prefix of the output file names')
    q.add_option("--logfile",
                 default='GC.correct.log',
                 help="specify the file saving running info")
    q.add_option(
        '--debug',
        default=False,
        action="store_true",
        help=
        'trun on the debug mode that will generate a tmp file containing both original and corrected genotypes for debug use'
    )

    # The cpus option stays registered for command-line compatibility; its
    # value was never used by this function (the old Python < 2.7 check that
    # read it was dead code and has been dropped).
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # The hidden -c/-m options take precedence over the positional names.
    configfile, mapfile = args
    inputmatrix = opts.matrixfile or mapfile
    inputconfig = opts.configfile or configfile

    if opts.opp == "'infer'":
        opf = inputmatrix.rsplit(".", 1)[0] + '.corrected.map'
    else:
        opf = '{}.map'.format(opts.opp)
    if Path(opf).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".
               format(opf))
        sys.exit(1)

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    cargs = ParseConfig(inputconfig)
    if cargs.win_size % 2 == 0:
        eprint("ERROR: The slding window value cannot be even")
        sys.exit(1)
    logging.debug("Parameters in config file: {0}".format(cargs.__dict__))

    genotype_chars = [cargs.gt_a, cargs.gt_b, cargs.gt_h, cargs.gt_miss]
    genotype_codes = [0, 2, 1, 9]

    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    map_reader = pd.read_csv(inputmatrix,
                             sep=r'\s+',
                             index_col=[0, 1],
                             iterator=True)
    tmp_chr_list = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        marker_num, sample_num = df_chr_tmp.shape
        logging.debug('{} contains {} markers and {} samples.'.format(
            chrom, marker_num, sample_num))
        tmp_sm_list = []
        for sm in df_chr_tmp:
            logging.debug('Start correcting {}...'.format(sm))
            orig_seq = df_chr_tmp[sm]
            orig_idx = orig_seq.index
            seq_no_idx = orig_seq.reset_index(drop=True)
            seq_no_idx_num = seq_no_idx.replace(genotype_chars,
                                                genotype_codes)
            if seq_no_idx_num.shape[0] <= cargs.win_size:
                logging.debug(
                    'number of markers smaller than the window size, omit...')
                final_seq_no_idx = seq_no_idx
            else:
                logging.debug('correction round 1...')
                latest = CorrectOO(cargs, seq_no_idx_num)
                corrected_n = get_corrected_num(seq_no_idx_num,
                                                latest.corrected)
                # Each further round starts from the previous round's result.
                # The original always restarted from the round-1 result and
                # read an unassigned variable when --itertimes was below 2.
                for round_n in range(2, opts.itertimes + 1):
                    logging.debug('correction round %s...' % round_n)
                    latest = CorrectOO(cargs, latest.corrected)
                    corrected_n_new = get_corrected_num(
                        seq_no_idx_num, latest.corrected)
                    # Stop once a round adds at most 1% more corrections;
                    # +0.01 avoids dividing by zero.
                    growth = (corrected_n_new - corrected_n) / float(
                        corrected_n + 0.01)
                    if growth <= 0.01:
                        break
                    corrected_n = corrected_n_new
                final_seq_no_idx = latest.corrected.replace(
                    genotype_codes, genotype_chars)
            # Restore the (chromosome, position) index dropped above.
            final_seq_no_idx.index = orig_idx
            tmp_sm_list.append(final_seq_no_idx)
        df_sm_tmp = pd.concat(tmp_sm_list, axis=1)
        tmp_chr_list.append(df_sm_tmp)
    df_corrected = pd.concat(tmp_chr_list)

    df_corrected.to_csv(opf, sep='\t', index=True)

    if opts.debug:
        logging.debug('generating the tmp file for debug use...')
        df_uncorrected = pd.read_csv(inputmatrix,
                                     sep=r'\s+',
                                     index_col=[0, 1])
        # Unchanged cells stay as they are; changed cells become
        # `corrected(original)`.
        df_debug = df_corrected.where(df_corrected == df_uncorrected,
                                      other=df_corrected + '(' +
                                      df_uncorrected + ')')
        df_debug.to_csv(opf + '.debug', sep='\t', index=True)
    print('Done!')
Пример #10
0
def vcf2map(args):
    """
    %prog vcf2map input.vcf output.matrix

    convert vcf format to genotype matrix format
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # Workflow: count the leading `##` lines of the VCF, read the table
    # chromosome by chromosome without the annotation columns, reduce every
    # sample cell to its text before the first ':', translate it to the
    # configured genotype characters, drop rows containing an unrecognised
    # genotype, and write the result tab-separated.
    parser = OptionParser(vcf2map.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("-o", "--output", help=SUPPRESS_HELP)
    parser.add_option('--homo1', default="A",
                      help='character for homozygous genotype')
    parser.add_option("--homo2", default='B',
                      help="character for alternative homozygous genotype")
    parser.add_option('--hete', default='X',
                      help='character for heterozygous genotype')
    parser.add_option('--missing', default='-',
                      help='character for missing value')
    parser.add_option("--logfile", default='GC.vcf2map.info',
                      help="specify the log file")
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    # The hidden -i/-o options take precedence over the positional names.
    positional_vcf, positional_out = args
    inputvcf = opts.input or positional_vcf
    outputmatrix = opts.output or positional_out

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".
               format(outputmatrix))
        sys.exit(1)

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    # VCF genotype string -> matrix character.
    right_gt = {}
    for gt in ('0|0', '0/0'):
        right_gt[gt] = opts.homo1
    for gt in ('0|1', '1|0', '0/1', '1/0'):
        right_gt[gt] = opts.hete
    for gt in ('1|1', '1/1'):
        right_gt[gt] = opts.homo2
    for gt in ('.|.', './.', '.'):
        right_gt[gt] = opts.missing

    dropped_cols = ['ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    index_cols = ['#CHROM', 'POS']

    # Number of leading `##` lines; the column header line follows them.
    meta_lines = 0
    with open(inputvcf) as handle:
        for line in handle:
            if not line.startswith('##'):
                break
            meta_lines += 1

    chr_order, chr_nums = getChunk(inputvcf, ignore=meta_lines + 1)
    vcf_reader = pd.read_csv(inputvcf,
                             header=meta_lines,
                             delim_whitespace=True,
                             usecols=lambda name: name not in dropped_cols,
                             iterator=True)

    def to_matrix_char(cell):
        # Only the text before the first ':' is looked up; anything not in
        # the table becomes NaN so the row is dropped below.
        return right_gt.get(cell.split(':')[0], np.nan)

    converted = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        chrom_df = vcf_reader.get_chunk(chr_nums[chrom])
        chrom_df = chrom_df.set_index(index_cols)
        chrom_df = chrom_df.applymap(to_matrix_char)
        chrom_df.dropna(inplace=True)
        converted.append(chrom_df)

    pd.concat(converted).to_csv(outputmatrix, sep='\t', index=True)
    print('Done!')