Пример #1
0
def qc_sd(args):
    """
    %prog sdtest input.matrix output.matrix

    run quality control on segregation distortions in each SNP.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (input matrix, output matrix); the hidden -i/-o options take
    #       precedence over them when given.
    # Side effects: prints progress and summary lines, writes the retained
    #       markers to the output matrix as a tab-separated file.
    # Exits through sys.exit when the positional count is wrong or when
    #       --sig_cutoff is outside the open interval (0, 1).
    p = OptionParser(qc_sd.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--population', default='RIL', choices=('RIL', 'F2', 'BCFn'),
                help = "population type")
    p.add_option('--sig_cutoff', default = 1e-2, type='float',
                help = "set the chi square test cutoff. 0(less strigent) to 1(more strigent)")
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1', default="A",
                help='character for homozygous genotype')
    q.add_option("--homo2", default='B',
                help="character for alternative homozygous genotype")
    q.add_option('--hete', default='X',
                help='character for heterozygous genotype')
    q.add_option('--missing', default='-',
                help='character for missing value')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    if opts.sig_cutoff >= 1 or opts.sig_cutoff <= 0:
        eprint('the cutoff chi square test should be smaller than 1 and larger than 0')
        sys.exit(1)

    # Loop invariants: genotype characters and their numeric codes, plus the
    # expected share of each homozygous class among the homozygous calls.
    genotype_chars = [opts.homo1, opts.homo2, opts.hete, opts.missing]
    genotype_codes = [0, 2, 1, 9]
    if opts.population == 'BCFn':
        exp0_ratio, exp2_ratio = 0.75, 0.25
    else:
        exp0_ratio, exp2_ratio = 0.5, 0.5

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome in file order -- confirm against its definition).
    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    map_reader = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1], iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        df_chr_tmp_num = df_chr_tmp.replace(genotype_chars, genotype_codes)
        # Per-marker counts of the two homozygous classes; heterozygous and
        # missing calls do not enter the test.
        ob0 = (df_chr_tmp_num == 0).sum(axis=1)
        ob2 = (df_chr_tmp_num == 2).sum(axis=1)
        obsum = ob0 + ob2
        exp0, exp2 = obsum * exp0_ratio, obsum * exp2_ratio
        # A marker is kept only when both classes have more than 5 calls and
        # the chi-square p-value reaches the cutoff.
        min_cond = ((ob0 > 5) & (ob2 > 5)).values
        pval_cond = chisquare([ob0, ob2], [exp0, exp2]).pvalue >= opts.sig_cutoff
        Good_SNPs.append(df_chr_tmp.loc[(min_cond & pval_cond), :])
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num/float(before_snp_num)*100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
Пример #2
0
def bin(args):
    """
    %prog bin corrected.matrix output.matrix

    compress markers by merging consecutive markers with same genotypes
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (input matrix, output matrix); the hidden -i/-o options take
    #       precedence over them when given.
    # Side effects: prints progress and summary lines, writes the compressed
    #       matrix to the output file and, when any bins were formed, a
    #       tab-separated binning table to --logfile.
    # Exits through sys.exit when the positional count is wrong or when the
    #       output file already exists.
    p = OptionParser(bin.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation embedded the next line's indentation in the help text.
    p.add_option('--diff_num', default=0, type='int',
        help='number of different genotypes between two consecutive markers '
             'less than or equal to this value will be merged. '
             'missing values will not be counted.')
    p.add_option('--missing', default='-',
        help='character for missing value in genotype matrix file')
    p.add_option("--logfile", default='GC.bin.log',
        help="specify the file saving running info")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outputmatrix))
        sys.exit(1)

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome in file order -- confirm against its definition).
    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    map_reader = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1], iterator=True)
    Good_SNPs = []
    binning_info = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        if df_chr_tmp.shape[0] == 1:
            # A single marker has nothing to merge with; keep it as is.
            Good_SNPs.append(df_chr_tmp)
            continue
        # bin_markers is defined elsewhere; from its use here, `results`
        # selects the marker positions to keep, while represent_idx/block_idx
        # describe the bins that were formed (TODO confirm exact shapes).
        represent_idx, block_idx, results = bin_markers(
            df_chr_tmp.loc[chrom], diff=opts.diff_num, missing_value=opts.missing)
        Good_SNPs.append(df_chr_tmp.loc[(chrom, results), :])
        if represent_idx:
            binning_info.append(pd.DataFrame({
                'chr': chrom,
                'representative_marker': represent_idx,
                'markers': block_idx,
            }))
    df1 = pd.concat(Good_SNPs)
    df1.to_csv(outputmatrix, sep='\t', index=True)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num/float(before_snp_num)*100
    print('{} SNP markers before compression.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after compression.'.format(after_snp_num, pct))

    if binning_info:
        df2 = pd.concat(binning_info)
        df2.to_csv(opts.logfile, sep='\t', index=False)
        print('Check {} for binning details.'.format(opts.logfile))
Пример #3
0
def qc_hetero(args):
    """
    %prog qc_hetero input.matrix output.matrix

    run quality control on the continuous same homozygous in heterozygous region.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (input matrix, output matrix); the hidden -i/-o options take
    #       precedence over them when given.
    # Side effects: configures root logging to --logfile, prints progress and
    #       summary lines, writes the filtered matrix as a tab-separated file.
    # Exits through sys.exit when the positional count is wrong.
    p = OptionParser(qc_hetero.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option("--read_len",
                 default=150,
                 type='int',
                 help="read length for SNP calling")
    p.add_option("--logfile",
                 default='GC.qc_hetero.info',
                 help="specify the file saving binning info")
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', help='character for missing value')
    r = OptionGroup(p, 'advanced options')
    p.add_option_group(r)
    r.add_option(
        '--nonhetero_lens',
        default=8,
        type='int',
        help=
        'number of non heterozygous between two heterozygous in a heterozygous region'
    )
    r.add_option(
        '--min_homo',
        default=2,
        type='int',
        help=
        'number of continuous homozygous within the read length in the heterozygous region'
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    genotype_chars = [opts.homo1, opts.homo2, opts.hete, opts.missing]
    genotype_codes = [0, 2, 1, 9]

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome in file order -- confirm against its definition).
    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    map_reader = pd.read_csv(inputmatrix,
                             sep=r'\s+',
                             index_col=[0, 1],
                             iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        logging.debug(chrom)
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        # Numeric view of this chromosome, indexed by the second index level
        # only (the chromosome level is dropped by .loc[chrom]).
        df_chr_tmp_num = df_chr_tmp.replace(genotype_chars,
                                            genotype_codes).loc[chrom]

        chr_bin_ids = []
        for sm in df_chr_tmp_num:
            sample_calls = df_chr_tmp_num[sm]
            # Run-length encode the sample: a new run starts wherever the
            # genotype code differs from the previous marker.
            geno_grouper = (sample_calls.diff(1) != 0).astype('int').cumsum()
            idx, geno, lens = [], [], []
            for _, grp_geno in sample_calls.groupby(geno_grouper):
                idx.append(grp_geno.index)
                geno.append(grp_geno.unique()[0])
                lens.append(grp_geno.shape[0])
            df_grp_geno = pd.DataFrame({'idx': idx, 'geno': geno, 'lens': lens})
            # 1: hetero genotype 0: others(homo1, homo2, missing)
            df_grp_geno['type'] = df_grp_geno['geno'].apply(
                lambda x: 1 if x == 1 else 0)
            # Merge consecutive runs of the same type into stretches.
            type_grouper = (df_grp_geno['type'].diff(1) !=
                            0).astype('int').cumsum()
            for _, grp_type in df_grp_geno.groupby(type_grouper):
                if grp_type['type'].unique()[0] != 0:
                    continue
                # Only short non-heterozygous stretches are inspected.
                if grp_type['lens'].sum() > opts.nonhetero_lens:
                    continue
                for _, row in grp_type.iterrows():
                    if row.geno == 0 or row.geno == 2:
                        # get_blocks is defined elsewhere; it is given the
                        # run's index values with the read length and the
                        # minimum homozygous run size.
                        bin_ids = get_blocks(row['idx'].values,
                                             dist=opts.read_len,
                                             block_size=opts.min_homo)
                        if bin_ids:
                            for bin_index in bin_ids:
                                if bin_index not in chr_bin_ids:
                                    chr_bin_ids.append(bin_index)
        if chr_bin_ids:
            dropping_ids = []
            merged_bin_ids = sort_merge_sort(chr_bin_ids)
            for idx_block in merged_bin_ids:
                logging.debug('positions: {}'.format(idx_block))
                genos_block = df_chr_tmp_num.loc[idx_block, :]
                # Rank the block's markers by missing count, then by
                # heterozygous count; keep the first, drop the rest.
                missings = (genos_block == 9).sum(axis=1)
                heteros = (genos_block == 1).sum(axis=1)
                ranked = pd.concat([missings, heteros],
                                   axis=1).sort_values([0, 1])
                dropping_ids.extend(list(ranked.index[1:]))
            df_chr_tmp = df_chr_tmp.drop(dropping_ids, level=1)
        Good_SNPs.append(df_chr_tmp)
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(
        after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
    print('Done! Check {} for running details.'.format(opts.logfile))
Пример #4
0
def qc_missing(args):
    """
    %prog filtermissing input.matrix output.matrix

    run quality control of the missing genotypes in the input.matrix before starting the correction.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (input matrix, output matrix); the hidden -i/-o options take
    #       precedence over them when given.
    # Side effects: prints progress and summary lines, writes the filtered
    #       matrix to the output file as a tab-separated file.
    # Exits through sys.exit when the positional count is wrong or when
    #       --rm_bad_samples is given without --cutoff_sample.
    p = OptionParser(qc_missing.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option(
        '--cutoff_snp',
        default=0.5,
        type='float',
        help="SNP with missing rate higher than this value will be removed")
    p.add_option(
        '--rm_bad_samples',
        default=False,
        action="store_true",
        help=
        'remove bad samples after controlling the SNPs with high missing rate')
    p.add_option(
        '--cutoff_sample',
        type='float',
        help=
        "sample missing rate higher than this value will be removed after controlling the SNP missing rate"
    )
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', help='character for missing value')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Compare against None rather than truthiness so that an explicit
    # `--cutoff_sample 0` is accepted as a real threshold.
    if opts.rm_bad_samples and opts.cutoff_sample is None:
        eprint(
            'missing value cutoff for --cutoff_sample option must be specified when --rm_bad_samples added.'
        )
        sys.exit(1)

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    genotype_chars = [opts.homo1, opts.homo2, opts.hete, opts.missing]
    genotype_codes = [0, 2, 1, 9]

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome in file order -- confirm against its definition).
    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    map_reader = pd.read_csv(inputmatrix,
                             sep=r'\s+',
                             index_col=[0, 1],
                             iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        df_chr_tmp_num = df_chr_tmp.replace(genotype_chars, genotype_codes)
        sample_num = df_chr_tmp_num.shape[1]
        # Fraction of samples with a missing call, per marker.
        snp_missing_rate = (df_chr_tmp_num == 9).sum(axis=1) / float(sample_num)
        good_rates = snp_missing_rate <= opts.cutoff_snp
        Good_SNPs.append(df_chr_tmp.loc[good_rates, :])
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num, before_sm_num = df1.shape
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(
        after_snp_num, pct))

    if not opts.rm_bad_samples:
        df1.to_csv(outputmatrix, sep='\t', index=True)
        return

    print('start quality control on samples')
    # Fraction of the retained markers that are missing, per sample; this
    # works on the raw characters, hence the comparison with opts.missing.
    sample_missing_rate = (df1 == opts.missing).sum(axis=0) / float(after_snp_num)
    good_samples = sample_missing_rate <= opts.cutoff_sample
    df2 = df1.loc[:, good_samples]
    after_sm_num = df2.shape[1]
    pct_sm = after_sm_num / float(before_sm_num) * 100
    print('{} samples before quality control.'.format(before_sm_num))
    # This line reports samples, not markers.
    print('{}({:.1f}%) samples left after the quality control.'.format(
        after_sm_num, pct_sm))
    df2.to_csv(outputmatrix, sep='\t', index=True)
Пример #5
0
def correct(args):
    """
    %prog correct config.txt input.matrix

    Correct wrong genotype calls and impute missing values in biparental populations
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (config file, input matrix); the hidden -c/-m options take
    #       precedence over them when given.
    # Side effects: configures root logging to --logfile, prints progress,
    #       writes the corrected matrix (and a ".debug" companion file when
    #       --debug is set) as tab-separated files.
    # Exits through sys.exit when the positional count is wrong, when the
    #       output file already exists, or when the configured window size
    #       is even.
    p = OptionParser(correct.__doc__)
    p.add_option("-c", "--configfile", help=SUPPRESS_HELP)
    p.add_option("-m", "--matrixfile", help=SUPPRESS_HELP)
    p.add_option('--itertimes',
                 default=7,
                 type='int',
                 help='maximum correction times to reach the stablized status')
    q = OptionGroup(p, "output options")
    p.add_option_group(q)
    q.add_option('--opp',
                 default="'infer'",
                 help='specify the prefix of the output file names')
    q.add_option("--logfile",
                 default='GC.correct.log',
                 help="specify the file saving running info")
    q.add_option(
        '--debug',
        default=False,
        action="store_true",
        help=
        'trun on the debug mode that will generate a tmp file containing both original and corrected genotypes for debug use'
    )

    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    configfile, mapfile = args
    inputmatrix = opts.matrixfile or mapfile
    inputconfig = opts.configfile or configfile

    # Output file: derived from the input name unless a prefix was given.
    if opts.opp == "'infer'":
        opf = inputmatrix.rsplit(".", 1)[0] + '.corrected.map'
    else:
        opf = '{}.map'.format(opts.opp)
    if Path(opf).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".
               format(opf))
        sys.exit(1)

    # Configure logging before the first logging call: an earlier
    # logging.debug() would install a default handler and turn this
    # basicConfig() into a no-op, so nothing would reach the log file.
    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    # NOTE(review): `cpus` is computed here but not used further in this
    # function.
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                    format(sys.version.splitlines()[0].strip()))
        cpus = 1

    cargs = ParseConfig(inputconfig)
    if cargs.win_size % 2 == 0:
        eprint("ERROR: The slding window value cannot be even")
        sys.exit(1)
    logging.debug("Parameters in config file: {0}".format(cargs.__dict__))

    genotype_chars = [cargs.gt_a, cargs.gt_b, cargs.gt_h, cargs.gt_miss]
    genotype_codes = [0, 2, 1, 9]

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome in file order -- confirm against its definition).
    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    map_reader = pd.read_csv(inputmatrix,
                             sep=r'\s+',
                             index_col=[0, 1],
                             iterator=True)
    tmp_chr_list = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        marker_num, sample_num = df_chr_tmp.shape
        logging.debug('{} contains {} markers and {} samples.'.format(
            chrom, marker_num, sample_num))
        tmp_sm_list = []
        for sm in df_chr_tmp:
            logging.debug('Start correcting {}...'.format(sm))
            orig_seq = df_chr_tmp[sm]
            orig_idx = orig_seq.index
            seq_no_idx = orig_seq.reset_index(drop=True)
            seq_no_idx_num = seq_no_idx.replace(genotype_chars, genotype_codes)
            if seq_no_idx_num.shape[0] <= cargs.win_size:
                logging.debug(
                    'number of markers smaller than the window size, omit...')
                final_seq_no_idx = seq_no_idx
            else:
                logging.debug('correction round 1...')
                # `latest` always holds the most recent corrected sequence.
                # Every round starts from it, and it is what gets written
                # out, so --itertimes below 2 yields the round-1 result
                # instead of an unbound or stale variable.
                latest = CorrectOO(cargs, seq_no_idx_num).corrected
                corrected_n = get_corrected_num(seq_no_idx_num, latest)
                round_n = 2
                while round_n <= opts.itertimes:
                    logging.debug('correction round %s...' % round_n)
                    latest = CorrectOO(cargs, latest).corrected
                    corrected_n_new = get_corrected_num(seq_no_idx_num, latest)
                    round_n += 1
                    # Stop once the corrected count grows by 1% or less
                    # (0.01 guards against a zero denominator).
                    if (corrected_n_new - corrected_n) / float(corrected_n +
                                                               0.01) <= 0.01:
                        break
                    corrected_n = corrected_n_new
                final_seq_no_idx = latest.replace(genotype_codes,
                                                  genotype_chars)
            final_seq_no_idx.index = orig_idx
            tmp_sm_list.append(final_seq_no_idx)
        tmp_chr_list.append(pd.concat(tmp_sm_list, axis=1))
    df_corrected = pd.concat(tmp_chr_list)

    df_corrected.to_csv(opf, sep='\t', index=True)

    if opts.debug:
        logging.debug('generating the tmp file for debug use...')
        df_uncorrected = pd.read_csv(inputmatrix,
                                     sep=r'\s+',
                                     index_col=[0, 1])
        # Cells that changed are rendered as "corrected(original)".
        df_debug = df_corrected.where(df_corrected == df_uncorrected,
                                      other=df_corrected + '(' +
                                      df_uncorrected + ')')
        df_debug.to_csv(opf + '.debug', sep='\t', index=True)
    print('Done!')
Пример #6
0
def vcf2map(args):
    """
    %prog vcf2map input.vcf output.matrix

    convert vcf format to genotype matrix format
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form; implementation notes live in comments.
    #
    # args: list of command-line tokens. Exactly two positionals are required
    #       (input VCF, output matrix); the hidden -i/-o options take
    #       precedence over them when given.
    # Side effects: configures root logging to --logfile, prints progress,
    #       writes the genotype matrix as a tab-separated file.
    # Exits through sys.exit when the positional count is wrong or when the
    #       output file already exists.
    p = OptionParser(vcf2map.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    p.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    p.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    p.add_option('--missing', default='-', help='character for missing value')
    p.add_option("--logfile",
                 default='GC.vcf2map.info',
                 help="specify the log file")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    invcf, outmap = args
    inputvcf = opts.input or invcf
    outputmatrix = opts.output or outmap

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".
               format(outputmatrix))
        sys.exit(1)

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    # VCF genotype field -> matrix character. Anything not listed here maps
    # to NaN below, and rows containing NaN are dropped.
    right_gt = {
        '0|0': opts.homo1,
        '0/0': opts.homo1,
        '0|1': opts.hete,
        '1|0': opts.hete,
        '0/1': opts.hete,
        '1/0': opts.hete,
        '1|1': opts.homo2,
        '1/1': opts.homo2,
        '.|.': opts.missing,
        './.': opts.missing,
        '.': opts.missing
    }
    useless_cols = ['ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    index_cols = ['#CHROM', 'POS']

    # Count the leading '##' meta lines so the '#CHROM' line can be used as
    # the header row; the context manager closes the handle exactly once.
    n = 0
    with open(inputvcf) as vcffile:
        for line in vcffile:
            if not line.startswith('##'):
                break
            n += 1

    def to_matrix_char(cell):
        # Keep only the first ':'-separated sub-field, then translate it.
        return right_gt.get(cell.split(':')[0], np.nan)

    # chr_nums[chrom] is used below as the number of rows to pull from the
    # reader for that chromosome (presumably getChunk counts rows per
    # chromosome after skipping `ignore` lines -- confirm against its
    # definition).
    chr_order, chr_nums = getChunk(inputvcf, ignore=n + 1)
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    vcf_reader = pd.read_csv(inputvcf,
                             header=n,
                             sep=r'\s+',
                             usecols=lambda x: x not in useless_cols,
                             iterator=True)
    tmp_chr_list = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        df_chr_tmp = vcf_reader.get_chunk(chr_nums[chrom])
        df_chr_tmp = df_chr_tmp.set_index(index_cols)
        # DataFrame.map is the newer name of DataFrame.applymap; prefer it
        # when available and fall back on older pandas.
        elementwise = getattr(df_chr_tmp, 'map', None)
        if elementwise is None:
            elementwise = df_chr_tmp.applymap
        df_chr_tmp = elementwise(to_matrix_char)
        df_chr_tmp.dropna(inplace=True)
        tmp_chr_list.append(df_chr_tmp)
    df1 = pd.concat(tmp_chr_list)
    df1.to_csv(outputmatrix, sep='\t', index=True)

    print('Done!')