Example #1
def qc_sd(args):
    """
    %prog qc_sd input.matrix output.matrix

    run quality control on segregation distortion for each SNP.
    """
    p = OptionParser(qc_sd.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--population', default='RIL', choices=('RIL', 'F2', 'BCFn'),
                 help="population type")
    p.add_option('--sig_cutoff', default=1e-2, type='float',
                 help="chi-square test cutoff, from 0 (less stringent) to 1 (more stringent)")
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1', default="A",
                help='character for homozygous genotype')
    q.add_option("--homo2", default='B',
                help="character for alternative homozygous genotype")
    q.add_option('--hete', default='X',
                help='character for heterozygous genotype')
    q.add_option('--missing', default='-',
                help='character for missing value')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    if opts.sig_cutoff >= 1 or opts.sig_cutoff <= 0:
        eprint('the chi-square test cutoff must be larger than 0 and smaller than 1')
        sys.exit(1)

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix, delim_whitespace=True, index_col=[0, 1], iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        df_chr_tmp_num = df_chr_tmp.replace([opts.homo1, opts.homo2, opts.hete, opts.missing], [0, 2, 1, 9])
        ob0, ob2 = (df_chr_tmp_num==0).sum(axis=1), (df_chr_tmp_num==2).sum(axis=1)
        obsum = ob0 + ob2
        exp0, exp2 = (obsum*0.75, obsum*0.25) if opts.population == 'BCFn' else (obsum*0.5, obsum*0.5)
        df_chi = pd.DataFrame(dict(zip(['ob0', 'ob2', 'exp0', 'exp2'], [ob0, ob2, exp0, exp2])))
        # require enough observed homozygous calls for a valid chi-square test
        min_cond = ((df_chi['ob0'] > 5) & (df_chi['ob2'] > 5)).values
        # column-wise chi-square gives one p-value per SNP; keep the non-distorted ones
        pval_cond = chisquare(
            [df_chi['ob0'], df_chi['ob2']],
            [df_chi['exp0'], df_chi['exp2']]).pvalue >= opts.sig_cutoff
        good_snp = df_chr_tmp.loc[(min_cond & pval_cond), :]
        Good_SNPs.append(good_snp)
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num/float(before_snp_num)*100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
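
A note on the test itself: the filter above is a per-SNP chi-square goodness-of-fit test against the expected segregation ratio. A minimal standalone sketch with toy counts (assuming scipy is available; RIL/F2 expect a 1:1 ratio of the two homozygous genotypes, BCFn a 3:1 ratio):

from scipy.stats import chisquare

# toy observed counts of the two homozygous genotypes at one SNP
ob0, ob2 = 40, 60
obsum = ob0 + ob2
exp0, exp2 = obsum * 0.5, obsum * 0.5  # RIL/F2 expectation; BCFn would use 0.75/0.25
pval = chisquare([ob0, ob2], [exp0, exp2]).pvalue
print(pval)  # ~0.0455; the SNP is kept since 0.0455 >= the default sig_cutoff of 1e-2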
Example #2
def format(args):
    """
    %prog format corrected.matrix

    convert the corrected genotype matrix file to other formats (MSTmap, JoinMap, R/qtl) used by genetic mapping software.
    Example:
    `python -m schnablelab.imputation.GC format test.map --mstmap --mstmap_pop_type RIL2`
    will generate `test.mstmap` for MSTmap use.
    """
    p = OptionParser(format.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("--mstmap",
                 default=False,
                 action="store_true",
                 help='convert to MSTmap format')
    p.add_option("--rqtl",
                 default=False,
                 action="store_true",
                 help='convert to R/qtl format')
    p.add_option("--joinmap",
                 default=False,
                 action="store_true",
                 help='convert to JoinMap format')

    q = OptionGroup(p, "format options for input matrix file")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 choices=('a', 'A'),
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 choices=('b', 'B'),
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 choices=('h', 'H', 'X'),
                 help='character for heterozygous genotype')
    q.add_option('--missing',
                 default='-',
                 choices=('-', 'U'),
                 help='character for missing value')

    r = OptionGroup(p, "parameters for MSTmap")
    p.add_option_group(r)
    r.add_option(
        '--mstmap_pop_type',
        help='Possible values are DH and RILd, where d is any natural number. '
             'For example, RIL6 means a RIL population at generation 6. '
             'You should use RIL2 for F2. Use DH for BC1, DH, and Hap.')
    r.add_option(
        "--population_name",
        default='LinkageGroup',
        help=
        "gives a name to the mapping population. It can be any string of letters (a-z, A-Z) or digits (0-9)"
    )
    r.add_option('--distance_function',
                 default='kosambi',
                 choices=('kosambi', 'haldane'),
                 help="choose between Kosambi's and Haldane's distance functions")
    r.add_option(
        '--cut_off_p_value',
        default=0.000001,
        type='float',
        help=
        'specifies the threshold to be used for clustering the markers into LGs'
    )
    r.add_option('--no_map_dist',
                 default=15,
                 help='check mstmap manual for details')
    r.add_option('--no_map_size',
                 default=5,
                 help='check mstmap manual for details')
    r.add_option(
        '--missing_threshold',
        default=0.4,
        type='float',
        help=
        'any marker with a missing rate higher than this value will be removed completely without being mapped'
    )
    r.add_option(
        '--estimation_before_clustering',
        default='no',
        choices=('yes', 'no'),
        help=
        'if yes, MSTmap will try to estimate missing data before clustering the markers into linkage groups'
    )
    r.add_option('--detect_bad_data',
                 default='yes',
                 choices=('yes', 'no'),
                 help='if yes, turn on the error detection feature in MSTmap')
    r.add_option('--objective_function',
                 default='COUNT',
                 choices=('COUNT', 'ML'),
                 help='specifies the objective function')

    s = OptionGroup(p, "parameters for JoinMap and R/qtl")
    p.add_option_group(s)
    s.add_option(
        '--pop_type',
        default='RIL',
        choices=('RIL', 'F2'),
        help=
        'specify the mapping population type. Contact me if you need support for other population types'
    )

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    inmap, = args
    inputmatrix = opts.input or inmap

    if (not opts.rqtl) and (not opts.joinmap) and (not opts.mstmap):
        eprint("ERROR: add at least one output format option.")
        sys.exit(1)

    if opts.mstmap:
        if not opts.mstmap_pop_type:
            eprint("ERROR: please choose population type for mstmap format.")
            sys.exit(1)
        if not (opts.mstmap_pop_type.startswith('RIL')
                or opts.mstmap_pop_type == 'DH'):
            eprint('ERROR: only RILd and DH supported in MSTmap')
            sys.exit(1)

        opf = inputmatrix.rsplit(".", 1)[0] + '.mstmap'  # output file
        if Path(opf).exists():
            eprint(
                "ERROR: Filename collision. The future output file `{}` exists"
                .format(opf))
            sys.exit(1)

        df = pd.read_csv(inputmatrix, delim_whitespace=True)
        cols = list(df.columns[2:])
        cols.insert(0, 'locus_name')
        df['locus_name'] = df.iloc[:, 0].astype(
            'str') + '_' + df.iloc[:, 1].astype('str')
        df = df[cols]
        print(df.head())
        snp_num, sm_num = df.shape[0], df.shape[1] - 1
        f1 = open(opf, 'w')
        f1.write(mst_header.format(opts.mstmap_pop_type, opts.population_name, opts.distance_function, opts.cut_off_p_value, \
            opts.no_map_dist, opts.no_map_size, opts.missing_threshold, opts.estimation_before_clustering, opts.detect_bad_data, \
            opts.objective_function, snp_num, sm_num))
        f1.close()

        df.to_csv(opf, sep='\t', index=False, mode='a')
        print('Done, check file {}!'.format(opf))

    if opts.joinmap:
        opf = inputmatrix.rsplit(".", 1)[0] + '.joinmap.xlsx'  # output file
        if Path(opf).exists():
            eprint(
                "ERROR: Filename collision. The future output file `{}` exists"
                .format(opf))
            sys.exit(1)

        df = pd.read_csv(inputmatrix, delim_whitespace=True)
        need_reps, reps = [], []
        if opts.homo1 != 'a':
            need_reps.append(opts.homo1)
            reps.append('a')
        if opts.homo2 != 'b':
            need_reps.append(opts.homo2)
            reps.append('b')
        if opts.hete != 'h':
            need_reps.append(opts.hete)
            reps.append('h')
        if opts.missing != '-':
            need_reps.append(opts.missing)
            reps.append('-')
        if need_reps:
            df = df.replace(need_reps, reps)

        cols = list(df.columns[2:])
        cols.insert(0, 'Classification')
        cols.insert(0, 'locus_name')
        df['locus_name'] = df.iloc[:, 0].astype(
            'str') + '_' + df.iloc[:, 1].astype('str')
        df['Classification'] = '(a,h,b)'
        df = df[cols]
        df.to_excel(opf)
        print(
            'Done! Now you can load the genotype data into the JoinMap project from the MS-Excel spreadsheet {} to a dataset node.'
            .format(opf))

    if opts.rqtl:
        opf = inputmatrix.rsplit(".", 1)[0] + '.rqtl.csv'  # output file
        if Path(opf).exists():
            eprint(
                "ERROR: Filename collision. The future output file `{}` exists"
                .format(opf))
            sys.exit(1)

        df = pd.read_csv(inputmatrix, delim_whitespace=True)
        need_reps, reps = [], []
        if opts.homo1 != 'A':
            need_reps.append(opts.homo1)
            reps.append('A')
        if opts.homo2 != 'B':
            need_reps.append(opts.homo2)
            reps.append('B')
        if opts.hete != 'H':
            need_reps.append(opts.hete)
            reps.append('H')
        if opts.missing != '-':
            need_reps.append(opts.missing)
            reps.append('-')
        if need_reps:
            df = df.replace(need_reps, reps)

        cols = list(df.columns[2:])
        cols.insert(0, 'id')
        df['id'] = df.iloc[:, 0].astype('str') + '_' + df.iloc[:, 1].astype('str')
        df = df[cols]

        # prepend a row of 1s and blank its id cell (first row of the R/qtl csv)
        df.loc[-1] = 1
        df.index = df.index + 1
        df = df.sort_index()
        df.iloc[0, 0] = np.nan
        df.to_csv(opf, index=False, na_rep='')
        print('Done, check file {}!'.format(opf))
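
All three conversions above share one step: building a locus id from the first two columns (chromosome and position). A minimal sketch of that step on a made-up two-marker matrix (the column names here are hypothetical):

import pandas as pd

df = pd.DataFrame({'chr': ['1', '1'], 'pos': [100, 250],
                   'sm1': ['A', 'B'], 'sm2': ['X', '-']})
cols = list(df.columns[2:])
cols.insert(0, 'locus_name')
df['locus_name'] = df.iloc[:, 0].astype('str') + '_' + df.iloc[:, 1].astype('str')
print(df[cols])  # locus ids 1_100 and 1_250, followed by the sample columns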
Example #3
def qc_hetero(args):
    """
    %prog qc_hetero input.matrix output.matrix

    run quality control on consecutive identical homozygous calls within heterozygous regions.
    """
    p = OptionParser(qc_hetero.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option("--read_len",
                 default=150,
                 type='int',
                 help="read length for SNP calling")
    p.add_option("--logfile",
                 default='GC.qc_hetero.info',
                 help="specify the file saving binning info")
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', help='character for missing value')
    r = OptionGroup(p, 'advanced options')
    p.add_option_group(r)
    r.add_option(
        '--nonhetero_lens',
        default=8,
        type='int',
        help=
        'maximum number of non-heterozygous calls between two heterozygous calls in a heterozygous region'
    )
    r.add_option(
        '--min_homo',
        default=2,
        type='int',
        help=
        'minimum number of consecutive homozygous calls within the read length in a heterozygous region'
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix,
                             delim_whitespace=True,
                             index_col=[0, 1],
                             iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        logging.debug(chrom)
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        chr_idx = df_chr_tmp.index
        df_chr_tmp_num = df_chr_tmp.replace(
            [opts.homo1, opts.homo2, opts.hete, opts.missing],
            [0, 2, 1, 9]).loc[chrom]

        chr_bin_ids = []
        for sm in df_chr_tmp_num:
            # assign a new group id at every genotype change to split the sequence into runs
            geno_grouper = (df_chr_tmp_num[sm].diff(1) !=
                            0).astype('int').cumsum()
            idx, geno, lens = [], [], []
            for __, grp_geno in df_chr_tmp_num[sm].groupby(geno_grouper):
                idx.append(grp_geno.index)
                geno.append(grp_geno.unique()[0])
                lens.append(grp_geno.shape[0])
            df_grp_geno = pd.DataFrame(
                dict(zip(['idx', 'geno', 'lens'], [idx, geno, lens])))
            df_grp_geno['type'] = df_grp_geno['geno'].apply(
                lambda x: 1 if x == 1 else 0
            )  # 1: hetero genotype 0: others(homo1, homo2, missing)
            type_grouper = (df_grp_geno['type'].diff(1) !=
                            0).astype('int').cumsum()
            for __, grp_type in df_grp_geno.groupby(type_grouper):
                if grp_type['type'].unique()[0] == 0:
                    nonhetero_lens = grp_type['lens'].sum()
                    if nonhetero_lens <= opts.nonhetero_lens:
                        for __, row in grp_type.iterrows():
                            if row.geno == 0 or row.geno == 2:
                                bin_ids = get_blocks(row['idx'].values,
                                                     dist=opts.read_len,
                                                     block_size=opts.min_homo)
                                if bin_ids:
                                    for bin_index in bin_ids:
                                        if bin_index not in chr_bin_ids:
                                            chr_bin_ids.append(bin_index)
        if chr_bin_ids:
            dropping_ids = []
            merged_bin_ids = sort_merge_sort(chr_bin_ids)
            for idx_block in merged_bin_ids:
                logging.debug('positions: {}'.format(idx_block))
                genos_block = df_chr_tmp_num.loc[idx_block, :]
                missings = genos_block.apply(lambda x: (x == 9).sum(), axis=1)
                heteros = genos_block.apply(lambda x: (x == 1).sum(), axis=1)
                # keep the marker with the fewest missing then fewest hetero calls; drop the rest
                dropping_index = list(
                    pd.concat([missings, heteros],
                              axis=1).sort_values([0, 1]).index[1:])
                dropping_ids.extend(dropping_index)
            df_chr_tmp = df_chr_tmp.drop(dropping_ids, level=1)
        Good_SNPs.append(df_chr_tmp)
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(
        after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
    print('Done! Check {} for running details.'.format(opts.logfile))
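
The run detection above leans twice on the diff/cumsum grouping idiom: a new group id is assigned at every value change, so groupby yields maximal runs of identical values. A minimal standalone sketch on a toy genotype sequence:

import pandas as pd

s = pd.Series([1, 1, 0, 0, 0, 1, 2, 2])  # 0/2 homozygous, 1 heterozygous
grouper = (s.diff(1) != 0).astype('int').cumsum()
for run_id, run in s.groupby(grouper):
    print(run_id, run.unique()[0], run.shape[0])  # run id, genotype, run length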
Example #4
def qc_missing(args):
    """
    %prog qc_missing input.matrix output.matrix

    run quality control on missing genotypes in input.matrix before starting the correction.
    """
    p = OptionParser(qc_missing.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option(
        '--cutoff_snp',
        default=0.5,
        type='float',
        help="SNPs with a missing rate higher than this value will be removed")
    p.add_option(
        '--rm_bad_samples',
        default=False,
        action="store_true",
        help=
        'remove bad samples after filtering out SNPs with a high missing rate')
    p.add_option(
        '--cutoff_sample',
        type='float',
        help=
        "samples with a missing rate higher than this value will be removed after the SNP filtering"
    )
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1',
                 default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2",
                 default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete',
                 default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', help='character for missing value')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    if opts.rm_bad_samples and not opts.cutoff_sample:
        eprint(
            'a missing rate cutoff (--cutoff_sample) must be specified when --rm_bad_samples is added'
        )
        sys.exit(1)

    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix,
                             delim_whitespace=True,
                             index_col=[0, 1],
                             iterator=True)
    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        df_chr_tmp_num = df_chr_tmp.replace(
            [opts.homo1, opts.homo2, opts.hete, opts.missing], [0, 2, 1, 9])
        sample_num = df_chr_tmp_num.shape[1]
        good_rates = df_chr_tmp_num.apply(lambda x:
                                          (x == 9).sum() / sample_num,
                                          axis=1) <= opts.cutoff_snp
        good_snp = df_chr_tmp.loc[good_rates, :]
        Good_SNPs.append(good_snp)
    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num, before_sm_num = df1.shape
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(
        after_snp_num, pct))

    if opts.rm_bad_samples:
        print('start quality control on samples')
        good_samples = df1.apply(lambda x:
                                 (x == opts.missing).sum() / after_snp_num,
                                 axis=0) <= opts.cutoff_sample
        df2 = df1.loc[:, good_samples]
        after_sm_num = df2.shape[1]
        pct_sm = after_sm_num / float(before_sm_num) * 100
        print('{} samples before quality control.'.format(before_sm_num))
        print('{}({:.1f}%) samples left after the quality control.'.format(
            after_sm_num, pct_sm))
        df2.to_csv(outputmatrix, sep='\t', index=True)
    else:
        df1.to_csv(outputmatrix, sep='\t', index=True)
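
The SNP filter above boils down to a row-wise missing rate. A minimal sketch on a toy numeric matrix (as in the code, 9 encodes missing; rows are SNPs, columns are samples):

import pandas as pd

df_num = pd.DataFrame([[0, 9, 9], [0, 2, 9], [2, 2, 0]])
sample_num = df_num.shape[1]
missing_rate = df_num.apply(lambda x: (x == 9).sum() / sample_num, axis=1)
print(df_num.loc[missing_rate <= 0.5])  # the first SNP (2/3 missing) is dropped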
Example #5
def correct(args):
    """
    %prog correct config.txt input.matrix 

    Correct wrong genotype calls and impute missing values in biparental populations
    """
    p = OptionParser(correct.__doc__)
    p.add_option("-c", "--configfile", help=SUPPRESS_HELP)
    p.add_option("-m", "--matrixfile", help=SUPPRESS_HELP)
    p.add_option('--itertimes',
                 default=7,
                 type='int',
                 help='maximum number of correction iterations to reach a stabilized status')
    q = OptionGroup(p, "output options")
    p.add_option_group(q)
    q.add_option('--opp',
                 default="'infer'",
                 help='specify the prefix of the output file names')
    q.add_option("--logfile",
                 default='GC.correct.log',
                 help="specify the file saving running info")
    q.add_option(
        '--debug',
        default=False,
        action="store_true",
        help=
        'turn on debug mode, which generates a tmp file containing both the original and corrected genotypes for debugging'
    )

    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    configfile, mapfile = args
    inputmatrix = opts.matrixfile or mapfile
    inputconfig = opts.configfile or configfile

    opf = inputmatrix.rsplit(".", 1)[0] + '.corrected.map' \
        if opts.opp == "'infer'" else '{}.map'.format(opts.opp)  # output file
    if Path(opf).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".
               format(opf))
        sys.exit(1)

    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                    format(sys.version.splitlines()[0].strip()))
        cpus = 1

    logging.basicConfig(filename=opts.logfile,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    cargs = ParseConfig(inputconfig)
    if cargs.win_size % 2 == 0:
        eprint("ERROR: The slding window value cannot be even")
        sys.exit(1)
    logging.debug("Parameters in config file: {0}".format(cargs.__dict__))

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix,
                             delim_whitespace=True,
                             index_col=[0, 1],
                             iterator=True)
    tmp_chr_list = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        chunk = chr_nums[chrom]
        df_chr_tmp = map_reader.get_chunk(chunk)
        marker_num, sample_num = df_chr_tmp.shape
        logging.debug('{} contains {} markers and {} samples.'.format(
            chrom, marker_num, sample_num))
        tmp_sm_list = []
        for sm in df_chr_tmp:
            logging.debug('Start correcting {}...'.format(sm))
            orig_seq = df_chr_tmp[sm]
            orig_idx = orig_seq.index
            seq_no_idx = orig_seq.reset_index(drop=True)
            seq_no_idx_num = seq_no_idx.replace(
                [cargs.gt_a, cargs.gt_b, cargs.gt_h, cargs.gt_miss],
                [0, 2, 1, 9])
            if seq_no_idx_num.shape[0] <= cargs.win_size:
                logging.debug(
                    'number of markers is smaller than the window size, skipping...')
                final_seq_no_idx = seq_no_idx
            else:
                logging.debug('correction round 1...')
                correct_obj = CorrectOO(cargs, seq_no_idx_num)
                corrected_n = get_corrected_num(seq_no_idx_num,
                                                correct_obj.corrected)
                round_n = 2
                while round_n <= opts.itertimes:
                    logging.debug('correction round %s...' % round_n)
                    corrected_obj = CorrectOO(cargs, correct_obj.corrected)
                    corrected_n_new = get_corrected_num(
                        seq_no_idx_num, corrected_obj.corrected)
                    round_n += 1
                    # stop once the per-round gain in corrected calls is <= 1%
                    if (corrected_n_new - corrected_n) / float(
                            corrected_n + 0.01) <= 0.01:
                        break
                    else:
                        corrected_n = corrected_n_new
                final_seq_no_idx = corrected_obj.corrected.replace(
                    [0, 2, 1, 9],
                    [cargs.gt_a, cargs.gt_b, cargs.gt_h, cargs.gt_miss])
            final_seq_no_idx.index = orig_idx
            final_seq = final_seq_no_idx
            tmp_sm_list.append(final_seq)
        df_sm_tmp = pd.concat(tmp_sm_list, axis=1)
        tmp_chr_list.append(df_sm_tmp)
    df_corrected = pd.concat(tmp_chr_list)

    df_corrected.to_csv(opf, sep='\t', index=True)

    if opts.debug:
        logging.debug('generating the tmp file for debug use...')
        df_uncorrected = pd.read_csv(inputmatrix,
                                     delim_whitespace=True,
                                     index_col=[0, 1])
        df_debug = df_corrected.where(df_corrected == df_uncorrected,
                                      other=df_corrected + '(' +
                                      df_uncorrected + ')')
        df_debug.to_csv(opf + '.debug', sep='\t', index=True)
    print('Done!')
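
The stopping rule inside the correction loop is a relative-gain check: iteration ends once a round adds at most about 1% more corrected calls than the running total (the +0.01 guards against division by zero). Toy numbers:

corrected_n, corrected_n_new = 120, 121
gain = (corrected_n_new - corrected_n) / float(corrected_n + 0.01)
print(gain <= 0.01)  # True -> the loop above would break at this round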
Example #6
def quick_search(args):
    '''
    %prog quick_search

    Perform a quick search and get ids for items from the search results
    '''
    p = OptionParser(quick_search.__doc__)
    p.add_option('-o',
                 '--output',
                 default="searches.csv",
                 help='specify output file')
    p.add_option('--geom',
                 default="lindsay_james.geojson",
                 help='specify the geojson file containing the geometry info')
    p.add_option('--cloud',
                 default=True,
                 action='store_false',
                 help='disable the cloud filter when --cloud is added')
    p.add_option('--coverage',
                 default=False,
                 action='store_true',
                 help='add the area coverage filter when --coverage is added')
    p.add_option('--instrument',
                 default=False,
                 action='store_true',
                 help='add the instrument filter when --instrument is added')
    p.add_option('--date_range',
                 default=True,
                 action='store_false',
                 help='disable the date filter when --date_range is added')
    p.add_option('--map_footprint',
                 default=False,
                 action='store_true',
                 help='map the footprints when --map_footprint is added')

    q = OptionGroup(p, "options for date filter")
    p.add_option_group(q)
    q.add_option('--start',
                 default="2018-01-01",
                 help='the start date. use yyyy-mm-dd format.')
    q.add_option('--end', help='the end date. use yyyy-mm-dd format')

    r = OptionGroup(p, "options for request")
    p.add_option_group(r)
    r.add_option(
        '--item_types',
        default='PSScene4Band',
        help='specify the item types; use commas to separate multiple items'
    )

    opts, args = p.parse_args(args)
    if len(args) != 0:
        sys.exit(not p.print_help())

    # all filters
    Filters, Filters_names = [], []
    geojson_fn = default_geojson if opts.geom == 'lindsay_james.geojson' else opts.geom
    with open(geojson_fn) as f:
        data = json.load(f)
    geometry = data['features'][0]['geometry']
    filter_geom = Filter().get_filter_geometry(geometry)
    Filters.append(filter_geom)
    Filters_names.append(filter_geom['field_name'])
    if opts.cloud:
        filter_cloud = Filter().get_filter_cloud()
        Filters.append(filter_cloud)
        Filters_names.append(filter_cloud['field_name'])
    if opts.coverage:
        filter_coverage = Filter().get_filter_coverage()
        Filters.append(filter_coverage)
        Filters_names.append(filter_coverage['field_name'])
    if opts.instrument:
        filter_instrument = Filter().get_filter_instrument()
        Filters.append(filter_instrument)
        Filters_names.append(filter_instrument['field_name'])
    if opts.date_range:
        st, ed = opts.start, opts.end
        filter_date = Filter().get_filter_date(st=st, ed=ed)
        Filters.append(filter_date)
        Filters_names.append('date_range')
    Final_Filters = {'type': 'AndFilter', 'config': Filters}
    print('Applied Filters: %s' % (', '.join(Filters_names)))

    # request
    items = opts.item_types.split(',')
    print('Item Types: ', items)
    Requests = Request(item_types=items)
    Final_Requests = Requests.get_request(Final_Filters)
    # print(Final_Requests)

    # post
    client = Client()
    res = client.ses.post(client.url_quick_search,
                          json=Final_Requests,
                          params={"_page_size": 10})
    # important keys: _links (current and next page links), features (item ids)
    geojson = res.json()
    link_first_page = geojson['_links']['_first']

    ids, cloud_cover, item_type, assets_url = [], [], [], []

    def parse_page(session, search_url, map_footprint):
        '''
        loop pages and extract ids from each page
        '''
        res = session.get(search_url)
        if map_footprint:
            url = geojsonio.display(res.text)
        page = res.json()
        for feature in page['features']:
            id = feature['id']
            cc = feature['properties']['cloud_cover']
            it = feature['properties']['item_type']
            au = feature['_links']['assets']
            ids.append(id)
            cloud_cover.append(cc)
            item_type.append(it)
            assets_url.append(au)
        next_url = page["_links"].get("_next")
        if next_url:
            parse_page(session, next_url, map_footprint)

    parse_page(client.ses, link_first_page, opts.map_footprint)

    df = pd.DataFrame(
        dict(
            zip(['id', 'cloud_cover', 'item_type', 'assets_url'],
                [ids, cloud_cover, item_type, assets_url])))
    df.to_csv(opts.output, index=False, sep='\t')
    print('%s items found, please check the search results in %s!' %
          (df.shape[0], opts.output))
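
parse_page above walks the paginated results recursively. The same traversal can be written iteratively, which avoids recursion-depth limits on very long result lists; a minimal sketch assuming the same session object and response layout as the snippet:

def walk_pages(session, first_url):
    '''iterative variant of parse_page; collects only the item ids'''
    ids, url = [], first_url
    while url:
        page = session.get(url).json()
        ids.extend(feature['id'] for feature in page['features'])
        url = page['_links'].get('_next')  # None once the last page is reached
    return ids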
Example #7
def stat(args):
    '''
    %prog stat

    Check available images on Planet
    '''
    p = OptionParser(stat.__doc__)
    p.add_option('-o',
                 '--output',
                 default="stats.csv",
                 help='specify output file')
    p.add_option('--geom',
                 default="lindsay_james.geojson",
                 help='specify the geojson file containing the geometry info')
    p.add_option('--cloud',
                 default=True,
                 action='store_false',
                 help='disable the cloud filter when --cloud is added')
    p.add_option('--coverage',
                 default=False,
                 action='store_true',
                 help='add the area coverage filter when --coverage is added')
    p.add_option('--instrument',
                 default=False,
                 action='store_true',
                 help='add the instrument filter when --instrument is added')
    p.add_option('--date_range',
                 default=True,
                 action='store_false',
                 help='disable the date filter when --date_range is added')

    q = OptionGroup(p, "options for date filter")
    p.add_option_group(q)
    q.add_option('--start',
                 default="2018-01-01",
                 help='the start date. use yyyy-mm-dd format.')
    q.add_option('--end', help='the end date. use yyyy-mm-dd format')

    r = OptionGroup(p, "options for request")
    p.add_option_group(r)
    r.add_option('--interval',
                 default="year",
                 choices=('year', 'month', 'week', 'day', 'hour'),
                 help='specify the interval in the request')
    r.add_option(
        '--item_types',
        default='PSScene4Band,PSOrthoTile',
        help='specify the item types; use commas to separate multiple items'
    )

    opts, args = p.parse_args(args)
    if len(args) != 0:
        sys.exit(not p.print_help())

    # all filters
    Filters, Filters_names = [], []
    geojson_fn = default_geojson if opts.geom == 'lindsay_james.geojson' else opts.geom
    with open(geojson_fn) as f:
        data = json.load(f)
    geometry = data['features'][0]['geometry']
    filter_geom = Filter().get_filter_geometry(geometry)
    Filters.append(filter_geom)
    Filters_names.append(filter_geom['field_name'])
    if opts.cloud:
        filter_cloud = Filter().get_filter_cloud()
        Filters.append(filter_cloud)
        Filters_names.append(filter_cloud['field_name'])
    if opts.coverage:
        filter_coverage = Filter().get_filter_coverage()
        Filters.append(filter_coverage)
        Filters_names.append(filter_coverage['field_name'])
    if opts.instrument:
        filter_instrument = Filter().get_filter_instrument()
        Filters.append(filter_instrument)
        Filters_names.append(filter_instrument['field_name'])
    if opts.date_range:
        st, ed = opts.start, opts.end
        filter_date = Filter().get_filter_date(st=st, ed=ed)
        Filters.append(filter_date)
        Filters_names.append(filter_date['field_name'])
    Final_Filters = {'type': 'AndFilter', 'config': Filters}
    print('Applied Filters: %s' % (', '.join(Filters_names)))

    # request
    Requests = Request(item_types=opts.item_types.split(','),
                       interval=opts.interval)
    Final_Requests = Requests.get_request(Final_Filters)

    # post
    client = Client()
    res = client.ses.post(client.url_stat, json=Final_Requests)
    df_stat = pd.DataFrame.from_dict(
        res.json()['buckets'])[['start_time', 'count']]
    print(df_stat)
    df_stat.to_csv(opts.output, index=False, sep='\t')
    print('also saved to %s!' % opts.output)
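
The final step only reshapes the stats response: each interval becomes one bucket with a start_time and a count. A toy sketch of that reshaping, with a hypothetical payload mirroring the 'buckets' structure used above:

import pandas as pd

buckets = [{'start_time': '2018-01-01T00:00:00Z', 'count': 12},
           {'start_time': '2019-01-01T00:00:00Z', 'count': 30}]
df_stat = pd.DataFrame.from_dict(buckets)[['start_time', 'count']]
print(df_stat)  # one row per interval bucket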