示例#1
0
文件: groupinfo.py 项目: trmznt/pys
def groupinfo(args):
    """Report how many samples fall into each group.

    Sample names are read from ``args.infile`` according to ``args.fmt``,
    handed to ``grpparser.GroupParser(args)`` for group assignment, and one
    line per group followed by a grand total is emitted through ``cout``.
    """

    source_format = args.fmt

    if source_format == 'list':
        # the whole file is a whitespace-separated list of sample names
        with gzopen(args.infile) as handle:
            samples = handle.read().split()

    elif source_format in ('pickle', 'npy'):
        # these formats are read through the nalt parser
        from seqpy.core.bioio import naltparser
        from types import SimpleNamespace
        parser = naltparser.NAltLineParser(
            SimpleNamespace(infile=args.infile, fmt=source_format, n=-1),
            with_group=False,
            with_position=False)
        samples = parser.samples

    else:
        # any other format: only the first line holds the sample names
        with gzopen(args.infile) as handle:
            samples = handle.readline().strip().split()

    groups = grpparser.GroupParser(args).assign_groups(samples)

    cout('Groups:')
    sizes = []
    for name in sorted(groups.keys()):
        sizes.append(len(groups[name]))
        cout('  %3d - %s' % (sizes[-1], name))
    cout('Total: %d samples' % sum(sizes))
示例#2
0
def ralt2nalt(args):
    """Read a ralt data set, convert it to nalt and save the result.

    The input is parsed with ``naltparser.NAltLineParser`` using
    ``datatype='ralt'``; the parsed region is converted in place by
    ``region.ralt_to_nalt`` and written by ``region.save`` using
    ``args.outfmt``, ``args.outfile`` and ``args.autofilename``.

    Returns None.
    """

    ralt_parser = naltparser.NAltLineParser(args,
                                            datatype='ralt',
                                            with_group=False,
                                            with_position=False)

    region = ralt_parser.parse_whole()

    # convert to n_alt; the dtype of M is logged before and after
    cerr('[I - converting to nalt format]')
    cerr('[ M dtype: {}]'.format(region.M.dtype))
    # when args.major is truthy, -1 is passed instead of args.hetratio
    region.ralt_to_nalt(hetratio=args.hetratio if not args.major else -1)
    cerr('[ M dtype: {}]'.format(region.M.dtype))

    # region.save() is the only output path; the older hand-written text
    # writer that used to follow an early ``return`` was unreachable and has
    # been removed.
    region.save(args.outfmt,
                prefixname=args.outfile,
                autofilename=args.autofilename,
                with_position=False)
示例#3
0
def dist2clonalqc(args):
    """Select clonal samples from a distance matrix and optionally save them.

    ``args.infile`` is a tab-separated distance matrix whose column names are
    the samples.  ``args.datafile`` is parsed with the nalt parser and, for
    each column of the genotype matrix, the number of -1 entries is counted;
    ``qual.max() - qual`` is then passed with the distances, the sample names
    and ``args.threshold`` to ``clonal_index``.  The selected sample names are
    written to ``args.outfile`` when it is set.
    """

    # distance matrix: header row carries the sample names
    dist_frame = pd.read_csv(args.infile, sep='\t')
    samples = dist_frame.columns
    D = dist_frame.values

    if not args.datafile:
        cexit('ERR: other input file has not been defined')

    else:
        # pickled ralt/nalt file supplying the per-column count of -1 values
        parser = naltparser.NAltLineParser(
            SimpleNamespace(infile=args.datafile, fmt=args.fmt, n=-1),
            with_group=False,
            with_position=False)
        genotype_region = parser.parse_whole()
        qual = np.count_nonzero(genotype_region.M == -1, axis=0)

    score = qual.max() - qual
    clonal_samples = clonal_index(D, score, samples, args.threshold)
    cerr('[I - removing %d clonal samples]' % len(clonal_samples))

    if args.outfile:
        np.savetxt(args.outfile, samples[clonal_samples], fmt='%s')
示例#4
0
def nalt2dist(args):
    """Compute a pairwise distance matrix from nalt data and write it as TSV.

    When ``args.includepos`` is set, the region is first restricted to the
    positions listed in that file (a leading ``CHROM POS`` header line is
    skipped).  The output file ``args.outfile`` starts with a tab-separated
    line of sample names, followed by the matrix formatted with ``%.6f``.
    """

    with_position = bool(args.includepos)

    nalt_parser = naltparser.NAltLineParser(
        args, with_group=False, with_position=with_position)
    whole_region = nalt_parser.parse_whole()

    if with_position:
        with open(args.includepos) as posfile:
            poslines = [row.split() for row in posfile]
        # drop the header line if there is one
        header = poslines[0]
        if header[0] == 'CHROM' and header[1] == 'POS':
            del poslines[0]
        whole_region.filter_poslines(poslines, inplace=True)

    cerr('[I - converting to haplotypes]')
    haplotypes = whole_region.haplotypes()

    cerr('[I - calculating pairwise dxy for %d samples]' % len(haplotypes))
    distm = fastdx.pwdistm(haplotypes, args.countmissing)

    cerr('[I - writing to %s]' % args.outfile)
    with open(args.outfile, 'w') as sink:
        # header line: sample names
        header_line = '\t'.join(nalt_parser.parse_samples())
        sink.write(header_line + '\n')

        # matrix body
        np.savetxt(sink, distm, delimiter='\t', fmt='%.6f')
示例#5
0
def ralt2pickle(args):
    """Parse the input as ``args.type`` data and pickle its ``df_M`` frame.

    The pickle is written to ``args.outfile`` via ``df_M.to_pickle``.
    """

    parser_options = dict(datatype=args.type,
                          with_group=False,
                          with_position=False)
    region = naltparser.NAltLineParser(args, **parser_options).parse_whole()
    region.df_M.to_pickle(args.outfile)
示例#6
0
文件: simxval.py 项目: trmznt/pys
def simxval(args):
    """Run repeated k-fold cross-validation on nalt data and write the scores.

    Steps:
      1. build the model with ``get_model(args)``;
      2. read the grouping and keep only samples whose group has more than
         two members;
      3. parse the genotype data for those samples, optionally filtering by
         ``args.mac``;
      4. call ``validate`` and write its result table to ``args.outfile``
         (tab-separated); when ``args.outsnp`` is set the SNP table is dumped
         there as YAML.

    ``args.logfile``, when set, is opened for writing, passed to ``validate``
    as ``logfh`` and closed before returning, also when an error is raised.
    """

    start_time = time.monotonic()

    model = get_model(args)

    logfh = open(args.logfile, 'w') if args.logfile else None
    try:
        cerr('[I - repeats: %d, k-fold: %d, iteration: %d, k: %s]'
             % (args.repeats, args.fold, args.iter, args.k))

        nalt_parser = naltparser.NAltLineParser(args, datatype='nalt')

        # we need group info
        nalt_parser.parse_grouping()
        group_keys = nalt_parser.group_parser.group_keys

        # remove groups whose samples are less than 3
        groups = nalt_parser.group_parser.groups
        suitable_groups = {g for g in groups if len(groups[g]) > 2}

        n_samples = len(nalt_parser.samples)
        cerr('[I - reading %s samples]' % n_samples)
        # True for every sample that belongs to a retained group
        mask = [group_keys[i] in suitable_groups for i in range(n_samples)]
        samples = list(itertools.compress(nalt_parser.samples, mask))
        group_keys = np.array(list(itertools.compress(group_keys, mask)))

        cerr('[I - masking to %d samples]' % len(samples))
        region = nalt_parser.parse_whole(mask=mask)

        if args.mac > 0:
            cerr('[I - filtering for MAC = %d]' % args.mac)
            region.filter_mac(args.mac, inplace=True)

        cerr('[I - preparing haplotypes]')
        haplotypes = region.haplotypes()

        # args.k is a comma-separated list of integers
        k_list = [int(x) for x in args.k.split(',')]
        results, snp_table = validate(
            model, haplotypes, group_keys, k_list=k_list,
            repeats=args.repeats, fold=args.fold, iteration=args.iter,
            procs=args.j, with_snp=(args.outsnp is not None), logfh=logfh)

        results.to_csv(args.outfile, sep='\t', index=False)
        cerr('[I - writing scores to %s]' % args.outfile)

        if args.outsnp:
            import yaml
            # use a context manager so the YAML file is flushed and closed
            with open(args.outsnp, 'w') as snpfh:
                yaml.dump(snp_table, snpfh)
            cerr('[I - writing SNP table to %s]' % args.outsnp)

        if logfh:
            cerr('[I - writing log to %s]' % args.logfile)

        cerr('[I - finished in %d secs with %d results]'
             % (time.monotonic() - start_time, len(results)))
    finally:
        # the log handle was previously left open for the life of the process
        if logfh:
            logfh.close()
示例#7
0
文件: lkest.py 项目: trmznt/pys
def train(args):
    """Train from nalt input via ``cross_train`` (appears to be an unfinished draft).

    NOTE(review): ``samples``, ``mask``, ``repeats`` and ``fold`` are used
    below but never assigned inside this function; unless they exist as
    module-level globals the function raises NameError.  Presumably they
    were meant to be derived from ``args`` / the parser -- confirm and
    define them.  ``model`` and ``matrix_profiles`` are assigned but not
    used afterwards, and nothing is returned.
    """

    model = get_model(args)
    # NOTE(review): the format string contains %s / %d placeholders but no
    # values are supplied, so the text is passed to cerr verbatim.
    cerr('[I - train model: %s iteration: %d')
    # NOTE(review): the keyword is spelled ``dataype`` here, while the other
    # NAltLineParser call sites visible alongside this one pass ``datatype``
    # -- looks like a typo; confirm against the NAltLineParser signature.
    nalt_parser = naltparser.NAltLineParser( args, dataype='nalt')

    # we need group info
    nalt_parser.parse_grouping()
    group_keys = nalt_parser.group_parser.group_keys


    # NOTE(review): ``samples`` and ``mask`` are not defined at this point.
    cerr('[I - masking to %d samples]' % len(samples))
    region = nalt_parser.parse_whole(mask=mask)
    haplotypes = region.haplotypes()

    # NOTE(review): ``repeats`` and ``fold`` are not defined at this point.
    matrix_profiles = cross_train( haplotypes, group_keys, repeats, fold, )
示例#8
0
def nalt2haplotypes(args):
    """Write one haplotype string per sample to ``args.outfile``.

    Each haplotype value is translated to a single symbol: value 0 becomes
    column 2 of the matching position row, value 2 becomes column 3, any
    negative value becomes ``X`` and everything else ``N``.  When
    ``args.includepos`` is set the region is first restricted to the listed
    positions (a leading ``CHROM POS`` header line is skipped).  Output lines
    are ``sample<TAB>haplotype``, or the reverse order when ``args.revfmt``
    is set.  Nothing is written when ``args.outfile`` is not set.
    """

    nalt_parser = naltparser.NAltLineParser(args, with_group=False)
    region = nalt_parser.parse_whole()
    samples = nalt_parser.parse_samples()

    poslines = None
    if args.includepos:
        with open(args.includepos) as posfile:
            poslines = [row.split() for row in posfile]
        header = poslines[0]
        if header[0] == 'CHROM' and header[1] == 'POS':
            del poslines[0]

    if poslines:
        region.filter_poslines(poslines, inplace=True)

    cerr('[I - converting to haplotypes]')
    haplotypes = region.haplotypes()
    positions = region.P

    records = []
    for sample_idx, haplotype in enumerate(haplotypes):
        symbols = []
        for snp_idx, allele in enumerate(haplotype):
            if allele == 0:
                symbol = positions[snp_idx][2]
            elif allele == 2:
                symbol = positions[snp_idx][3]
            elif allele < 0:
                symbol = 'X'
            else:
                symbol = 'N'
            symbols.append(symbol)
        records.append((samples[sample_idx], ''.join(symbols)))

    if args.outfile:
        with open(args.outfile, 'w') as sink:
            for name, sequence in records:
                # revfmt puts the haplotype string first
                if args.revfmt:
                    left, right = sequence, name
                else:
                    left, right = name, sequence
                sink.write('{}\t{}\n'.format(left, right))
示例#9
0
def read_samplefile(infile, fmt):
    """Return the sample names stored in *infile*.

    ``fmt`` selects how the file is read:
      * ``'list'``            -- the whole file, split on whitespace;
      * ``'pickle'``/``'npy'`` -- the ``samples`` attribute of a nalt parser;
      * anything else         -- the first line only, split on whitespace.
    """

    if fmt == 'list':
        with gzopen(infile) as handle:
            return handle.read().strip().split()

    if fmt not in ('pickle', 'npy'):
        # only read the first line
        with gzopen(infile) as handle:
            return handle.readline().strip().split()

    from seqpy.core.bioio import naltparser
    from types import SimpleNamespace
    parser = naltparser.NAltLineParser(
        SimpleNamespace(infile=infile, fmt=fmt, n=-1),
        with_group=False,
        with_position=False)
    return parser.samples
示例#10
0
文件: train.py 项目: trmznt/lkc-bisnp
def train(args):
    """Fit a SNP likelihood profile and store it in the profile file.

    The nalt input is parsed with its grouping, restricted to the positions
    listed in ``args.includepos`` (the first line of that file is always
    skipped), and a ``lkest.SNPLikelihoodEstimator`` is fitted on the
    resulting haplotypes and group keys.  The profile is stored under
    ``args.code`` in the pickled dictionary at ``args.profile``; the file is
    created when it does not exist yet.  An existing entry is only
    overwritten when ``args.replace`` is set, otherwise ``cexit`` is called.
    """

    nalt_parser = naltparser.NAltLineParser(args, datatype='nalt')

    nalt_parser.parse_grouping()
    group_keys = nalt_parser.group_parser.group_keys

    region = nalt_parser.parse_whole()
    # the returned sample list is not needed here; the call is kept in case
    # it has side effects on the parser state
    nalt_parser.parse_samples()

    # read the position file through a context manager so the handle is
    # closed (it used to be opened inline and never closed)
    with open(args.includepos) as f_posline:
        poslines = [line.split() for line in f_posline][1:]
    region.filter_poslines(poslines, inplace=True, sort_position=False)
    haplotypes = region.haplotypes()

    cerr('[I - fitting for {}]'.format(args.code))
    classifier = lkest.SNPLikelihoodEstimator()
    classifier.fit(haplotypes, group_keys)
    profile = classifier.get_profile()
    profile.positions = region.P
    profile.code = args.code
    profile.remark = args.remark

    # load profiles if exists
    # NOTE: pickle.load can execute arbitrary code; args.profile must be a
    # trusted file.
    try:
        with open(args.profile, 'rb') as f:
            profiles = pickle.load(f)
    except FileNotFoundError:
        profiles = {}
    if args.code in profiles and not args.replace:
        cexit('ERR: cannot replace ')
    profiles[args.code] = profile.to_dict()
    with open(args.profile, 'wb') as f:
        pickle.dump(profiles, f)

    cerr('[I - profiles saved to {}]'.format(args.profile))
示例#11
0
def nalt2qc(args):
    """Write per-sample and per-SNP missingness / heterozygosity tables.

    Two tab-separated files are written to the current directory:

        out.imiss (one row per sample):
        INDV N_SNP N_MISS F_MISS N_HETS F_HETS

        out.lmiss (one row per SNP position):
        CHROM POS N_SAMPLE N_MISS F_MISS N_HETS F_HETS

    A genotype value of -1 is counted as missing and a value of 1 as
    heterozygous.  The F_ columns are the matching count divided by N_SNP
    (imiss) or by N_SAMPLE (lmiss).
    """

    t0 = time.monotonic()
    nalt_parser = naltparser.NAltLineParser(
        args, datatype='nalt', with_group=False)
    samples = nalt_parser.parse_samples()
    whole = nalt_parser.parse_whole()
    cerr('[I - reading input file in %s secs]' % (time.monotonic() - t0))

    # running per-sample counters, columns: N_SNP, N_MISS, N_HETS
    per_sample = np.zeros(shape=(len(samples), 3))

    # per-position records for the lmiss output
    loci = []
    per_snp = []

    for pos, n_alt in whole.parse_positions():
        # every sample covered by this row has seen one more SNP
        per_sample[range(n_alt.size), 0] += 1

        is_missing = n_alt == -1
        is_het = n_alt == 1
        per_sample[is_missing, 1] += 1
        per_sample[is_het, 2] += 1

        loci.append((pos[0], pos[1]))
        per_snp.append((len(n_alt),
                        np.where(is_missing)[0].size,
                        np.where(is_het)[0].size))

    # imiss table: N_SNP, N_MISS, F_MISS, N_HETS, F_HETS
    snp_total = per_sample[:, 0]
    miss_total = per_sample[:, 1]
    het_total = per_sample[:, 2]
    imiss = np.column_stack((snp_total,
                             miss_total, miss_total / snp_total,
                             het_total, het_total / snp_total))

    # lmiss table: N_SAMPLE, N_MISS, F_MISS, N_HETS, F_HETS
    per_snp = np.array(per_snp)
    sample_total = per_snp[:, 0]
    snp_miss = per_snp[:, 1]
    snp_het = per_snp[:, 2]
    lmiss = np.column_stack((sample_total,
                             snp_miss, snp_miss / sample_total,
                             snp_het, snp_het / sample_total))

    with open('out.imiss', 'w') as iout:
        iout.write('INDV\tN_SNP\tN_MISS\tF_MISS\tN_HETS\tF_HETS\n')
        for sample, row in zip(samples, imiss):
            iout.write('{}\t{}\n'.format(sample, '\t'.join(map(str, row))))

    with open('out.lmiss', 'w') as lout:
        lout.write('CHROM\tPOS\tN_SAMPLE\tN_MISS\tF_MISS\tN_HETS\tF_HETS\n')
        for (chrom, position), row in zip(loci, lmiss):
            lout.write('{}\t{}\t{}\n'.format(
                chrom, position, '\t'.join(map(str, row))))
示例#12
0
文件: ralt2exhqc.py 项目: trmznt/pys
def read_data(args):
    """Parse the whole input and return its matrix with two index ranges.

    Returns a 3-tuple: ``region.M``, ``np.arange`` over the length of the
    first row of ``M``, and ``np.arange`` over the number of rows of ``M``.
    """

    parser = naltparser.NAltLineParser(args,
                                       with_group=False,
                                       with_position=False)
    region = parser.parse_whole()

    column_index = np.arange(len(region.M[0]))
    row_index = np.arange(len(region.M))
    return region.M, column_index, row_index
示例#13
0
文件: ralt2ralt.py 项目: trmznt/pys
def ralt2ralt( args ):
    """Filter a ralt/nalt data set by position and by sample, then save it.

    Filters are applied in this order, each only when its option is set:

      * ``args.posindex``  -- keep the SNP rows with the listed indexes, or
        else ``args.includepos`` -- keep the listed positions (a leading
        ``CHROM POS`` header line is skipped);
      * ``args.indvindex`` -- keep the samples with the listed indexes;
      * ``args.excludesample`` -- drop the samples with the listed names;
      * ``args.includesample`` -- keep the samples with the listed names;
      * ``args.completesamples`` -- keep only the entries for which
        ``get_snp_missingness()`` reports 0.0;
      * ``args.mac > 0`` -- call ``filter_mac(args.mac)``.

    The result is written by ``whole_region.save`` using ``args.outfmt``,
    ``args.outfile`` and ``args.autofilename``.  Returns None.

    Index and name files are loaded with ``ndmin=1`` so that a file holding
    a single entry still yields a 1-D array; without it ``np.loadtxt``
    returns a 0-d array, on which ``len()`` and ``[:, None]`` fail.
    """

    cerr('[I - reading input files]')

    alt_parser = naltparser.NAltLineParser(args, datatype=args.type)

    whole_region = alt_parser.parse_whole()
    samples = alt_parser.parse_samples()

    if args.posindex:
        pos_indexes = np.loadtxt(args.posindex, dtype=int, ndmin=1)
        cerr('[I - filtering for %d SNP position]' % len(pos_indexes))
        whole_region.filter_positions(pos_indexes)
    elif args.includepos:
        with gzopen(args.includepos) as f_posline:
            poslines = [ x.split() for x in f_posline ]
            if poslines[0][0] == 'CHROM' and poslines[0][1] == 'POS':
                del poslines[0]
        whole_region.filter_poslines(poslines, inplace=True)

    if args.indvindex:
        indv_indexes = np.loadtxt(args.indvindex, dtype=int, ndmin=1)
        whole_region.filter_samples(indv_indexes)
        samples = samples[indv_indexes]

    if args.excludesample:
        excluded_samples = np.loadtxt(args.excludesample, dtype=str, ndmin=1)
        # compare every sample name with every excluded name; the second
        # axis of the match gives the sample indexes
        excluded_indexes = np.where(np.array(samples) == excluded_samples[:,None])[1]
        indv_indexes = np.array( list(set( range(len(samples))) - set(excluded_indexes)) )
        cerr('[I - excluding %d samples]' % len(excluded_indexes))
        whole_region.filter_samples(indv_indexes)
        samples = samples[indv_indexes]

    if args.includesample:
        included_samples = np.loadtxt(args.includesample, dtype=str, ndmin=1)
        included_indexes = np.where(np.array(samples) == included_samples[:,None])[1]
        cerr('[I - including {} | {} out of {} samples]'.format(
            len(included_samples), len(included_indexes), len(samples)))
        whole_region.filter_samples(included_indexes)
        samples = samples[included_indexes]

    if args.completesamples:
        # only output samples without missing SNPs
        indv_missing = whole_region.get_snp_missingness()
        complete_indv = np.where(indv_missing == 0.0)[0]
        whole_region.filter_samples(complete_indv)
        samples = samples[complete_indv]

    if args.mac > 0:
        # NOTE(review): called without inplace=True and the return value is
        # discarded -- confirm that filter_mac modifies the region in place
        # by default.
        whole_region.filter_mac(args.mac)

    # save to outfile; region.save() is the only output path -- the older
    # hand-written writers that followed an early ``return`` were
    # unreachable and have been removed
    whole_region.save(args.outfmt, prefixname=args.outfile, autofilename=args.autofilename
            , with_position=True)