예제 #1
0
파일: geno2geno.py 프로젝트: trmznt/pys
def geno2geno(args):
    """Copy the genotype/position line pairs whose position is selected.

    A ``GenotypeLineParser`` is built from ``args`` and two text files named
    after ``args.outfile`` are written: ``<outfile>.geno.txt`` (sample header
    followed by the retained raw genotype lines) and ``<outfile>.pos.txt``
    (position header followed by the matching raw position lines).  A pair
    is retained when the first two whitespace-separated fields of its
    position line form a key contained in the parser's
    ``include_positions``.
    """

    parser = tabparser.GenotypeLineParser(args)

    header = parser.get_sample_header()
    prefix = args.outfile
    written = 0

    with open(prefix + '.geno.txt', 'wt') as geno_out, \
            open(prefix + '.pos.txt', 'wt') as pos_out:

        geno_out.write(header)
        geno_out.write('\n')
        pos_out.write(parser.get_position_header())
        pos_out.write('\n')

        for posline, genoline in parser.parse_raw_lines():
            fields = posline.split()
            key = (fields[0], fields[1])
            if key not in parser.include_positions:
                continue

            # raw lines are written back untouched
            geno_out.write(genoline)
            pos_out.write(posline)
            written += 1

    cerr('I: writing %d positions' % written)
예제 #2
0
파일: mpsim.py 프로젝트: trmznt/seqpy
def prepare_stratified_samples(haplotypes,
                               group_keys,
                               k_fold,
                               haplotype_func=None):
    """Replicate under-represented groups ahead of a stratified k-fold split.

    Every group in ``group_keys`` with fewer than ``2 * k_fold`` members is
    appended ``ceil(2 * k_fold / count)`` more times to both arrays.  When no
    group is too small the inputs are returned untouched (same objects).

    ``haplotype_func`` is accepted but not used by this function.

    Returns a ``(haplotypes, group_keys)`` tuple.
    """

    min_members = k_fold * 2
    unique_keys, member_counts = np.unique(group_keys, return_counts=True)

    # (group key, number of extra copies) for each undersized group
    undersized = [(key, math.ceil(min_members / n))
                  for key, n in zip(unique_keys, member_counts)
                  if n < min_members]

    if not undersized:
        # nothing to modify
        return (haplotypes, group_keys)

    cerr('[I - prepare_stratified_sample() replicated group: %s]' %
         ' '.join(x[0] for x in undersized))

    haplotype_parts = [haplotypes]
    key_parts = [group_keys]
    for key, copies in undersized:
        selector = np.where(group_keys == key)
        for _ in range(copies):
            haplotype_parts.append(haplotypes[selector])
            key_parts.append(group_keys[selector])

    return (np.concatenate(haplotype_parts, axis=0),
            np.concatenate(key_parts, axis=0))
예제 #3
0
파일: genoparser.py 프로젝트: trmznt/seqpy
    def parse_snps(self):
        """Yield ``(legends, genotype)`` for every SNP line.

        ``self.genofile`` and ``self.legendfile`` are opened and read in
        lockstep after skipping one header line in each.  For each line pair:

        * ``legends`` is the legend line with double quotes removed, split on
          whitespace (a list of strings, passed through unconverted);
        * ``genotype`` is ``self.translate(tokens)`` where ``tokens`` are the
          whitespace-separated fields of the genotype line.

        A mismatch between the number of fields and ``self.no_of_samples``
        (in the header or in any data line) is reported through ``cerr`` but
        does not stop the iteration.
        """

        with open(self.genofile) as genofile, open(self.legendfile) as legendfile:

            # sanity check for sample number in genotype file
            samples = next(genofile).strip().split()
            if len(samples) != self.no_of_samples:
                cerr('ERR: no of sample in infile does not match the sample file')

            # skip the legend header so both files stay line-aligned
            next(legendfile)

            for idx, (geno_line, legend_line) in enumerate(zip(genofile, legendfile)):
                tokens = geno_line.strip().split()
                legends = legend_line.replace('"', '').strip().split()

                if len(tokens) != self.no_of_samples:
                    cerr('ERR: line %d - no of sample in infile does not match the sample file' % (idx + 1))

                yield (legends, self.translate(tokens))
예제 #4
0
def dist2clonalqc(args):
    """Detect clonal samples from a distance matrix and optionally save them.

    ``args.infile`` is a tab-separated distance matrix whose column labels
    are the samples.  ``args.datafile`` (parsed with ``NAltLineParser`` using
    ``args.fmt``) supplies, per column, the number of entries equal to -1
    (presumably missing calls - confirm against the parser).  That count is
    inverted (``max - count``) and passed together with the distances,
    sample labels and ``args.threshold`` to ``clonal_index``.  The selected
    sample labels are written to ``args.outfile`` when it is set.
    """

    # read distance matrix
    distance_df = pd.read_csv(args.infile, sep='\t')
    samples = distance_df.columns
    D = distance_df.values

    if not args.datafile:
        cexit('ERR: other input file has not been defined')
    else:
        # read quality file or pickled ralt/nalt file
        parser_args = SimpleNamespace(infile=args.datafile,
                                      fmt=args.fmt,
                                      n=-1)
        parser = naltparser.NAltLineParser(parser_args,
                                           with_group=False,
                                           with_position=False)
        region = parser.parse_whole()
        qual = np.count_nonzero(region.M == -1, axis=0)

    # higher value = fewer -1 entries for that column
    inverted_quality = qual.max() - qual
    clonal_samples = clonal_index(D, inverted_quality, samples,
                                  args.threshold)
    cerr('[I - removing %d clonal samples]' % len(clonal_samples))

    if args.outfile:
        np.savetxt(args.outfile, samples[clonal_samples], fmt='%s')
예제 #5
0
파일: pruner.py 프로젝트: trmznt/seqpy
def prune_2(genotypes, positions, threshold=0.5, score=None):
    """Prune by r^2 except on CDS; only CDS within the same segment/region
    will be pruned.

    Parameters:
        genotypes: genotype container providing ``count_alleles()`` and
            ``len()``; also handed to ``calculate_r_2``.
        positions: per-SNP records; element ``[4]`` is compared between two
            SNPs (presumably the CDS/region label - confirm with caller).
        threshold: r^2 value above which the second SNP of a pair is dropped.
        score: optional per-SNP priority; defaults to the minimum allele
            count of each SNP.

    Returns:
        An ``int8`` array of length N holding 1 for kept SNPs and 0 for
        pruned ones.
    """

    # `is None`: comparing an array with == None is elementwise and cannot
    # be used as a condition
    if score is None:
        # we use MAC as default score
        score = np.min(genotypes.count_alleles(), axis=1)

    N = len(genotypes)

    if N != len(score) or N != len(positions):
        cexit('E: length of genotypes != length of score nor positions!')

    index = arrange_index(score)
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    # walk through index
    # NOTE(review): i is a plain 0..N-1 counter while j comes from `index`
    # (same pairing as prune_1) - confirm this matches arrange_index()'s
    # ordering contract.
    cerr('I: scanning r^2 matrix')
    for i in range(N):
        if not compress_index[i]:
            continue

        for j in index[i + 1:]:
            if r_2[i, j] > threshold:
                # a labelled SNP is only pruned against the same label
                if positions[j][4] and positions[j][4] != positions[i][4]:
                    continue
                compress_index[j] = 0

    return compress_index
예제 #6
0
파일: pruner.py 프로젝트: trmznt/seqpy
def prune_1(genotypes, threshold=0.5, score=None):
    """Prune by r^2 with score as priority, returning an indexing array.

    Parameters:
        genotypes: genotype container accepted by ``count_allele`` and
            ``calculate_r_2``; must support ``len()``.
        threshold: r^2 value above which the second SNP of a pair is dropped.
        score: optional per-SNP priority; defaults to the minimum allele
            count of each SNP.

    Returns:
        Array with the positions of the SNPs that were kept.
    """

    N = len(genotypes)

    # `is None`: comparing an array with == None is elementwise and cannot
    # be used as a condition
    if score is None:
        # we use MAC as default score
        score = np.min(count_allele(genotypes), axis=1)

    if N != len(score):
        cexit('E: length of genotypes != length of score! ({} vs {})'.format(N, len(score)))

    index = arrange_index(score)
    # 1 = keep, 0 = pruned
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    # walk through index
    # NOTE(review): i is a plain 0..N-1 counter while j comes from `index` -
    # confirm this matches arrange_index()'s ordering contract.
    cerr('[I - pruning for {} SNPs]'.format(N))
    for i in range(N):
        if not compress_index[i]:
            continue

        for j in index[i + 1:]:
            if r_2[i, j] > threshold:
                compress_index[j] = 0

    return np.nonzero(compress_index)[0]
예제 #7
0
def calculate_required_aspect(data):
    """Derive a figure size from the number of distinct ``REG`` values.

    The width is 1.37 inches per distinct value of ``data['REG']``; the
    height is ``width / 1.42 / 2``.

    Returns:
        ``(total_width, total_height)`` in inches.
    """
    total_width = 1.37 * len(data['REG'].unique())
    # 1.42 is a rough stand-in for the square root of 2
    total_height = total_width / 1.42 / 2
    # report in the same (width, height) order that is returned
    cerr('[I - Using figure size: {:.2f} x {:.2f} inches]'.format(
        total_width, total_height))

    return total_width, total_height
예제 #8
0
파일: funcs.py 프로젝트: trmznt/seqpy
def align(seqs, method=None, matrix='DNA', degap=True):
    """Align a list of sequences.

    Parameters:
        seqs: sequences to align.
        method: alignment method; for two sequences it defaults to
            ``'global_cfe'``.
        matrix: scoring matrix name passed to the pairwise aligner.
        degap: when true, both sequences are passed through ``degapped``
            before pairwise alignment.

    Returns:
        For exactly two sequences, ``(aligned_0, aligned_1, score)`` with the
        original letter case restored by ``preserve_case``.  For more than
        two sequences nothing is done yet and ``None`` is returned.

    Raises:
        RuntimeError: when fewer than two sequences are given.
    """

    if len(seqs) == 2:
        # perform pairwise alignment

        from seqpy.core.pwaligner import calign
        if degap:
            s_0 = degapped(seqs[0])
            s_1 = degapped(seqs[1])
        else:
            s_0 = seqs[0]
            s_1 = seqs[1]

        if not method:
            method = 'global_cfe'

        # the aligner works on upper-case input; case is restored below
        a_0, a_1, score = calign.aligner(s_0.upper(), s_1.upper(),
                                         method=method, matrix=matrix)
        cerr('pairwise aligned with score: %f' % score)
        return (preserve_case(s_0, a_0), preserve_case(s_1, a_1), score)

    elif len(seqs) > 2:
        # multiple sequence alignment: placeholder, not implemented here
        if method is None or method.startswith('muscle'):
            pass

    else:
        raise RuntimeError('Alignment must involve 2 or more sequences')
예제 #9
0
파일: naltparser.py 프로젝트: trmznt/seqpy
    def save(self,
             fmt,
             prefixname=None,
             autofilename=False,
             with_position=False):
        """Write the genotype matrix (and optionally positions) to disk.

        Parameters:
            fmt: ``'pickle'`` pickles ``self.df_M``; ``'npy'`` saves a
                two-element object array (column labels, values) through
                ``gzopen``; anything else writes tab-separated text.
            prefixname: output path prefix; required unless
                ``autofilename`` is set.
            autofilename: build the prefix as ``<r|n>-<ncolumns>-<nrows>``
                from the data, replacing ``prefixname``.
            with_position: also write ``self.df_P`` to
                ``<prefix>.pos.txt.gz``.

        Raises:
            ValueError: when neither ``prefixname`` nor ``autofilename`` is
                given.
        """

        # we make assumption on the type of data
        datatype = 'nalt' if self.M.dtype == np.int8 else 'ralt'

        if autofilename:
            prefixname = '%s-%d-%d' % ('r' if datatype == 'ralt' else 'n',
                                       len(self.df_M.columns), len(self.M))

        if prefixname is None:
            raise ValueError(
                'save() needs a prefixname unless autofilename is set')

        outmatrix = prefixname + ('.ralt' if datatype == 'ralt' else '.nalt')
        if fmt == 'pickle':
            outmatrix = outmatrix + '.pickle.gz'
            self.df_M.to_pickle(outmatrix)
        elif fmt == 'npy':
            outmatrix = outmatrix + '.npy.gz'
            # Fill an object array slot by slot: building it with
            # np.array([labels, values]) from differently shaped parts is
            # rejected by current NumPy.
            payload = np.empty(2, dtype=object)
            payload[0] = np.array(self.df_M.columns)
            payload[1] = self.df_M.values
            with gzopen(outmatrix, 'wb') as f:
                np.save(f, payload)
        else:
            outmatrix = outmatrix + '.txt.gz'
            self.df_M.to_csv(outmatrix, sep='\t', index=False)
        cerr('[I - writing genotype data to %s]' % outmatrix)

        if with_position:
            outpos = prefixname + '.pos.txt.gz'
            self.df_P.to_csv(outpos, sep='\t', index=False)

            cerr('[I - writing position data to %s]' % (outpos))
예제 #10
0
def geno2filtindv(args):
    """Drop samples whose share of missing calls exceeds ``args.cutoff``.

    The genotype input described by ``args`` is parsed twice: first into
    per-sample haplotypes to measure the fraction of ``b'-'`` characters,
    then line by line to write the retained columns (tab-separated, with a
    sample header) to ``args.outfile``.
    """

    cerr('I: reading genotype file')
    genoparser = tabparser.GenotypeLineParser(args)
    cerr('I: generating haplotypes')
    haplotypes = genoparser.parse_haplotypes()

    cerr('I: scanning haplotypes')
    # True = keep the sample at that column
    flags = [not (haplo.count(b'-') / len(haplo) > args.cutoff)
             for haplo in haplotypes]

    cerr('I: filtering samples')
    # the context manager guarantees the output is flushed and closed
    with open(args.outfile, 'w') as outfile:
        # a second parser is created for the line-by-line pass
        genoparser2 = tabparser.GenotypeLineParser(args)
        outfile.write('\t'.join(itertools.compress(genoparser2.samples, flags)))
        outfile.write('\n')

        for _posline, genoline in genoparser2.parse_raw_lines():
            kept = itertools.compress(genoline.strip().split(), flags)
            outfile.write('\t'.join(kept))
            outfile.write('\n')

    cerr('I: writing for %d samples' % flags.count(True))
예제 #11
0
    def __init__(self, vcffile, chroms=None, filters='', **kwargs):
        """Open ``vcffile`` and configure filters and chromosome selection.

        Parameters:
            vcffile: path of the VCF file; opened immediately and kept in
                ``self._hdl``.
            chroms: ``None``, a single name, a comma-separated string of
                names (split into a list), or any other value, which is
                stored as given.
            filters: comma-separated items; ``KEY=VALUE`` stores
                ``float(VALUE)`` under ``KEY``, a bare ``KEY`` stores
                ``True``.  Every key goes through ``self._check_keyword``.
            **kwargs: ``DP`` and ``AD`` (converted with ``int``, default 0)
                are read here; everything is forwarded to
                ``self.init_params``.
        """
        self.vcffile = vcffile
        self._hdl = open(self.vcffile)

        # parse filter
        cerr('Filters: %s' % filters)
        self.filters = {}
        for filter_item in filters.split(','):
            if not filter_item:
                continue
            if '=' in filter_item:
                k, v = filter_item.split('=', 1)
                k = k.strip()
                self._check_keyword(k)
                self.filters[k] = float(v.strip())
            else:
                filter_item = filter_item.strip()
                self._check_keyword(filter_item)
                self.filters[filter_item] = True

        self.sample_labels = None
        # only a string can carry a comma-separated list; the default None
        # (and already-split values) must pass through untouched
        if isinstance(chroms, str) and ',' in chroms:
            chroms = [c.strip() for c in chroms.split(',')]
        self.chroms = chroms
        self.installed_filters = [
            self._filter_MissingThreshold,
            self._filter_HetThreshold,
            self._filter_MAF,
            self._filter_MAC,
        ]
        self.dp = 0 if 'DP' not in kwargs else int(kwargs['DP'])
        self.ad = 0 if 'AD' not in kwargs else int(kwargs['AD'])
        self.init_params(**kwargs)
예제 #12
0
파일: debug.py 프로젝트: trmznt/insane
def D(level, text):
    """Emit a debug line through ``cerr`` when ``level >= debug_level``.

    The line carries the current time, the caller's function name, file
    name and line number, followed by ``text``.
    """
    if not level >= debug_level:
        # skip the stack inspection entirely for suppressed messages
        return

    # entry 0 is this function's own frame, entry 1 is the caller of D()
    cf = getouterframes(currentframe())[1]
    cerr("[%s] %s [%s:%s]:: %s " % (time.strftime('%H:%M:%S'),
                                    cf[3], cf[1], cf[2], text))
예제 #13
0
파일: ralt2exhqc.py 프로젝트: trmznt/pys
def do_export_ralt(M, sample_idx, site_idx, indv_idx, args):
    """Export the first ``args.s`` individuals and their filtered sites.

    The leading ``args.s`` entries of ``indv_idx`` are passed with ``M``,
    ``site_idx`` and ``mac=args.mac`` to ``filter_site_idx``.  The chosen
    individuals go to ``exhqc.indv.txt`` and the second index array returned
    by the filter goes to ``exhqc.pos.txt`` (both in the working directory).
    ``sample_idx`` is accepted but not used.
    """

    n_samples = args.s
    cerr('[I - exporting sample and position indexes for %d samples' % n_samples)

    chosen = indv_idx[:n_samples]
    # only the second returned index array is exported
    _, exported_sites = filter_site_idx(M, chosen, site_idx, mac=args.mac)

    np.savetxt('exhqc.indv.txt', chosen, fmt='%d')
    np.savetxt('exhqc.pos.txt', exported_sites, fmt='%d')
예제 #14
0
def main():
    """Command-line entry point.

    Dispatches on ``sys.argv[1]``: a path ending in ``.py`` is compiled and
    executed as a script, ``-i`` is a no-op placeholder for an interactive
    mode, and anything else is handed to ``seqpy.cmds.execute``.
    """

    greet()
    if len(sys.argv) == 1:
        # NOTE(review): the next statement indexes sys.argv[1], so usage()
        # presumably terminates the process - confirm.
        usage()

    if sys.argv[1].endswith('.py'):
        # will execute a script file
        seqpy.cerr('Attempting to run script: %s' % sys.argv[1])
        with open(sys.argv[1]) as fh:
            code = compile(fh.read(), sys.argv[1], 'exec')
            # shift argv so the script sees itself as argv[0]
            sys.argv = sys.argv[1:]
            # names defined at the script's top level are collected in _l
            _l = {}
            # exec() always returns None, so `module` carries no value
            module = exec(code, None, _l)
            if 'main' in _l:
                # publish the script's names as globals of this module so
                # the script's functions can find each other when called
                globals().update(_l)
                # local name `main` now refers to the script's main
                main = _l['main']
                if 'init_argparser' in _l:
                    # script provides its own parser: parse the remaining
                    # arguments and pass them to its main()
                    init_argparser = _l['init_argparser']
                    p = init_argparser()
                    args = p.parse_args(sys.argv[1:])
                    main(args)
                else:
                    main()

    elif sys.argv[1] == '-i':
        # interactive
        pass

    else:
        # treat the arguments as a seqpy sub-command
        from seqpy import cmds
        cmds.execute(sys.argv[1:])
예제 #15
0
파일: barplot.py 프로젝트: trmznt/pys
def barplot(args):
    """Plot one column of a table as a scatter of value against row order.

    ``args.infile`` is read with ``pandas.read_table``; ``args.column`` is
    the 1-based column number.  Rows are optionally sorted by that column
    (``args.asc`` takes precedence over ``args.desc``).  Optional
    ``args.xlabel``, ``args.ylabel`` and ``args.title`` are applied, and the
    figure is saved to ``args.outfile`` at ``args.dpi``.
    """

    cerr('I: reading data...')
    table = pandas.read_table(args.infile)

    column = table.columns[args.column - 1]
    cerr('I: selecting column %s' % column)

    if args.asc:
        cerr('I: sorting ascending...')
        table = table.sort_values(column)
    elif args.desc:
        cerr('I: sorting descending...')
        table = table.sort_values(column, ascending=False)

    heights = table[column]

    cerr('I: plotting...')
    # third positional argument of scatter is the marker size
    plt.scatter(np.arange(0, len(heights)), heights, 0.25)

    # each option maps onto the pyplot function of the same name
    for option in ('xlabel', 'ylabel', 'title'):
        text = getattr(args, option)
        if text:
            getattr(plt, option)(text)

    plt.savefig(args.outfile, dpi=args.dpi)
예제 #16
0
파일: tabparser.py 프로젝트: trmznt/seqpy
    def parse_haplotypes(self, maxline=-1):
        """Build one haplotype per sample from the genotype lines.

        Position and genotype lines are read in lockstep.  For each retained
        line the first character of every tab-separated genotype token is
        taken; the resulting site-by-sample table is then transposed.

        Parameters:
            maxline: when positive, stop after this many line pairs have
                been read (skipped lines count too).

        Returns:
            A list of UTF-8 encoded ``bytes`` objects, one per sample, e.g.
            ``[b'0000022020', b'0002020000']``.
        """

        if not self.posfile:
            self.parse_position_header()

        site_columns = []
        for lineno, (posline, genoline) in enumerate(zip(self.posfile, self.infile)):
            if 0 < maxline <= lineno:
                break

            if self.include_positions:
                fields = posline.strip('\n').split('\t')
                if (fields[0], fields[1]) not in self.include_positions:
                    continue

            # lazily yields the first character of each token of this line
            site_columns.append(
                token[0] for token in genoline.strip().split('\t'))

        cerr('I: haplotyping for %d SNP positions' % len(site_columns))

        # do transpose
        per_sample = list(zip(*site_columns))
        return [''.join(chars).encode('UTF-8') for chars in per_sample]
예제 #17
0
def txt2select(args):
    """Select positions by thresholding or ranking one or more columns.

    ``args.column`` is a comma-separated list of 1-based column numbers of
    the tab-separated table ``args.infile``.  For each column the rows with
    a value above ``args.minthreshold`` are taken, or, when no threshold is
    given, the ``args.topmax`` rows with the largest values.  The first two
    table columns of every selected row form a (CHROM, POS) pair; the
    sorted, de-duplicated pairs are written to ``args.outfile``.
    """

    table = pandas.read_table(args.infile, delimiter='\t', na_values='  nan')

    selected = set()

    for field in args.column.split(','):

        column = table.columns[int(field) - 1]
        cerr('I: selecting with column %s' % column)

        if args.minthreshold is None:
            subset = table.nlargest(args.topmax, column)
        else:
            subset = table[table[column] > args.minthreshold]

        # itertuples() puts the index at 0, so 1 and 2 are the first columns
        for row in subset.itertuples():
            selected.add((row[1], int(row[2])))

    ordered = sorted(selected)

    with open(args.outfile, 'w') as outfile:
        outfile.write('CHROM\tPOS\n')
        for key in ordered:
            outfile.write('%s\t%d\n' % key)

    cerr('I: writing %d positions' % len(ordered))
예제 #18
0
파일: mpsim.py 프로젝트: trmznt/seqpy
def cross_validate(models,
                   haplotypes,
                   group_keys,
                   repeats,
                   fold,
                   outfile,
                   outsnp=None,
                   logfile=None,
                   outpred=None,
                   procs=1):
    """Run repeated cross validation, spreading the repeats over processes.

    One argument tuple ``(group_keys, fold, seed + n)`` is prepared per
    repeat, where ``seed`` is drawn once at random, and everything is handed
    to ``run_worker`` with ``cross_validate_worker`` as the worker function.
    The elapsed time is reported through ``cerr``.
    """

    started = time.monotonic()
    cerr('[I - cross_validate() for %d model(s)]' % (len(models)))

    seed = np.random.randint(1e8)

    # anything that is not exactly an ndarray is converted
    if type(group_keys) != np.ndarray:
        group_keys = np.array(group_keys)

    arguments = [(group_keys, fold, seed + n) for n in range(repeats)]

    run_worker(models, haplotypes, arguments, cross_validate_worker, procs,
               outfile, outsnp, logfile, outpred)

    elapsed_minutes = (time.monotonic() - started) / 60
    cerr('[I - cross_validate() finished in %6.2f minute(s) at %s]' %
         (elapsed_minutes, datetime.datetime.now()))
예제 #19
0
파일: naltparser.py 프로젝트: trmznt/seqpy
    def get_position_indexes(self, poslines):
        """Translate (chrom, pos) records into row indexes of ``self.P``.

        Parameters:
            poslines: iterable of records whose first two items are the
                chromosome and the position (converted with ``int``).
                Empty records are skipped.

        Returns:
            List of indexes into ``self.P`` for the records that were found,
            in input order.  Records that are not found are reported through
            ``cerr`` and left out; a summary warning is emitted only when at
            least one record was missing.
        """

        # create dictionary of [chr][pos] = index
        d = {}
        for i, line in enumerate(self.P):
            d.setdefault(line[0], {})[line[1]] = i

        indexes = []
        counter = 0
        for line in poslines:
            if not line:
                continue
            counter += 1
            try:
                indexes.append(d[line[0]][int(line[1])])
            except KeyError:
                cerr('[I - warning: position not found: %s %s]' %
                     (line[0], line[1]))

        # only warn when something is actually missing
        if len(indexes) < counter:
            cerr('[I - warning: only found %d out of %d positions]' %
                 (len(indexes), counter))

        return indexes
예제 #20
0
파일: pos2bed.py 프로젝트: trmznt/pys
def pos2bed_microhaps(args, positions):
    """Write one BED interval per microhaplotype.

    Consecutive entries of ``positions`` that share the same sequence
    (item 0) and the same name (item ``args.namecol``) are merged into one
    interval running from the first position minus one to the last position
    (item 1, converted with ``int``).  Lines are written to ``args.outfile``
    as ``seq<TAB>start<TAB>end<TAB>name``.  Nothing is written for an empty
    input.
    """

    if args.namecol < 0:
        cexit('ERR: microhaps mode needs --namecol option!')

    with open(args.outfile, 'w') as fout:
        mh_name = ''
        mh_seq = ''
        mh_1pos = -1
        mh_2pos = -1
        have_entry = False
        for entry in positions:
            seq = entry[0]
            pos = int(entry[1])
            name = entry[args.namecol]

            if name == mh_name and mh_seq == seq:
                # same microhap: extend its end
                mh_2pos = pos
                continue

            if mh_name:
                fout.write('%s\t%d\t%d\t%s\n' %
                           (mh_seq, mh_1pos, mh_2pos, mh_name))

            # start a new microhap; its end must start at its own position,
            # otherwise a single-position microhap keeps the previous end
            mh_name = name
            mh_seq = seq
            mh_1pos = pos - 1
            mh_2pos = pos
            have_entry = True

        # flush the last microhap, but never the initial placeholder values
        if have_entry:
            fout.write('%s\t%d\t%d\t%s\n' %
                       (mh_seq, mh_1pos, mh_2pos, mh_name))

    cerr('[I - writing microhap-based BED to %s]' % args.outfile)
예제 #21
0
파일: naltparser.py 프로젝트: trmznt/seqpy
    def filter_mac(self, mac=1, inplace=True):
        """Keep only the SNPs selected by ``self.get_snpindex(mac=mac)``.

        Parameters:
            mac: minor allele count threshold handed to ``get_snpindex``.
            inplace: forwarded to ``self.filter_positions``.

        Returns:
            Whatever ``self.filter_positions(snpindex, inplace)`` returns.
        """

        # get posindex whose MAC >= mac
        snpindex = self.get_snpindex(mac=mac)
        # NOTE(review): the SNP total is taken as the number of rows of
        # self.M - confirm M is laid out as sites x samples.
        cerr('[I - filtering MAC = %d from %d SNPs to %d SNPs]' %
             (mac, len(self.M), len(snpindex)))

        return self.filter_positions(snpindex, inplace)
예제 #22
0
파일: seq2fst.py 프로젝트: trmznt/pys
def seq2fst(args):
    """Compute FST between groups of sequences.

    Sequences are loaded from ``args.infile`` and assigned to groups with a
    ``GroupParser`` built from ``args`` (``args.groupfile`` or
    ``args.metafile`` is mandatory).  Sequences without a group are reported
    and skipped.  When ``args.sitefile`` is set, per-site FST from
    ``calc_site_fst`` is written there and the function returns; otherwise
    the matrix from ``calc_fst`` is written to ``args.outfile`` under a
    header of group names.
    """

    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if not (args.groupfile or args.metafile):
        cexit('[ERR - seq2fst.py requires group information!]')
    else:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue

            # first member of a group creates its container
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)

    for grp, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (grp, len(members)))

    if args.sitefile:
        # perform FST site-wise
        FST_sites = calc_site_fst(group_seqs, args.nantozero)

        with open(args.sitefile, 'w') as fout:
            for label, mat in FST_sites:
                fout.write(label)
                fout.write('\t')
                # newline='\t' keeps the whole matrix on one output line
                np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t',
                           newline='\t')
                fout.write('\n')

        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    FST_mat, groups = calc_fst(group_seqs)

    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups))
        fout.write('\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
예제 #23
0
def vcf2ped(args):
    """Create a PED and a MAP file from a VCF and group information.

    The docstring of the original states the output is meant for isoRelate.
    ``<outprefix>.ped`` gets one line per sample: group, sample name, four
    fixed columns (``0 0 1 0``) and two allele codes per site.  Equal
    alleles are coded ``0 0`` for -1, ``2 2`` for allele 1 and ``1 1`` for
    anything else; unequal alleles are coded ``1 2``.  ``<outprefix>.map``
    gets one line per site with the distance to the previous site on the
    same chromosome scaled by 1e-6.
    """

    # open group file
    group_parser = grpparser.GroupParser(args)

    # open VCF file
    cerr('[I: reading VCF...]')
    start_time = time.monotonic()
    vcfset = allel.read_vcf(
        args.infile,
        fields=['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    cerr('[I: read %s site, %s samples in %d secs]' % (
        len(vcfset['variants/CHROM']), len(vcfset['samples']),
        time.monotonic() - start_time))

    # assign groups
    samples = vcfset['samples']
    group_parser.assign_groups(samples)
    groups = group_parser.group_keys

    # write to PED
    with open(args.outprefix + '.ped', 'w') as outf:
        for col, sample in enumerate(samples):
            outf.write('%s\t%s\t0\t0\t1\t0\t' % (groups[col], sample))
            codes = []
            for gt in vcfset['calldata/GT'][:, col]:
                allele_1, allele_2 = gt
                if allele_1 != allele_2:
                    codes.extend((1, 2))
                elif allele_1 == -1:
                    codes.extend((0, 0))
                elif allele_1 == 1:
                    codes.extend((2, 2))
                else:
                    # allele 0 and any allele above 1 share this code
                    codes.extend((1, 1))
            outf.write('\t'.join(str(code) for code in codes))
            outf.write('\n')

    # write to MAP
    with open(args.outprefix + '.map', 'w') as outf:
        prev_pos = 0
        curr_chr = None
        for chrom, pos in zip(vcfset['variants/CHROM'], vcfset['variants/POS']):
            if chrom != curr_chr:
                # distances restart on every chromosome change
                curr_chr = chrom
                prev_pos = 0
            dist = (pos - prev_pos) * 1e-6
            prev_pos = pos
            outf.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, dist, pos))
예제 #24
0
def filter_imiss(M, site_idx, sample_idx, imiss):
    """Drop the columns (samples) of ``M`` with the most negative entries.

    For every column the count of negative entries is divided by
    ``len(site_idx)``.  Columns are kept when that fraction does not exceed
    ``(1 - imiss)`` times the largest fraction observed.

    Returns:
        ``(M2, site_idx, sample_idx2)`` - the reduced matrix, the untouched
        site index and the reduced sample index.
    """

    cerr('[I - filtering for sample missingness < %4.3f]' % imiss)
    check_sanity(M, site_idx, sample_idx)

    missing_fraction = np.count_nonzero(M < 0, axis=0) / len(site_idx)
    # the cutoff is relative to the worst column, not an absolute fraction
    cutoff = (1.0 - imiss) * missing_fraction.max()
    keep = np.where(missing_fraction <= cutoff)[0]

    M2 = M[:, keep]
    sample_idx2 = sample_idx[keep]
    cerr('[I - keeping %d from %d samples]' % (len(sample_idx2), len(sample_idx)))

    return M2, site_idx, sample_idx2
예제 #25
0
파일: fas2table.py 프로젝트: trmznt/pys
def fas2table(args):
    """Write a per-sequence mutation table.

    ``args.infile`` and ``args.reffile`` are loaded and passed to
    ``generate_table``.  Each ``(label, muts)`` pair it yields becomes one
    line of ``args.outfile``: the label followed by ``/``, a tab, and the
    space-joined mutations.
    """

    alignment = load(args.infile)
    reference = load(args.reffile)

    rows = generate_table(alignment, reference)

    with open(args.outfile, 'w') as fout:
        for label, muts in rows:
            joined = ' '.join(muts)
            fout.write('%s/\t%s\n' % (label, joined))

    cerr('[Writing table to %s]' % args.outfile)
예제 #26
0
파일: geno2fst.py 프로젝트: trmznt/pys
def geno2fst(args):
    """Write per-site Hudson FST of every group against all other samples.

    The genotype input described by ``args`` is parsed with the diploid
    translator.  For each site the output line holds CHROM, POS, REGION
    (items 0, 1 and 4 of the position record), the max, mean and median of
    the per-group FST values, the minor allele frequency, and one FST value
    per group (groups in sorted order).  FST values outside [0, 1],
    including NaN, are written as 0.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # output to file
    cout('Writing outfile...')

    # sample indexes outside each group; identical for every site, so they
    # are built once, on the first site
    others = None

    # the context manager guarantees the output is flushed and closed
    with open(args.outfile, 'w') as outfile:

        outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n' % '\t'.join(group_keys))

        for idx, (posinfo, genolist) in enumerate(lineparser.parse(), 1):

            genoarray = allel.GenotypeArray([genolist])

            # calculate MAF
            ac = genoarray.count_alleles()
            minor = np.min(ac)
            total = np.sum(ac)
            # equal counts cover both the single-allele and the empty site
            maf = 0 if minor == total else minor / total

            if others is None:
                others = {
                    g: list(lineparser.sample_idx - set(groups[g]))
                    for g in group_keys
                }

            # calculate FST per group against other samples
            fst_sites = []
            for g in group_keys:
                ac_g = genoarray.count_alleles(subpop=groups[g])
                ac_ng = genoarray.count_alleles(subpop=others[g])
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                fst = num[0] / den[0]
                if not (0.0 <= fst <= 1.0):
                    fst = 0
                fst_sites.append(fst)

            if idx % 100 == 0:
                cerr('I: writing position no %d' % idx)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4],
                           np.max(fst_sites), np.mean(fst_sites),
                           np.median(fst_sites), maf,
                           '\t'.join('%5.4f' % x for x in fst_sites)))
예제 #27
0
def filter_lmiss(M, site_idx, sample_idx, lmiss):
    """Drop the rows (sites) of ``M`` with the most negative entries.

    For every row the count of negative entries is divided by
    ``len(sample_idx)``.  Rows are kept when that fraction does not exceed
    ``(1 - lmiss)`` times the largest fraction observed.

    Returns:
        ``(M2, site_idx2, sample_idx)`` - the reduced matrix, the reduced
        site index and the untouched sample index.
    """

    cerr('[I - filtering for SNP missingness < %4.3f]' % lmiss)
    check_sanity(M, site_idx, sample_idx)

    missing_fraction = np.count_nonzero(M < 0, axis=1) / len(sample_idx)
    # the cutoff is relative to the worst row, not an absolute fraction
    cutoff = (1.0 - lmiss) * missing_fraction.max()
    keep = np.where(missing_fraction <= cutoff)[0]

    M2 = M[keep, :]
    site_idx2 = site_idx[keep]
    cerr('[I - keeping %d from %d sites]' % (len(site_idx2), len(site_idx)))

    return M2, site_idx2, sample_idx
예제 #28
0
def ralt2nalt(args):
    """Convert a ralt matrix to nalt format and save it.

    The input described by ``args`` is parsed as a whole with
    ``NAltLineParser(datatype='ralt')``, converted in place with
    ``ralt_to_nalt`` (using ``args.hetratio``, or -1 when ``args.major`` is
    set) and written through ``region.save`` with ``args.outfmt``,
    ``args.outfile`` as prefix and ``args.autofilename``.  Positions are not
    written.
    """

    ralt_parser = naltparser.NAltLineParser(args,
                                            datatype='ralt',
                                            with_group=False,
                                            with_position=False)

    region = ralt_parser.parse_whole()

    # convert to n_alt
    cerr('[I - converting to nalt format]')
    cerr('[ M dtype: {}]'.format(region.M.dtype))
    region.ralt_to_nalt(hetratio=args.hetratio if not args.major else -1)
    cerr('[ M dtype: {}]'.format(region.M.dtype))

    # region.save() is the only output path; the former hand-written text
    # export placed after the return could never run and has been removed
    region.save(args.outfmt,
                prefixname=args.outfile,
                autofilename=args.autofilename,
                with_position=False)
예제 #29
0
def ralt2iterqc(args):
    """Iteratively apply the site, sample and MAC filters to a ralt table.

    ``args.infile`` is read as a tab-separated float table (limited to
    ``args.n`` rows when positive).  Up to ``args.iter`` rounds are run;
    in each round the filters whose threshold (``args.lmiss``,
    ``args.imiss``, ``args.mac``) is positive are applied in that order.
    The loop stops early once a round changes neither the number of sites
    nor the number of samples.
    """

    cerr('[I - reading input files]')

    start_time = time.monotonic()
    row_limit = args.n if args.n > 0 else None
    df = pd.read_csv(args.infile, sep='\t', dtype=float, nrows=row_limit)
    samples = df.columns
    sample_idx = np.arange(len(samples))
    M = df.values
    site_idx = np.arange(len(M))

    cerr('[I - reading %d sites for %d samples in %d secs]'
         % (len(site_idx), len(sample_idx), time.monotonic() - start_time))

    for round_no in range(1, args.iter + 1):
        cerr('[I - ITER -> %d]' % round_no)
        # sizes before this round, used for the convergence check
        size_before = (len(site_idx), len(sample_idx))

        if args.lmiss > 0:
            M, site_idx, sample_idx = filter_lmiss(M, site_idx, sample_idx, args.lmiss)
        if args.imiss > 0:
            M, site_idx, sample_idx = filter_imiss(M, site_idx, sample_idx, args.imiss)
        if args.mac > 0:
            M, site_idx, sample_idx = filter_mac(M, site_idx, sample_idx, args.mac)

        if size_before == (len(site_idx), len(sample_idx)):
            cerr('[I - filtering has converged]')
            break
예제 #30
0
    def assign_groups(self, samples):
        """Map every sample to its group and assign a colour to each group.

        ``self.group_info`` (filled by ``self.parse()`` when empty) gives the
        group of each sample code.  The method stores ``self.samples``,
        ``self.sample_idx`` (set of all sample positions), ``self.groups``
        (group -> list of sample positions) and ``self.group_keys`` (group of
        each sample, in input order).

        Colours come from ``self.colourfile`` when present (tab-separated,
        one header line, group in the first field and colour in the second);
        every group must then have a colour.  Otherwise colours are taken in
        turn from ``colour_list``, reusing them when there are more groups
        than colours.

        Returns:
            ``self.groups``.
        """

        if not self.group_info:
            self.parse()

        groups = {}
        sample_idx = set()
        group_keys = []
        for position, code in enumerate(samples):
            grp_key = self.group_info[code]
            groups.setdefault(grp_key, []).append(position)
            sample_idx.add(position)
            group_keys.append(grp_key)

        self.samples = samples
        self.sample_idx = sample_idx
        self.groups = groups
        self.group_keys = group_keys

        if not self.colourfile:
            colour_wheel = cycle(colour_list)
            for k in sorted(self.groups):
                self.group_colours[k] = next(colour_wheel)

            n_groups = len(self.groups)
            if n_groups > len(colour_list):
                cerr(
                    "W: warning, no of groups (%d) exceeds available colour list!"
                    % n_groups)

        else:
            # parse colour file from the start, skipping its header line
            self.colourfile.seek(0)
            next(self.colourfile)
            for line in self.colourfile:
                tokens = line.strip().split('\t')
                self.group_colours[tokens[0]] = tokens[1]

            # checking whether all groups has been assigned with colours
            for k in self.groups:
                if k not in self.group_colours:
                    cexit('E: group %s is not assigned' % k)

            cerr('[I: assigning manual colours to %d groups]' %
                 (len(self.group_colours)))

        return self.groups