Exemplo n.º 1
0
 def get_sample_header(self, bytestring=False):
     """Return the sample names joined by tab characters.

     When ``bytestring`` is true the joined header is returned UTF-8
     encoded; otherwise it is returned as ``str``.  ``cexit`` is called
     when ``self.samples`` is empty.
     """
     if not self.samples:
         cexit('E: need to parse sample header first')
     joined = '\t'.join(self.samples)
     return joined.encode('UTF-8') if bytestring else joined
Exemplo n.º 2
0
def pos2bed_microhaps(args, positions):
    """Write one BED interval per microhaplotype to ``args.outfile``.

    Consecutive entries of ``positions`` that share both the sequence name
    (column 0) and the microhap name (column ``args.namecol``) are merged
    into a single interval.  The interval starts at the first position
    minus one and ends at the last position of the run (column 1 of each
    entry, parsed as an integer).

    Calls ``cexit`` when ``args.namecol`` is negative.  Nothing is written
    for an empty ``positions``.
    """

    if args.namecol < 0:
        cexit('ERR: microhaps mode needs --namecol option!')

    bed_line = '%s\t%d\t%d\t%s\n'

    with open(args.outfile, 'w') as fout:
        mh_name = ''
        mh_seq = ''
        mh_1pos = -1
        mh_2pos = -1
        for entry in positions:
            seq = entry[0]
            pos = int(entry[1])
            name = entry[args.namecol]

            if name == mh_name and mh_seq == seq:
                # still inside the current microhap: extend its end
                mh_2pos = pos
                continue

            # a new microhap starts here: flush the previous one, if any
            if mh_name:
                fout.write(bed_line % (mh_seq, mh_1pos, mh_2pos, mh_name))

            mh_name = name
            mh_seq = seq
            mh_1pos = pos - 1
            # reset the end as well, otherwise a single-position microhap
            # would be written with the end of the previous interval
            mh_2pos = pos

        # flush the last microhap; skipped when nothing was collected
        if mh_name:
            fout.write(bed_line % (mh_seq, mh_1pos, mh_2pos, mh_name))

    cerr('[I - writing microhap-based BED to %s]' % args.outfile)
Exemplo n.º 3
0
def dist2clonalqc(args):
    """Identify clonal samples from a distance matrix and optionally save them.

    Reads the tab-delimited matrix ``args.infile`` (column labels are used
    as sample names), derives a per-column count of ``-1`` entries from
    ``args.datafile`` and passes both to ``clonal_index`` together with
    ``args.threshold``.  The selected sample names are written to
    ``args.outfile`` when it is set.  ``cexit`` is called when
    ``args.datafile`` is not provided.
    """

    # distance matrix; its column labels double as sample names
    dist_df = pd.read_csv(args.infile, sep='\t')
    samples = dist_df.columns
    D = dist_df.values

    if args.datafile:
        # quality file or pickled ralt/nalt file
        parser_args = SimpleNamespace(infile=args.datafile,
                                      fmt=args.fmt,
                                      n=-1)
        parser = naltparser.NAltLineParser(parser_args,
                                           with_group=False,
                                           with_position=False)
        whole = parser.parse_whole()
        # number of -1 entries along axis 0 of the parsed matrix
        qual = np.count_nonzero(whole.M == -1, axis=0)

    else:
        cexit('ERR: other input file has not been defined')

    # invert the counts so that fewer -1 entries give a larger value
    inverted_qual = qual.max() - qual
    clonal_samples = clonal_index(D, inverted_qual, samples, args.threshold)
    cerr('[I - removing %d clonal samples]' % len(clonal_samples))
    if args.outfile:
        np.savetxt(args.outfile, samples[clonal_samples], fmt='%s')
Exemplo n.º 4
0
def prune_2(genotypes, positions, threshold=0.5, score=None):
	""" prune by r^2 except on CDS, only CDS within the same segment/region will be pruned

		genotypes: genotype container; must provide count_alleles() when
			score is not given
		positions: per-site records; field 4 of each record is compared
			between sites (presumably the CDS/segment label - confirm)
		threshold: r^2 value above which the second site is dropped
		score: optional per-site priority score, defaults to the minimum
			allele count of each site

		returns an int8 array of length N holding 1 for kept sites and 0
		for pruned ones; cexit() is called on inconsistent input lengths
	"""

	# identity test: score may be an array, and `== None` on an array is
	# evaluated element-wise and cannot be used as a condition
	if score is None:
		# we use MAC as default score
		score = np.min( genotypes.count_alleles(), axis=1)

	N = len(genotypes)

	if N != len(score) or N != len(positions):
		cexit('E: length of genotypes != length of score nor positions!')

	index = arrange_index( score )
	compress_index = np.ones( len(index), dtype=np.int8 )

	# calculate r^2
	r_2 = calculate_r_2( genotypes )

	# walk through index
	# NOTE(review): i runs over natural site order while j comes from the
	# arranged index, exactly as in prune_1 - confirm this is intended
	cerr('I: scanning r^2 matrix')
	for i in range(N):
		if not compress_index[i]:
			continue

		# the original iterated an undefined name `scoring_index`; the
		# arranged index is used instead, mirroring prune_1
		for j in index[i+1:]:
			if r_2[i,j] > threshold:
				# check if this is a CDS region
				if positions[j][4] and positions[j][4] != positions[i][4]:
					continue
				compress_index[j] = 0

	return compress_index
Exemplo n.º 5
0
def prune_1(genotypes, threshold=0.5, score=None):
	""" prune by r^2 with score as priority,
		returning indexing array

		genotypes: genotype container accepted by count_allele() and
			calculate_r_2()
		threshold: r^2 value above which the second site is dropped
		score: optional per-site priority score, defaults to the minimum
			allele count of each site

		returns the positions of the sites that were kept, as given by
		np.nonzero(); cexit() is called when score and genotypes differ
		in length
	"""

	N = len(genotypes)

	# identity test: score may be an array, and `== None` on an array is
	# evaluated element-wise and cannot be used as a condition
	if score is None:
		# we use MAC as default score
		score = np.min( count_allele(genotypes), axis=1)

	if N != len(score):
		cexit('E: length of genotypes != length of score! ({} vs {})'.format(N, len(score)))

	index = arrange_index( score )
	compress_index = np.ones( len(index), dtype=np.int8 )

	# calculate r^2
	r_2 = calculate_r_2( genotypes )

	# walk through index
	# NOTE(review): i runs over natural site order while j comes from the
	# arranged index - confirm this is the intended priority walk
	cerr('[I - pruning for {} SNPs]'.format(N))
	for i in range(N):
		if not compress_index[i]:
			continue

		for j in index[i+1:]:
			if r_2[i,j] > threshold:
				compress_index[j] = 0

	return np.nonzero(compress_index)[0]
Exemplo n.º 6
0
    def __init__(self,
                 model_id,
                 k,
                 guide_tree=None,
                 min_fst=0.9,
                 ultimate_fst=1.0,
                 priority=None,
                 max_leaf_snp=0,
                 snpindex=None,
                 iteration=1,
                 seed=None):
        """
            model_id, k, snpindex, iteration, seed: forwarded unchanged to
                        the parent class initializer
            guide_tree: required; cexit() is called when it is None
            min_fst: the minimum FST before using select_2() (alternate SNP selection)
            ultimate_fst: the minimum FST to be prioritized for selection
            priority: index of SNPs to be prioritized for selection (after filtering
                        by ultimate_fst)
            max_leaf_snp: stored on the instance as given
        """
        super().__init__(model_id, k, snpindex, iteration, seed)

        if guide_tree is None:
            cexit('[E - HierarchicalFSTSelector requires guide tree]')
        self.guide_tree = guide_tree
        self.min_fst = min_fst
        self.priority = priority
        self.max_leaf_snp = max_leaf_snp
        self.ultimate_fst = ultimate_fst
Exemplo n.º 7
0
def check_sanity(M, site_idx, sample_idx):
    """Verify that ``M.shape`` agrees with the two index sequences.

    ``M.shape[1]`` must equal ``len(sample_idx)`` and ``M.shape[0]`` must
    equal ``len(site_idx)``; ``cexit`` is called on the first mismatch.
    """
    dims = M.shape
    n_samples = len(sample_idx)

    # samples are checked first; the offending sizes are echoed to stdout
    if dims[1] != n_samples:
        print(n_samples, dims)
        cexit('[E - inconsistent M shape and no of samples!]')

    if dims[0] != len(site_idx):
        cexit('[E - inconsistent M shape and no of sites!]')
Exemplo n.º 8
0
def seq2fst(args):
    """Compute FST between sample groups of a sequence file.

    Sequences are loaded from ``args.infile`` and binned by the group that
    the group parser assigns to each decoded sequence label; sequences
    without a group are reported and skipped.  When ``args.sitefile`` is
    set, site-wise FST is written there and the function returns;
    otherwise the group-by-group FST matrix is written to ``args.outfile``
    under a tab-joined header of group names.  ``cexit`` is called when
    neither ``args.groupfile`` nor ``args.metafile`` is given.
    """

    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue
            # first member of a group creates its container
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for grp, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (grp, len(members)))

    if args.sitefile:
        # perform FST site-wise
        FST_sites = calc_site_fst(group_seqs, args.nantozero)

        with open(args.sitefile, 'w') as site_out:
            for label, mat in FST_sites:
                site_out.write(label)
                site_out.write('\t')
                # values go on one line, each followed by a tab
                np.savetxt(site_out,
                           mat,
                           fmt='%5.4f',
                           delimiter='\t',
                           newline='\t')
                site_out.write('\n')

        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    FST_mat, groups = calc_fst(group_seqs)

    with open(args.outfile, 'w') as matrix_out:
        matrix_out.write('\t'.join(groups))
        matrix_out.write('\n')
        np.savetxt(matrix_out, FST_mat, fmt='%5.4f', delimiter='\t')
Exemplo n.º 9
0
def execute(args):
    """Run the ``seqpy.cmds`` sub-module named by the first argument.

    ``args[0]`` selects the command module; the remaining items are parsed
    with the parser returned by that module's ``init_argparser()`` and the
    result is handed to the module's ``main()``.  ``usage()`` is called
    when ``args`` is empty.
    """
    if len(args) < 1:
        usage()

    command = args[0]
    module_path = 'seqpy.cmds.' + command
    cmd_module = importlib.import_module(module_path)
    print(cmd_module)

    parser = cmd_module.init_argparser()
    if not parser:
        seqpy.cexit('Fatal ERR: init_argparser() does not return properly')

    parsed = parser.parse_args(args[1:])
    cmd_module.main(parsed)
Exemplo n.º 10
0
    def __init__(self, args):
        """Store the position-file settings taken from ``args``.

        ``args.posfile`` is required (``cexit`` is called when it is
        None) and ``args.n`` is kept as ``self.n``.  All other attributes
        start out as None.
        """

        # positions
        if args.posfile is None:
            cexit('ERROR: required --posfile')
        self.posfilename = args.posfile

        # nothing has been opened or parsed yet
        for attr in ('posfile', 'posfile_header', 'positions', 'header'):
            setattr(self, attr, None)

        self.n = args.n

        for attr in ('df', 'M'):
            setattr(self, attr, None)
Exemplo n.º 11
0
    def assign_groups(self, samples):
        """Map each sample to its group and give every group a colour.

        Looks up every code of ``samples`` in ``self.group_info`` (calling
        ``self.parse()`` first when that mapping is empty) and records
        ``self.samples``, ``self.sample_idx``, ``self.groups`` (group key
        -> list of sample positions) and ``self.group_keys`` (group key
        per sample, in order).  Colours come from ``self.colourfile`` when
        it is set, otherwise they are taken in turn from ``colour_list``
        over the sorted group keys.

        Returns ``self.groups``.
        """

        if not self.group_info:
            self.parse()

        groups = {}
        group_keys = []
        sample_idx = set()
        for position, code in enumerate(samples):
            key = self.group_info[code]
            groups.setdefault(key, []).append(position)
            sample_idx.add(position)
            group_keys.append(key)

        self.samples = samples
        self.sample_idx = sample_idx
        self.groups = groups
        self.group_keys = group_keys

        if not self.colourfile:
            # automatic colours, reused cyclically when groups outnumber them
            colour_wheel = cycle(colour_list)
            for key in sorted(self.groups.keys()):
                self.group_colours[key] = next(colour_wheel)

            n_groups = len(self.groups.keys())
            if n_groups > len(colour_list):
                cerr(
                    "W: warning, no of groups (%d) exceeds available colour list!"
                    % n_groups)

            return self.groups

        # parse colour file: rewind, skip the first line, then read
        # tab-separated "group<TAB>colour" records
        self.colourfile.seek(0)
        next(self.colourfile)
        for line in self.colourfile:
            fields = line.strip().split('\t')
            self.group_colours[fields[0]] = fields[1]

        # checking whether all groups has been assigned with colours
        for key in self.groups:
            if key not in self.group_colours:
                cexit('E: group %s is not assigned' % key)

        cerr('[I: assigning manual colours to %d groups]' %
             (len(self.group_colours)))

        return self.groups
Exemplo n.º 12
0
    def parse_np_haplotypes(self, maxline=-1):
        """ this return a numpy int8 array of haplotypes, one row per sample
        and one column per parsed genotype line, e.g.
        [   [0, 0, 0, 0, 2, 2, 0, 2, 0],
            [0, 0, 0, 2, 0, 0, -1, 0, -1] ]

        Only the first character of each tab-separated token is used:
        '0', '1' and '2' map to themselves and '-' maps to -1.

        When self.include_positions is set, only lines whose
        (posinfo[0], posinfo[1]) pair is in it are kept and maxline is
        not consulted; otherwise up to maxline lines are read (all
        parsed positions when maxline <= 0).
        """

        token2value = {'0': 0, '1': 1, '2': 2, '-': -1}

        S = len(self.samples)

        def as_column(tokens):
            # value of the first character of every token
            return [token2value[tok[0]] for tok in tokens]

        if self.include_positions:
            M = np.zeros((S, len(self.include_positions)), np.int8)

            col = 0
            for posline, genoline in zip(self.posfile, self.infile):
                posinfo = posline.strip('\n').split('\t')
                if (posinfo[0], posinfo[1]) not in self.include_positions:
                    continue

                tokens = genoline.strip().split('\t')
                if len(tokens) != S:
                    cexit('E: inconsistent number of samples!')

                M[:, col] = as_column(tokens)
                col += 1

        else:
            # we need to parse positions first
            positions = self.parse_position()
            L = maxline if maxline > 0 else len(positions)
            M = np.zeros((S, L), np.int8)

            for idx, genoline in enumerate(self.infile):
                if idx >= L:
                    break

                tokens = genoline.strip().split('\t')
                if len(tokens) != S:
                    raise RuntimeError('E: inconsistent number of samples!')

                M[:, idx] = as_column(tokens)

        return M
Exemplo n.º 13
0
    def __init__(self,
                 model_id,
                 k,
                 guide_tree=None,
                 min_fst=0.9,
                 priority=None,
                 max_leaf_snp=0,
                 snpindex=None,
                 iteration=1,
                 seed=None):
        """Initialise the selector.

        model_id, k, snpindex, iteration and seed are forwarded to the
        parent initializer; guide_tree is required (cexit() is called
        when it is None); min_fst, priority and max_leaf_snp are stored
        on the instance as given.
        """
        super().__init__(model_id, k, snpindex, iteration, seed)

        if guide_tree is None:
            cexit('[E - HierarchicalFSTSelector requires guide tree]')

        self.guide_tree, self.min_fst = guide_tree, min_fst
        self.priority, self.max_leaf_snp = priority, max_leaf_snp
Exemplo n.º 14
0
    def parse(self):
        """ this is a generator, returning (pos_info, geno_array)

        Position and genotype files are read in lockstep, one line each.
        pos_info is the tab-split position line and geno_array is the
        result of self.translate() on the tab-split genotype line.
        cexit() is called when a genotype line does not have one token
        per sample.
        """

        # open the position file on first use
        if not self.posfile:
            self.parse_position_header()

        n_samples = len(self.samples)
        paired = zip(self.posfile, self.infile)

        for idx, (pos_line, geno_line) in enumerate(paired):
            tokens = geno_line.strip().split('\t')
            pos_info = pos_line.strip('\n').split('\t')

            if len(tokens) != n_samples:
                cexit(
                    'E: genotype file does not match sample number at line %d'
                    % idx)

            yield (pos_info, self.translate(tokens))
Exemplo n.º 15
0
Arquivo: lkest.py Projeto: trmznt/pys
def lkest(args, config=None):
    """Run the function registered in ``mode`` under ``args.mode``.

    A truthy ``'mode'`` entry in ``config`` (when given) overrides
    ``args.mode``.  ``cexit`` is called for an unknown mode.  The elapsed
    time in minutes is reported through ``cerr`` once the function returns.
    """

    # check for config as dictionary
    if config:

        # general
        configured_mode = config.get('mode', None)
        args.mode = configured_mode or args.mode

    try:
        func = mode[args.mode]
    except KeyError:
        cexit('[E - mode %s does not exist!]' % args.mode)

    # run mode
    start_time = time.monotonic()
    func(args)
    elapsed_minutes = (time.monotonic() - start_time) / 60
    cerr('[I - finished in %6.2f minute(s) at %s]'
            % (elapsed_minutes, datetime.datetime.now()))
Exemplo n.º 16
0
Arquivo: lkest.py Projeto: trmznt/pys
def _read_guide_tree(path):
    # Parse the guide tree stored at *path*, closing the file afterwards.
    # NOTE(review): assumes parse_guide_tree() consumes the stream before
    # returning - confirm it does not read lazily from the file object.
    with open(path) as tree_file:
        return parse_guide_tree(tree_file)


def get_model(args):
    """Return the selector instance that matches ``args.method``.

    Recognised methods are ``'rand'``, ``'dt'``, ``'hfst'``, ``'hfst+dt'``
    and ``'list'``.  The two ``hfst`` variants read their guide tree from
    ``args.guidetree``; ``'list'`` takes ``args.snpfile``.  ``cexit`` is
    called for any other value.
    """

    method = args.method

    if method == 'rand':
        return RandomSelector()

    elif method == 'dt':
        return DecisionTreeSelector()

    elif method == 'hfst':
        return HierarchicalFSTSelector(
            guide_tree=_read_guide_tree(args.guidetree))

    elif method == 'hfst+dt':
        return HHFSTDTSelector(
            guide_tree=_read_guide_tree(args.guidetree))

    elif method == 'list':
        return FixSNPSelector(snpfile=args.snpfile)

    else:
        cexit('ERR: please provide method')
Exemplo n.º 17
0
    def parse_all(self, maxline=-1):
        """ this return a full array from the data
            as such, ensure that the memory is big enough before calling this method

            Every genotype line is tab-split and passed through
            self.translate(); reading stops after maxline lines when
            maxline > 0.  self.parse_position(maxline) is called once the
            genotypes have been read.  cexit() is called when a line does
            not have one token per sample.
        """
        rows = []
        limited = maxline > 0

        for lineno, raw in enumerate(self.infile):
            if limited and lineno >= maxline:
                break

            fields = raw.strip().split('\t')

            if len(fields) != len(self.samples):
                cexit(
                    'E: genotype file does not match sample number at line %d'
                    % lineno)

            rows.append(self.translate(fields))

        self.parse_position(maxline)
        return rows
Exemplo n.º 18
0
def geno2prune(args):
    """Prune genotype positions region by region and write the survivors.

    The genotype file described by ``args`` is parsed with the haploid
    translator and split according to ``args.region`` (``'chrom'``,
    ``'whole'`` or ``'gene'``).  Each region is pruned with the scheme
    selected by ``args.scheme`` (1, 2, 21 or 3) at ``args.threshold``, and
    the positions that remain are appended to ``args.outfile`` below the
    position header.  ``cexit`` is called for an unknown scheme.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)

    cerr('I: start parsing genotype file...')

    # the context manager guarantees the output is flushed and closed,
    # including when pruning fails part-way through
    with open(args.outfile, 'w') as outfile:
        outfile.write(lineparser.get_position_header())
        outfile.write('\n')

        iter_func = {
            'chrom': lineparser.parse_chromosomes,
            'whole': lineparser.parse_whole,
            'gene': lineparser.parse_genes
        }[args.region]

        for region in iter_func():
            cerr('I: generating genotype array for %s' % region.name)
            genoarray = allel.GenotypeArray(region.genotypes())

            cerr('I: pruning for %s' % region.name)
            if args.scheme == 1:
                index = pruner.prune_1(genoarray, args.threshold)
            elif args.scheme == 2:
                index = pruner.prune_2(genoarray, region.positions(),
                                       args.threshold)
            elif args.scheme == 21:
                index = pruner.prune_21(genoarray, region.positions(),
                                        args.threshold)
            elif args.scheme == 3:
                index = pruner.prune_3(genoarray, region.positions(),
                                       args.threshold)
            else:
                cexit('E: scheme type undefined!')

            # NOTE(review): itertools.compress() treats `index` as a
            # per-position truth mask - confirm every pruner scheme
            # returns a mask rather than a list of kept positions
            new_positions = itertools.compress(region.positions(), index)

            for pos in new_positions:
                outfile.write('%s\n' % '\t'.join(pos))
Exemplo n.º 19
0
def train(args):
    """Fit a SNP likelihood profile and store it in the profile file.

    Genotypes are read through ``NAltLineParser``, restricted to the
    positions listed in ``args.includepos`` (its first line is skipped),
    and used together with the parser's group keys to fit an
    ``lkest.SNPLikelihoodEstimator``.  The resulting profile is saved under
    ``args.code`` in the pickled dictionary at ``args.profile``, which is
    created when it does not exist yet.  ``cexit`` is called when the code
    is already present and ``args.replace`` is not set.
    """

    # load profiles if exists

    nalt_parser = naltparser.NAltLineParser(args, datatype='nalt')

    nalt_parser.parse_grouping()
    group_keys = nalt_parser.group_parser.group_keys

    region = nalt_parser.parse_whole()
    # the result is not used here; the call is kept in case
    # parse_samples() updates parser state - TODO confirm
    nalt_parser.parse_samples()

    # read the position list, dropping its first line; the file is
    # closed as soon as it has been read
    with open(args.includepos) as posfile:
        poslines = [line.split() for line in posfile][1:]
    region.filter_poslines(poslines, inplace=True, sort_position=False)
    haplotypes = region.haplotypes()

    cerr('[I - fitting for {}]'.format(args.code))
    classifier = lkest.SNPLikelihoodEstimator()
    classifier.fit(haplotypes, group_keys)
    profile = classifier.get_profile()
    profile.positions = region.P
    profile.code = args.code
    profile.remark = args.remark

    # NOTE: unpickling can execute arbitrary code; only point
    # args.profile at files from a trusted source
    try:
        with open(args.profile, 'rb') as f:
            profiles = pickle.load(f)
    except FileNotFoundError:
        profiles = {}
    if args.code in profiles and not args.replace:
        cexit('ERR: cannot replace existing profile: {}'.format(args.code))
    profiles[args.code] = profile.to_dict()
    with open(args.profile, 'wb') as f:
        pickle.dump(profiles, f)

    cerr('[I - profiles saved to {}]'.format(args.profile))
Exemplo n.º 20
0
    def parse(self):
        """Build ``self.group_info``, a mapping of sample -> group.

        With ``self.groupfile`` the mapping comes from a YAML/JSON document
        of the form ``{group: [sample, ...]}``.  With ``self.metafile`` it
        comes from two columns of a delimited table; ``self.column`` names
        them as ``"sample,group"``, where an all-digit entry is taken as a
        1-based column number.  ``cexit`` is called when neither file is
        set.

        Returns ``self.group_info``.
        """

        # create a dictionary of groups <> sample_idx
        if self.groupfile:
            # this is a YAML/JSON file, open with YAML
            import yaml
            # safe_load only builds plain data; yaml.load() without an
            # explicit Loader can construct arbitrary objects and is
            # rejected by current PyYAML releases
            grouping = yaml.safe_load(self.groupfile) or {}
            self.group_info = {
                sample: group
                for group in grouping
                for sample in grouping[group]
            }

        elif self.metafile:
            # this is a tab/comma delimited file
            metadf = pandas.read_csv(self.metafile, sep=self.delimiter)
            sample_column, group_column = self.column.split(',')
            if sample_column.isdigit():
                sample_column = metadf.columns[int(sample_column) - 1]
            if group_column.isdigit():
                group_column = metadf.columns[int(group_column) - 1]

            cerr('[I: reading metafile for column: %s %s]' %
                 (sample_column, group_column))

            # pair the two columns directly rather than indexing each row
            # by integer position; later rows win for repeated samples
            self.group_info = dict(
                zip(metadf[sample_column], metadf[group_column]))

        else:
            cexit('E: need groupfile or metafile')

        return self.group_info
Exemplo n.º 21
0
 def parse_position_header(self):
     """Open the position file and keep its first line as the header.

     The handle returned by ``gzopen`` is stored in ``self.posfile`` and
     the stripped first line in ``self.posfile_header``.  ``cexit`` is
     called when ``self.posfilename`` is empty.
     """
     if not self.posfilename:
         cexit('E: need --posfile')
     handle = gzopen(self.posfilename)
     self.posfile = handle
     first_line = next(handle)
     self.posfile_header = first_line.strip()
Exemplo n.º 22
0
# tabparser

from seqpy import cout, cerr, cexit, gzopen
from seqpy.cmds import arg_parser

from seqpy.core.bioio import grpparser

import numpy as np
import pandas
import attr

# requires scikit-allel
# Only import failures are turned into the friendly exit message; a bare
# except would also swallow KeyboardInterrupt and SystemExit.
try:
    import allel
except ImportError:
    cexit('ERR: require properly installed scikit-allel!')


def init_argparser(p=None):
    """Return an argument parser for genotype files.

    A new parser is created when ``p`` is None.  The group-parser options
    are added through ``grpparser.init_argparser`` before ``--posfile``,
    ``--includepos`` and the positional ``infile``.
    """

    parser = arg_parser('Genotype file parser') if p is None else p
    parser = grpparser.init_argparser(parser)

    parser.add_argument('--posfile', default=None)
    parser.add_argument('--includepos', default='')
    parser.add_argument('infile')

    return parser
Exemplo n.º 23
0
Arquivo: lkest.py Projeto: trmznt/pys
 def __init__(self, seed=None, guide_tree=None, min_fst = 0.9):
     """Initialise the selector.

     seed is forwarded to the parent initializer; guide_tree is required
     (cexit() is called when it is None); min_fst is stored as given.
     """
     super().__init__(seed)
     if guide_tree is None:
         cexit('[E - HierarchicalFSTSelector requires guide tree]')
     self.guide_tree, self.min_fst = guide_tree, min_fst
Exemplo n.º 24
0
from seqpy import cout, cerr, cexit
from seqpy.cmds import arg_parser

from itertools import cycle
import numpy as np

# Only import failures are turned into the friendly exit message; a bare
# except would also swallow KeyboardInterrupt and SystemExit.
try:
    import pandas
except ImportError:
    cexit('ERR: require proper pandas installation [pip3 install pandas]')


def init_argparser(p=None):
    """Return an argument parser carrying the group-file options.

    A new parser is created when ``p`` is None.  Adds ``--groupfile``,
    ``--metafile`` and ``--colourfile`` (all defaulting to the empty
    string) and ``--column`` (default ``'1,2'``).
    """

    parser = arg_parser('Group file parser') if p is None else p

    option_defaults = (
        ('--groupfile', ''),
        ('--metafile', ''),
        ('--colourfile', ''),
        ('--column', '1,2'),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)

    return parser


# TODO: need to provide mechanism to select colour scheme

# 12 colours from ColorBrewer2
colour_list = [
    '#1f78b4', '#33a02c', '#e31a1c', '#ff7f00', '#6a3d9a', '#b15928',
    '#a6cee3', '#b2df8a', '#fb9a99', '#fdbf6f', '#cab2d6', '#ffff99'