def groupinfo(args):
    """Print how many samples fall into each group, followed by the total.

    The sample list is obtained according to ``args.fmt``:

    * ``'pickle'`` / ``'npy'``: taken from the ``samples`` attribute of a
      ``NAltLineParser`` built on ``args.infile`` (group and position
      parsing disabled).
    * ``'list'``: every whitespace-separated token of the whole file.
    * anything else: the whitespace-separated tokens of the first line.

    The samples are then passed to ``GroupParser.assign_groups`` and one line
    per group (sorted by group key) is written with ``cout``.
    """
    fmt = args.fmt
    if fmt in ['pickle', 'npy']:
        # imported here, only when this input format is requested
        from seqpy.core.bioio import naltparser
        from types import SimpleNamespace

        parser_args = SimpleNamespace(infile=args.infile, fmt=fmt, n=-1)
        samples = naltparser.NAltLineParser(
            parser_args, with_group=False, with_position=False
        ).samples
    else:
        with gzopen(args.infile) as handle:
            if fmt == 'list':
                # a sample list: the whole file is sample names
                samples = handle.read().split()
            else:
                # a data file: only the header line holds sample names
                samples = handle.readline().strip().split()

    groups = grpparser.GroupParser(args).assign_groups(samples)

    cout('Groups:')
    grand_total = 0
    for name in sorted(groups.keys()):
        size = len(groups[name])
        grand_total += size
        cout(' %3d - %s' % (size, name))
    cout('Total: %d samples' % grand_total)
def __init__(self, args, datatype='nalt', with_group=True, with_position=True):
    """Store parser settings and trigger ``self.parse_samples()``.

    Args:
        args: namespace providing ``infile``, ``fmt`` and ``n``; it is also
            handed to ``GroupParser`` / ``PositionParser`` when those are
            enabled.
        datatype: ``'nalt'`` selects ``np.int8`` as the element type; any
            other value selects ``np.float64``.
        with_group: when true, build a ``GroupParser`` from ``args``;
            otherwise ``self.group_parser`` is ``None``.
        with_position: when true, build a ``PositionParser`` from ``args``;
            otherwise ``self.position_parser`` is ``None``.
    """
    self.group_parser = grpparser.GroupParser(args) if with_group else None
    self.position_parser = PositionParser(args) if with_position else None

    self.infile = args.infile
    self.fmt = args.fmt
    self.n = args.n

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24); ``np.float64`` is the dtype it resolved to.
    self.dtype = np.int8 if datatype == 'nalt' else np.float64

    # Convert one tab-delimited text line into a 1-D array of ``self.dtype``.
    # The previous converter could never run: it referenced an undefined name
    # ``dtype``, passed a ``delimiter`` keyword that ``np.fromfile`` does not
    # accept, and gave it a StringIO (which has no file descriptor).
    # The line is stripped first so a trailing newline is not reported as
    # unparsed data by the text-mode parser.
    self.convert_data = lambda line: np.fromstring(
        line.strip(), dtype=self.dtype, sep='\t')

    self.df = None
    self.M = None
    self.samples = None
    self.parse_samples()
def pcoa(args):
    """Run a PCoA on a distance matrix and save scatter plots of PC1-PC3.

    ``args.infile`` is read as a tab-delimited matrix preceded by one header
    line of whitespace-separated sample names.  The three pairwise
    combinations of the first three coordinate axes are drawn side by side
    with ``make_plot`` (coloured by ``group_parser.colour_list()``) and the
    figure is written to ``args.outfile``.
    """
    cerr('I: reading group info')
    group_parser = grpparser.GroupParser(args)
    group_parser.parse()

    with open(args.infile, 'rb') as handle:
        cerr('I: reading sample header...')
        header = next(handle)
        samples = header.decode('UTF-8').strip().split()
        # The returned mapping is not needed here; the call is kept because
        # colour_list() below presumably relies on the assignment it makes.
        group_parser.assign_groups(samples)

        cerr('I: reading distance matrix')
        # the header line has been consumed, so only matrix rows remain
        distm = np.loadtxt(handle, delimiter='\t')

    # first element of the result holds the coordinates that get plotted
    coords = allel.pcoa(distm)[0]

    fig = plt.figure(figsize=(27, 9), dpi=args.dpi)
    colour_list = group_parser.colour_list()
    axis_pairs = combinations([0, 1, 2], 2)
    for panel, (pcx, pcy) in enumerate(axis_pairs, start=1):
        ax = fig.add_subplot(1, 3, panel)
        make_plot(ax, coords[:, pcx], coords[:, pcy], colour_list, args.dotsize)

    fig.tight_layout()
    fig.savefig(args.outfile)
def grp2anno(args):
    """Write per-sample and per-group colour annotation tables.

    Sample names come from the first line of ``args.infile``, split on
    ``args.delimiter`` (whitespace when it is ``None``).  Two tab-delimited
    files are produced:

    * ``<outfile>.indv.txt``  - columns SAMPLE, COLOUR
    * ``<outfile>.group.txt`` - columns GROUP, COLOUR; with ``args.s`` set,
      each group label is suffixed with its sample count in parentheses.
    """
    group_parser = grpparser.GroupParser(args)

    with gzopen(args.infile) as infile:
        header = next(infile)
    # split(None) is whitespace splitting, so a None delimiter needs no
    # separate branch
    samples = header.strip().split(args.delimiter)

    groups = group_parser.assign_groups(samples)
    group_keys = sorted(groups.keys())
    colours = group_parser.colour_list()

    with open(args.outfile + '.indv.txt', 'w') as outfile:
        outfile.write('SAMPLE\tCOLOUR\n')
        outfile.writelines('%s\t%s\n' % pair for pair in zip(samples, colours))

    with open(args.outfile + '.group.txt', 'w') as outfile:
        outfile.write('GROUP\tCOLOUR\n')
        group_colours = group_parser.group_colour_list(group_keys)
        for key, colour in zip(group_keys, group_colours):
            label = '%s (%d)' % (key, len(groups[key])) if args.s else key
            outfile.write('%s\t%s\n' % (label, colour))
def seq2fst(args):
    """Compute FST between sample groups of a sequence file.

    Sequences are loaded from ``args.infile`` and bucketed by the group that
    ``GroupParser.group_info`` gives for their (ASCII-decoded) label;
    sequences without a group are reported and skipped.  Group information
    (``args.groupfile`` or ``args.metafile``) is mandatory.

    With ``args.sitefile`` set, site-wise values from ``calc_site_fst`` are
    written to that file, one labelled row each, and nothing else is
    produced.  Otherwise the matrix from ``calc_fst`` is written to
    ``args.outfile`` under a header line of group names.
    """
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue

            bucket = group_seqs.get(grp)
            if bucket is None:
                # first sequence seen for this group
                bucket = group_seqs[grp] = multisequence()
            bucket.append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for name in group_seqs:
        cerr('[I - group %s has %d sample(s)]' % (name, len(group_seqs[name])))

    if not args.sitefile:
        # pairwise FST between groups
        FST_mat, groups = calc_fst(group_seqs)
        with open(args.outfile, 'w') as fout:
            fout.write('\t'.join(groups))
            fout.write('\n')
            np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
        return

    # site-wise FST
    FST_sites = calc_site_fst(group_seqs, args.nantozero)
    with open(args.sitefile, 'w') as fout:
        for label, mat in FST_sites:
            fout.write(label)
            fout.write('\t')
            # newline='\t' keeps all values of one site on a single row
            np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t', newline='\t')
            fout.write('\n')
    cerr('[I - site FST written to %s]' % (args.sitefile))
def vcf2ped(args):
    """Create PED and MAP files from a VCF and group file (for isoRelate).

    ``<outprefix>.ped`` gets one row per sample: group, sample name, the
    fixed columns ``0 0 1 0``, then two allele codes per site.  Calls with
    two equal alleles are coded ``0 0`` for -1, ``1 1`` for 0, ``2 2`` for 1
    and ``1 1`` for any other allele; calls with two different alleles are
    coded ``1 2``.

    ``<outprefix>.map`` gets one row per site: chromosome, ``chrom:pos``,
    the offset from the previous site on the same chromosome (from 0 for the
    first site of a chromosome) multiplied by 1e-6, and the position.
    """
    group_parser = grpparser.GroupParser(args)

    cerr('[I: reading VCF...]')
    t0 = time.monotonic()
    vcfset = allel.read_vcf(
        args.infile,
        fields=['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'],
    )
    elapsed = time.monotonic() - t0
    cerr('[I: read %s site, %s samples in %d secs]' % (len(vcfset['variants/CHROM']), len(vcfset['samples']), elapsed))

    samples = vcfset['samples']
    group_parser.assign_groups(samples)
    # indexed by sample position below
    groups = group_parser.group_keys

    # PED codes for calls whose two alleles are equal, keyed by that allele
    hom_codes = {-1: [0, 0], 0: [1, 1], 1: [2, 2]}
    other_hom = [1, 1]
    het = [1, 2]

    genotypes = vcfset['calldata/GT']
    with open(args.outprefix + '.ped', 'w') as ped:
        for idx, sample in enumerate(samples):
            ped.write('%s\t%s\t0\t0\t1\t0\t' % (groups[idx], sample))
            codes = []
            for first, second in genotypes[:, idx]:
                if first != second:
                    codes.extend(het)
                else:
                    codes.extend(hom_codes.get(first, other_hom))
            ped.write('\t'.join(map(str, codes)))
            ped.write('\n')

    with open(args.outprefix + '.map', 'w') as mapfile:
        prev_chrom = None
        prev_pos = 0
        for chrom, pos in zip(vcfset['variants/CHROM'], vcfset['variants/POS']):
            if chrom != prev_chrom:
                # new chromosome: offsets restart from zero
                prev_chrom, prev_pos = chrom, 0
            offset = (pos - prev_pos) * 1e-6
            prev_pos = pos
            mapfile.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, offset, pos))
def dist2popdist(args):
    """Collapse a sample distance matrix into group-level matrices.

    ``args.infile`` is a tab-delimited matrix whose header row names the
    samples.  For every pair of groups (a group paired with itself included)
    the mean of ``D[x, y]`` over all member pairs ``(x, y)`` is stored in
    ``M``.  ``P`` then holds, for each pair of distinct groups,
    ``M[i, j] - 0.5 * (M[i, i] + M[j, j])``; its diagonal stays zero.

    ``M`` is written to ``<outfile>.popdxy.txt`` and ``P`` to
    ``<outfile>.popdist.txt``, each under a header line of sorted group keys.
    """
    group_parser = grpparser.GroupParser(args)

    df = pd.read_csv(args.infile, sep='\t')
    D = df.values
    groups = group_parser.assign_groups(df.columns)

    group_keys = sorted(groups.keys())
    n = len(groups)
    # member indices of each group, in the same order as group_keys
    members = [groups[key] for key in group_keys]

    # mean distance over every (x, y) member pairing of two groups
    M = np.zeros((n, n))
    for a, b in combinations_with_replacement(range(n), 2):
        total = count = 0
        for x, y in product(members[a], members[b]):
            total += D[x, y]
            count += 1
        M[a, b] = M[b, a] = total / count

    # between-group mean minus the average of the two within-group means
    P = np.zeros((n, n))
    for a, b in combinations(range(n), 2):
        within = 0.5 * (M[a, a] + M[b, b])
        P[a, b] = P[b, a] = M[a, b] - within

    header = '%s\n' % '\t'.join(group_keys)
    for suffix, matrix in (('.popdxy.txt', M), ('.popdist.txt', P)):
        with open(args.outfile + suffix, 'wt') as outfile:
            outfile.write(header)
            np.savetxt(outfile, matrix, delimiter='\t', fmt='%4.3f')
def seq2pi(args):
    """Report the ``calc_pi`` result (average, stddev) for each sample group.

    Sequences are loaded from ``args.infile``.  When ``args.groupfile`` or
    ``args.metafile`` is given, sequences are bucketed by the group that
    ``GroupParser.group_info`` gives for their (ASCII-decoded) label, and
    sequences without a group are reported and skipped; otherwise all
    sequences form the single group ``'ALL'``.

    One line per group is printed with ``cout``.  If ``args.outfile`` is set,
    the same values are also written there as a tab-delimited table with the
    columns GROUP, N, PI, STDDEV.
    """
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # use groupfile/metafile to split the sequences if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue

            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')

    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf is not None:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')

        for g, members in group_seqs.items():
            avg, stddev = calc_pi(members)
            cout(' %20s [%3d]: %f +- %f' % (g, len(members), avg, stddev))
            if outf is not None:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n' % (g, len(members), avg, stddev))
    finally:
        # the handle used to be left open; close it even if calc_pi fails so
        # whatever was written so far is flushed to disk
        if outf is not None:
            outf.close()

    if outf is not None:
        cerr('[I - result written to %s' % args.outfile)
def __init__(self, args):
    """Open the genotype input, load the optional position filter, and
    call ``self.parse_sample()``.

    Args:
        args: namespace providing ``infile`` (opened with ``gzopen`` in text
            mode), ``posfile`` (stored as ``self.posfilename``) and
            ``includepos`` (optional path of a tab-delimited file with one
            header line; the first two columns of every following row are
            registered as a key in ``self.include_positions``).  ``args`` is
            also handed to ``GroupParser``.
    """
    self.group_parser = grpparser.GroupParser(args)
    self.infile = gzopen(args.infile, 'rt')
    self.posfilename = args.posfile

    self.position = None
    self.posfile_header = None
    self.posfile = None
    self.sample_header = None
    self.samples = None

    # read included positions, keyed by the first two columns as strings
    self.include_positions = {}
    if args.includepos:
        with open(args.includepos) as infile:
            # skip the header line; the default keeps an empty file from
            # raising StopIteration
            next(infile, None)
            for line in infile:
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                tokens = line.split('\t')
                self.include_positions[(tokens[0], tokens[1])] = True

    # need to read header of genotype
    self.parse_sample()
def consolidate_predictions(args):
    """Compare pickled model predictions against the known sample groups.

    ``args.infile`` is a pickle holding ``predictions[model][k]``; each entry
    is turned into a DataFrame by ``generate_dataframe`` and the column with
    the highest value in a row is taken as that row's predicted group.  Every
    row whose best value is below ``args.threshold``, or whose predicted
    group differs from ``group_parser.group_keys[i]``, is printed with
    ``cout``.

    When ``args.outreport`` is set, a dict keyed by ``'<model>|<k>'`` holding
    the ``lkmodels.calculate_scores`` result and the row-normalised confusion
    matrix is pickled to that path.

    NOTE(review): without ``args.samplefile``, ``samples`` is ``None`` and is
    still passed to ``assign_groups`` and indexed when reporting a mismatch -
    confirm whether a sample file is effectively mandatory.
    """
    samples = read_samplefile(args.samplefile, args.fmt) if args.samplefile else None

    group_parser = grpparser.GroupParser(args)
    group_parser.assign_groups(samples)
    # group_parser.group_keys contains ['grp1', 'grp2', etc]; it is indexed
    # by row number below
    group_keys = group_parser.group_keys

    # NOTE(review): unpickling can execute arbitrary code; only feed this
    # prediction files from a trusted source.
    with open(args.infile, 'rb') as f:
        predictions = pickle.load(f)

    # Opened before the analysis (as before) so an unwritable report path
    # fails early; the handle is now closed in the ``finally`` below instead
    # of being left open.
    outreport = open(args.outreport, 'wb') if args.outreport else None
    try:
        from sklearn.metrics import confusion_matrix

        reports = {}
        normalize = True

        for model in predictions:
            model_pred = predictions[model]
            for k in model_pred:
                cerr('Preparing for model: {} k: {}'.format(model, k))
                df = generate_dataframe(model_pred[k])
                values = df.values
                columns = df.columns.values

                group_indexes = np.argmax(values, axis=1)
                # Same (n, 1) column of labels as before, but taken from the
                # underlying ndarray: pandas >= 2.0 refuses a 2-D indexer on
                # an Index.
                group_predictions = columns[group_indexes[:, None]]

                for i, best in enumerate(group_indexes):
                    predicted_group = columns[best]
                    prediction_confidence = values[i, best]
                    if (prediction_confidence < args.threshold
                            or predicted_group != group_keys[i]):
                        cout('{}: {} -> {} ({})'.format(samples[i], group_keys[i], predicted_group, prediction_confidence))

                if outreport is not None:
                    score = lkmodels.calculate_scores(group_keys, group_predictions)
                    confmat = confusion_matrix(group_keys, group_predictions)
                    if normalize:
                        # divide each row by its sum
                        row_totals = confmat.sum(axis=1)[:, np.newaxis]
                        confmat = confmat.astype('float') / row_totals
                        cerr("[I - Normalized confusion matrix]")
                    else:
                        cerr('[I - Confusion matrix, without normalization]')

                    reports['{}|{}'.format(model, k)] = {
                        'score': score,
                        'confmat': confmat
                    }

        if outreport is not None:
            pickle.dump(reports, outreport)
            cerr('[I - writing pickled report to {}]'.format(args.outreport))
    finally:
        if outreport is not None:
            outreport.close()