def test_matrix2dst(self):
    """Exercise matrix2dst with default taxa, a stamp, padding and tabs."""
    matrix = lingpy.algorithm.squareform([0.5, 0.75, 0.8])
    # explicit labels written in the same format as the default taxa
    taxa = ['t_1', 't_2', 't_3']
    stamp = '# Written with joy.'

    # explicit and default taxa must render identically
    with_taxa = matrix2dst(matrix, taxa=taxa)
    without_taxa = matrix2dst(matrix)
    assert with_taxa == without_taxa

    # ... and still do so when a stamp is passed
    stamped_with_taxa = matrix2dst(matrix, taxa=taxa, stamp=stamp)
    stamped_without_taxa = matrix2dst(matrix, stamp=stamp)
    assert stamped_with_taxa == stamped_without_taxa

    # a larger taxlen must show up as longer runs of spaces
    narrow = matrix2dst(matrix, taxa=taxa, taxlen=20)
    wide = matrix2dst(matrix, taxlen=30)
    assert ' ' * 18 in narrow
    assert ' ' * 28 in wide

    # check for tab-stop output when taxlen is set to 0
    tab_count = matrix2dst(matrix, taxlen=0).count('\t')
    self.assertEqual(tab_count, 9)
def test_matrix2dst():
    """Exercise matrix2dst with default taxa, a stamp, padding and tabs."""
    matrix = squareform([0.5, 0.75, 0.8])
    # explicit labels written in the same format as the default taxa
    taxa = ['t_1', 't_2', 't_3']
    stamp = '# Written with joy.'

    # explicit and default taxa must render identically
    with_taxa = matrix2dst(matrix, taxa=taxa)
    without_taxa = matrix2dst(matrix)
    assert with_taxa == without_taxa

    # ... and still do so when a stamp is passed
    stamped_with_taxa = matrix2dst(matrix, taxa=taxa, stamp=stamp)
    stamped_without_taxa = matrix2dst(matrix, stamp=stamp)
    assert stamped_with_taxa == stamped_without_taxa

    # a larger taxlen must show up as longer runs of spaces
    narrow = matrix2dst(matrix, taxa=taxa, taxlen=20)
    wide = matrix2dst(matrix, taxlen=30)
    assert ' ' * 18 in narrow
    assert ' ' * 28 in wide

    # check for tab-stop output when taxlen is set to 0
    tab_count = matrix2dst(matrix, taxlen=0).count('\t')
    assert tab_count == 9
def _output(self, fileformat, **keywords):
    """
    Internal function that eases its modification by daughter classes.

    Dispatches on ``fileformat`` and hands the wordlist data to the matching
    writer. Handled formats are ``triple``/``triples``/``triples.tsv``,
    ``paps.nex``/``paps.csv``, ``taxa``, ``csv``/``qlc``/``tsv``, ``dst``,
    ``tre``/``nwk``, ``cluster``/``groups``, ``starling``/``star.csv``,
    ``multistate.nex`` and ``separated``.

    Parameters
    ----------
    fileformat : str
        Name of the output format; also used as default for the
        ``fileformat`` keyword.
    **keywords
        Options forwarded to the writers. Missing options are filled with
        the defaults set below. ``stamp`` is always overwritten with the
        instance's ``_stamp`` attribute (empty string when absent).

    Returns
    -------
    Whatever the selected writer returns. ``'separated'`` and unrecognised
    formats fall through and return ``None``.

    Raises
    ------
    ValueError
        When a subset is requested and ``cols`` is not a list, tuple or
        bool, or ``rows`` is not a dict or bool.
    """
    # check for stamp attribute
    keywords["stamp"] = getattr(self, '_stamp', '')

    # add the default parameters, they will be checked against the keywords
    # NOTE(review): prettify='false' is a non-empty string and therefore
    # truthy wherever it is tested with a plain ``if`` -- confirm intended.
    util.setdefaults(
        keywords,
        cols=False,
        distances=False,
        entries=("concept", "counterpart"),
        entry='concept',
        fileformat=fileformat,
        filename=rcParams['filename'],
        formatter='concept',
        modify_ref=False,
        meta=self._meta,
        missing=0,
        prettify='false',
        ignore='all',
        ref='cogid',
        rows=False,
        subset=False,  # setup a subset of the data,
        taxa='taxa',
        threshold=0.6,  # threshold for flat clustering
        tree_calc='neighbor')

    if fileformat in ['triple', 'triples', 'triples.tsv']:
        return tsv2triple(self, keywords['filename'] + '.' + fileformat)

    if fileformat in ['paps.nex', 'paps.csv']:
        paps = self.get_paps(
            ref=keywords['ref'],
            entry=keywords['entry'],
            missing=keywords['missing'])
        kw = dict(filename=keywords['filename'] + '.paps')
        if fileformat == 'paps.nex':
            # only the nexus writer receives the missing-data symbol
            kw['missing'] = keywords['missing']
            return pap2nex(self.cols, paps, **kw)
        return pap2csv(self.cols, paps, **kw)

    # simple printing of taxa
    if fileformat == 'taxa':
        assert hasattr(self, 'taxa')
        return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

    # csv-output
    if fileformat in ['csv', 'qlc', 'tsv']:
        # get the header line, ordered by column index
        header = sorted(
            [s for s in set(self._alias.values()) if s in self._header],
            key=lambda x: self._header[x])
        header = [h.upper() for h in header]

        self._meta.setdefault('taxa', self.cols)

        # get the data, in case a subset is chosen
        if not keywords['subset']:
            # write stuff to file
            return wl2qlc(header, self._data, **keywords)

        cols, rows = keywords['cols'], keywords['rows']

        if not isinstance(cols, (list, tuple, bool)):
            raise ValueError("[i] Argument 'cols' should be list or tuple.")
        if not isinstance(rows, (dict, bool)):
            raise ValueError("[i] Argument 'rows' should be a dictionary.")

        # check for chosen header
        if cols:
            # get indices for header
            indices = [self._header[x] for x in cols]
            header = [c.upper() for c in cols]
        else:
            indices = list(range(len(self.header)))

        # Build the row filter once; it does not change between rows.
        condition = None
        if rows:
            stmts = []
            for key, value in rows.items():
                if key == 'ID':
                    stmts += ["key " + value]
                else:
                    idx = self._header[key]
                    stmts += ["line[{0}] ".format(idx) + value]
            condition = " and ".join(stmts)

        log.debug("calculated what should be excluded")

        # get the data
        out = {}
        # The loop variables MUST stay named ``key`` and ``line``: the
        # evaluated condition refers to them by name.
        for key, line in self._data.items():
            log.debug(key)

            if rows:
                # SECURITY: ``rows`` values are evaluated as Python code.
                # Never pass conditions that come from untrusted input.
                if eval(condition):
                    out[key] = [line[i] for i in indices]
            else:
                out[key] = [line[i] for i in indices]

        log.debug("passing data to wl2qlc")
        return wl2qlc(header, out, **keywords)

    # output dst-format (phylip)
    if fileformat == 'dst':
        # check for distances as keyword
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self, **keywords)

        out = matrix2dst(
            self._meta['distances'],
            self.taxa,
            stamp=keywords['stamp'],
            taxlen=keywords.get('taxlen', 0))
        return _write_file(keywords['filename'], out, fileformat)

    # output tre-format (newick)
    if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
        if 'tree' not in self._meta:
            # check for distances
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)
            # we look up a function to calculate a tree in the cluster module:
            tree = getattr(cluster, keywords['tree_calc'])(
                self._meta['distances'],
                self.cols,
                distances=keywords['distances'])
        else:
            tree = self._meta['tree']

        return _write_file(
            keywords['filename'], '{0}'.format(tree), fileformat)

    if fileformat in ['cluster', 'groups']:
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self)

        # check for keywords
        if 'groups' not in self._meta:
            self._meta['groups'] = cluster.matrix2groups(
                keywords['threshold'], self._meta['distances'], self.taxa)
        lines = [
            '{0}\t{1}'.format(taxon, group)
            for taxon, group in sorted(
                self._meta['groups'].items(), key=lambda x: x[0])]
        return _write_file(keywords['filename'], lines, fileformat)

    if fileformat in ['starling', 'star.csv']:

        def _dash(value):
            """Return '-' for cells equal to 0, the cell itself otherwise."""
            return '-' if value == 0 else value

        lines = []
        if 'cognates' not in keywords:
            lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
            for i, concept in enumerate(self.concepts):
                for line in self.get_list(
                        row=concept, entry=keywords['entry']):
                    lines.append(
                        str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                            [_dash(t) for t in line]))
        else:
            # one extra " COG" column per taxon for the cognate values
            lines.append(
                'ID\tConcept\t' + '\t'.join(
                    ['{0}\t COG'.format(t) for t in self.taxa]))
            for i, concept in enumerate(self.concepts):
                cogs = self.get_list(row=concept, entry=keywords['cognates'])
                for j, line in enumerate(
                        self.get_list(row=concept, entry=keywords['entry'])):
                    part = '\t'.join(
                        '{0}\t{1}'.format(_dash(a), b)
                        for a, b in zip(line, cogs[j]))
                    lines.append(util.tabjoin(i + 1, concept, part))

        return _write_file(
            keywords['filename'],
            lines,
            'starling_' + keywords['entry'] + '.csv')

    if fileformat == 'multistate.nex':
        if not keywords['filename'].endswith('.multistate.nex'):
            keywords['filename'] += '.multistate.nex'

        matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
        return multistate2nex(self.taxa, matrix, keywords['filename'])

    if fileformat == 'separated':
        # here ``filename`` names a directory holding one file per taxon
        if not os.path.isdir(keywords['filename']):
            os.mkdir(keywords['filename'])

        for taxon in self.cols:
            # the presence of 'ignore_keys' drops the ID column
            lines = [''] if 'ignore_keys' in keywords else ['ID\t']
            lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
            for key in self.get_list(col=taxon, flat=True):
                line = [] if 'ignore_keys' in keywords else [key]
                for entry in keywords['entries']:
                    tmp = self[key, entry]
                    if isinstance(tmp, list):
                        tmp = ' '.join([str(x) for x in tmp])
                    line += [tmp]
                lines.append('\t'.join('{0}'.format(x) for x in line))
            _write_file(
                '{0}/{1}'.format(keywords['filename'], taxon), lines, 'tsv')
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : list
        Column names, written after the leading ``ID`` column.
    data : dict
        Maps each row key to its list of cell values.
    filename : str
        Output path without extension; falls back to ``rcParams['filename']``
        when empty.
    formatter : str
        A column name (upper-cased before lookup), or two header names
        joined by a comma. Rows are sorted by that column or columns, and
        when ``prettify`` is truthy a ``#`` line is written whenever the
        value of the (first) column changes.
    **keywords
        ``ignore`` (list of section names or ``'all'``), ``fileformat``
        (file extension), ``prettify``, ``meta`` (dict of metadata) and
        ``stamp`` (text appended at the end) are read here.

    Returns
    -------
    None. The text is written to ``filename + '.' + fileformat``.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)

    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json'
        ]
    ignore = keywords['ignore']
    prettify = keywords['prettify']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # Collect output fragments and join once at the end instead of
    # repeatedly extending one growing string.
    parts = []
    if prettify:
        parts.append('# Wordlist\n')

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = ''
    distances = ''
    taxa = ''  # never filled in below, so the TAXA section stays unwritten
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in ignore:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            # This branch must stay unconditional so that an ignored matrix
            # does not fall through to the JSON branch; only the rendering
            # is skipped when the section will not be written anyway.
            if k not in ignore:
                distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in ignore:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in ignore:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in ignore:
        if prettify:
            parts.append('\n# META\n')
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            parts.append('@{0}:{1}\n'.format(k, v))
    if taxa and keywords['taxa']:
        parts.append('\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n')
    if jsonpairs and 'json' not in ignore:
        parts.append("@json: " + json.dumps(jsonpairs) + '\n')
    if msapairs and 'msa' not in ignore:
        for ref in msapairs:
            parts.append("\n# MSA reference: {0}\n".format(ref))
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    parts.append('#\n<msa ')
                    parts.append(
                        'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                            k, ref, ' '.join(v['consensus'])))
                else:
                    parts.append(
                        '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref))
                parts.append(msa2str(v, wordlist=True))
                parts.append("</msa>\n")
    if distances and 'distances' not in ignore:
        parts.append('\n# DISTANCES\n<dst>\n')
        parts.append(distances + '</dst>\n')
    if trees:
        parts.append('\n# TREES\n' + trees)
    if scorer and 'scorer' not in ignore:
        parts.append('\n# SCORER\n' + scorer)

    if prettify:
        parts.append('\n# DATA\n')
    parts.append('ID\t' + '\t'.join(header) + '\n')

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(
            data.keys(), key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        # NOTE(review): False compares equal to 0, so the membership test in
        # the loop below still succeeds for non-empty rows and the first
        # column ends up driving the '#' separators. Kept for backward
        # compatibility -- confirm whether that is intended.
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                if prettify:
                    parts.append('#\n')
                formatter = line[idx]

        # add the key
        parts.append(str(key))

        # add the rest of the values
        for value in line:
            # Exact type checks are deliberate: subclasses (e.g. bool) keep
            # going through the generic formatting branch as before.
            value_type = type(value)
            if value_type is list:
                try:
                    parts.append('\t' + ' '.join(value))
                except TypeError:
                    # the list holds non-string items
                    parts.append('\t' + ' '.join([str(v) for v in value]))
            elif value_type is int:
                parts.append('\t' + str(value))
            elif value_type is float:
                parts.append('\t{0:.4f}'.format(value))
            elif value is None:
                parts.append('\t')
            else:
                parts.append('\t{:}'.format(value))
        parts.append('\n')

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        ''.join(parts) + keywords.get('stamp', ''),
        normalize="NFC")
def wl2qlc(
        header,
        data,
        filename='',
        formatter='concept',
        **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : list
        Column names, written after the leading ``ID`` column.
    data : dict
        Maps each row key to its list of cell values.
    filename : str
        Output path without extension; falls back to ``rcParams['filename']``
        when empty.
    formatter : str
        A column name (upper-cased before lookup), or two header names
        joined by a comma. Rows are sorted by that column or columns, and
        when ``prettify`` is truthy a ``#`` line is written whenever the
        value of the (first) column changes.
    **keywords
        ``ignore`` (list of section names or ``'all'``), ``fileformat``
        (file extension), ``prettify``, ``meta`` (dict of metadata) and
        ``stamp`` (text appended at the end) are read here.

    Returns
    -------
    None. The text is written to ``filename + '.' + fileformat``.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)

    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']
    ignore = keywords['ignore']
    prettify = keywords['prettify']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # Collect output fragments and join once at the end instead of
    # repeatedly extending one growing string.
    parts = []
    if prettify:
        parts.append('# Wordlist\n')

    # write meta to file
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = ''
    distances = ''
    taxa = ''  # never filled in below, so the TAXA section stays unwritten
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in ignore:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            # This branch must stay unconditional so that an ignored matrix
            # does not fall through to the JSON branch; only the rendering
            # is skipped when the section will not be written anyway.
            if k not in ignore:
                distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in ignore:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in ignore:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in ignore:
        if prettify:
            parts.append('\n# META\n')
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            parts.append('@{0}:{1}\n'.format(k, v))
    if taxa and keywords['taxa']:
        parts.append('\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n')
    if jsonpairs and 'json' not in ignore:
        parts.append("@json: " + json.dumps(jsonpairs) + '\n')
    if msapairs and 'msa' not in ignore:
        for ref in msapairs:
            parts.append("\n# MSA reference: {0}\n".format(ref))
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    parts.append('#\n<msa ')
                    parts.append(
                        'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                            k, ref, ' '.join(v['consensus'])))
                else:
                    parts.append(
                        '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref))
                parts.append(msa2str(v, wordlist=True))
                parts.append("</msa>\n")
    if distances and 'distances' not in ignore:
        parts.append('\n# DISTANCES\n<dst>\n')
        parts.append(distances + '</dst>\n')
    if trees:
        parts.append('\n# TREES\n' + trees)
    if scorer and 'scorer' not in ignore:
        parts.append('\n# SCORER\n' + scorer)

    if prettify:
        parts.append('\n# DATA\n')
    parts.append('ID\t' + '\t'.join(header) + '\n')

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(
            data.keys(), key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        # NOTE(review): False compares equal to 0, so the membership test in
        # the loop below still succeeds for non-empty rows and the first
        # column ends up driving the '#' separators. Kept for backward
        # compatibility -- confirm whether that is intended.
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                if prettify:
                    parts.append('#\n')
                formatter = line[idx]

        # add the key
        parts.append(text_type(key))

        # add the rest of the values
        for value in line:
            # Exact type checks are deliberate: subclasses (e.g. bool) keep
            # going through the generic formatting branch as before.
            value_type = type(value)
            if value_type is list:
                try:
                    parts.append('\t' + ' '.join(value))
                except TypeError:
                    # the list holds non-string items
                    parts.append(
                        '\t' + ' '.join([text_type(v) for v in value]))
            elif value_type is int:
                parts.append('\t' + text_type(value))
            elif value_type is float:
                parts.append('\t{0:.4f}'.format(value))
            elif value is None:
                parts.append('\t')
            else:
                parts.append('\t{:}'.format(value))
        parts.append('\n')

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        ''.join(parts) + keywords.get('stamp', ''),
        normalize="NFC")