def test_nwk2tree_matrix():
    """Check that ``nwk2tree_matrix`` reports the same taxa as ``Tree``.

    A fixed five-leaf Newick string is converted with
    ``tree.nwk2tree_matrix`` and the taxa it returns are compared with the
    ``taxa`` attribute of a ``Tree`` built from the same string.
    """
    newick = '(((a,b),(c,d)),e);'
    # only the taxa are inspected; the matrix itself is not checked here
    _, found_taxa = tree.nwk2tree_matrix(newick)
    reference = Tree(newick)
    assert found_taxa == reference.taxa
def test_nwk2tree_matrix(self):
    """Check that ``nwk2tree_matrix`` reports the same taxa as the fixture tree.

    Uses the ``newick`` string and the ``tree`` object stored on the test
    instance; only the returned taxa are compared, the matrix is ignored.
    """
    result = tree.nwk2tree_matrix(self.newick)
    _, found_taxa = result
    assert found_taxa == self.tree.taxa
def _heatmap_matrix_given(candidate):
    """Return True if *candidate* is a usable, non-empty user-supplied matrix."""
    # A plain truth test raises ValueError for multi-element numpy arrays, so
    # sized objects are judged by their length instead.
    try:
        return len(candidate) > 0
    except TypeError:
        # unsized values such as the default ``False`` (or ``None``)
        return bool(candidate)


def _heatmap_shared_cognates(wordlist, taxon_a, taxon_b, ref, normalized):
    """Return the (optionally normalized) number of cognates two taxa share."""
    if normalized == "swadesh":
        cogs_a = wordlist.get_dict(taxa=taxon_a, entry=ref)
        cogs_b = wordlist.get_dict(taxa=taxon_b, entry=ref)
        shared = 0
        slots = 0
        # iterate over cognate sets in meaning slots; we follow the STARLING
        # procedure in ignoring slots that are missing in one of the taxa
        for key in cogs_a:
            if key in cogs_b:
                if any(cog in cogs_b[key] for cog in cogs_a[key]):
                    shared += 1
                slots += 1
        if not slots:
            log.warning(
                str([shared, slots, len(cogs_a), len(cogs_b), taxon_a,
                     taxon_b]))
            return 0.0
        return shared / slots

    # raw counts (no normalization) and "jaccard" both work on flat ID sets
    cogs_a = set(wordlist.get_list(taxa=taxon_a, flat=True, entry=ref))
    cogs_b = set(wordlist.get_list(taxa=taxon_b, flat=True, entry=ref))
    shared = len(cogs_a.intersection(cogs_b))
    if not normalized:
        return shared
    union = len(cogs_a.union(cogs_b))
    if not union:
        # two taxa without any cognate IDs: treat like the empty "swadesh"
        # case instead of failing with a ZeroDivisionError
        log.warning(
            str([shared, union, len(cogs_a), len(cogs_b), taxon_a, taxon_b]))
        return 0.0
    return shared / union


def plot_heatmap(wordlist, filename="heatmap", fileformat="pdf", ref='cogid',
                 normalized=False, refB='', **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written. The matrix is
        additionally written to ``filename + '.matrix'``.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to ``False``, don't normalize the data and store raw counts of
        shared cognate IDs. Otherwise, select the normalization method,
        choose between:

        * "jaccard" for shared cognate IDs divided by the union of cognate
          IDs of both taxa (see :evobib:`Bategelj1995` for details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Name of a second cognate column. If given, the cells below the
        diagonal are computed from this column, while the cells above the
        diagonal are computed from `ref`. If empty, the matrix is symmetric.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        Controls how many taxon names are written to the axes: the distance
        between two labels is ``len(taxa) / steps`` (rounded, at least 1).
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    colorbar_label : str (default="Shared Cognates")
        The label of the colorbar.
    figsize : tuple (default=(10, 5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    show_tree : bool (default=True)
        If True, the tree is drawn as a dendrogram next to the heatmap and
        determines the order of the taxa. If False, the order is taken from
        the ``taxa`` attribute of the tree (a plain Newick string presumably
        does not provide it -- confirm against callers).
    matrix : list or array (default=False)
        A precomputed matrix that is used instead of computing the shared
        cognates. If `distances` is set, it is modified in place.
    distances : bool (default=False)
        If True, every cell ``x`` of the matrix is replaced by ``1 - x``.
    labels : dict (default={})
        Replacements for the taxon names written to the axes.
    vmin, vmax : float (default=0.0, 1.0)
        The value range of the color scale.

    Raises
    ------
    ValueError
        If no tree is passed and none can be read from the wordlist, or if
        `normalized` is neither false nor one of "jaccard" and "swadesh".

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist, it determines the order of
    # the taxa
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ("jaccard", "swadesh"):
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix: integer counts if nothing is normalized
    matrix = np.zeros(
        (wordlist.width, wordlist.width),
        dtype=float if normalized else int)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'],
            keywords['bottom'],
            0.25 * keywords['width'],
            keywords['height'],
        ])
        dendrogram = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # use the leaf order of the dendrogram (reversed) for the heatmap
        taxa = dendrogram['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if _heatmap_matrix_given(keywords['matrix']):
        matrix = keywords['matrix']
    else:
        for i, taxon_a in enumerate(taxa):
            for j, taxon_b in enumerate(taxa):
                if i < j:
                    shared = _heatmap_shared_cognates(
                        wordlist, taxon_a, taxon_b, ref, normalized)
                    matrix[i][j] = shared
                    # without refB the matrix is symmetric
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    # the lower triangle shows the second cognate column
                    matrix[i][j] = _heatmap_shared_cognates(
                        wordlist, taxon_a, taxon_b, refB, normalized)
                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxon_a, flat=True, entry=ref)
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes([
        left,  # keywords['left']+0.25 * keywords['width']+0.05,
        keywords['bottom'],
        keywords['width'],
        keywords['height'],
    ])

    if keywords['distances']:
        for i, line in enumerate(matrix):
            for j in range(len(line)):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']],
    ]

    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin'])

    # set the xticks; the interval must be at least 1, otherwise range()
    # fails for taxon sets that are small compared to keywords['steps']
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    # modify taxon names if this is specified
    selected_taxa = [
        keywords['labels'][taxa[idx]]
        if taxa[idx] in keywords['labels'] else taxa[idx]
        for idx in idxs
    ]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(idxs, selected_taxa, size=keywords['textsize'],
               rotation=keywords['xrotation'], rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(left=keywords['left'], right=keywords['right'],
                        top=keywords['top'], bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # write the matrix: one row per taxon, cells separated by tabs
    with open(filename + '.matrix', 'w') as handle:
        for i, taxon in enumerate(taxa):
            handle.write('{0:20}'.format(taxon))
            for cell in matrix[i]:
                if not normalized:
                    handle.write('\t{0:3}'.format(int(cell)))
                else:
                    handle.write('\t{0:.2f}'.format(cell))
            handle.write('\n')
    log.file_written(filename + '.' + fileformat)
def _heatmap_matrix_given(candidate):
    """Return True if *candidate* is a usable, non-empty user-supplied matrix."""
    # A plain truth test raises ValueError for multi-element numpy arrays, so
    # sized objects are judged by their length instead.
    try:
        return len(candidate) > 0
    except TypeError:
        # unsized values such as the default ``False`` (or ``None``)
        return bool(candidate)


def _heatmap_shared_cognates(wordlist, taxon_a, taxon_b, ref, normalized):
    """Return the (optionally normalized) number of cognates two taxa share."""
    if normalized == "swadesh":
        cogs_a = wordlist.get_dict(taxa=taxon_a, entry=ref)
        cogs_b = wordlist.get_dict(taxa=taxon_b, entry=ref)
        shared = 0
        slots = 0
        # iterate over cognate sets in meaning slots; we follow the STARLING
        # procedure in ignoring slots that are missing in one of the taxa
        for key in cogs_a:
            if key in cogs_b:
                if any(cog in cogs_b[key] for cog in cogs_a[key]):
                    shared += 1
                slots += 1
        if not slots:
            log.warning(
                str([shared, slots, len(cogs_a), len(cogs_b), taxon_a,
                     taxon_b]))
            return 0.0
        return shared / slots

    # raw counts (no normalization) and "jaccard" both work on flat ID sets
    cogs_a = set(wordlist.get_list(taxa=taxon_a, flat=True, entry=ref))
    cogs_b = set(wordlist.get_list(taxa=taxon_b, flat=True, entry=ref))
    shared = len(cogs_a.intersection(cogs_b))
    if not normalized:
        return shared
    union = len(cogs_a.union(cogs_b))
    if not union:
        # two taxa without any cognate IDs: treat like the empty "swadesh"
        # case instead of failing with a ZeroDivisionError
        log.warning(
            str([shared, union, len(cogs_a), len(cogs_b), taxon_a, taxon_b]))
        return 0.0
    return shared / union


def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written. The matrix is
        additionally written to ``filename + '.matrix'``.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to ``False``, don't normalize the data and store raw counts of
        shared cognate IDs. Otherwise, select the normalization method,
        choose between:

        * "jaccard" for shared cognate IDs divided by the union of cognate
          IDs of both taxa (see :evobib:`Bategelj1995` for details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Name of a second cognate column. If given, the cells below the
        diagonal are computed from this column, while the cells above the
        diagonal are computed from `ref`. If empty, the matrix is symmetric.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        Controls how many taxon names are written to the axes: the distance
        between two labels is ``len(taxa) / steps`` (rounded, at least 1).
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    colorbar_label : str (default="Shared Cognates")
        The label of the colorbar.
    figsize : tuple (default=(10, 5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    show_tree : bool (default=True)
        If True, the tree is drawn as a dendrogram next to the heatmap and
        determines the order of the taxa. If False, the order is taken from
        the ``taxa`` attribute of the tree (a plain Newick string presumably
        does not provide it -- confirm against callers).
    matrix : list or array (default=False)
        A precomputed matrix that is used instead of computing the shared
        cognates. If `distances` is set, it is modified in place.
    distances : bool (default=False)
        If True, every cell ``x`` of the matrix is replaced by ``1 - x``.
    labels : dict (default={})
        Replacements for the taxon names written to the axes.
    vmin, vmax : float (default=0.0, 1.0)
        The value range of the color scale.

    Raises
    ------
    ValueError
        If no tree is passed and none can be read from the wordlist, or if
        `normalized` is neither false nor one of "jaccard" and "swadesh".

    Notes
    -----
    This function plots shared cognate percentages.
    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist, it determines the order of
    # the taxa
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ("jaccard", "swadesh"):
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix: integer counts if nothing is normalized
    matrix = np.zeros(
        (wordlist.width, wordlist.width),
        dtype=float if normalized else int
    )

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        dendrogram = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # use the leaf order of the dendrogram (reversed) for the heatmap
        taxa = dendrogram['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']
    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if _heatmap_matrix_given(keywords['matrix']):
        matrix = keywords['matrix']
    else:
        for i, taxon_a in enumerate(taxa):
            for j, taxon_b in enumerate(taxa):
                if i < j:
                    shared = _heatmap_shared_cognates(
                        wordlist, taxon_a, taxon_b, ref, normalized
                    )
                    matrix[i][j] = shared
                    # without refB the matrix is symmetric
                    if not refB:
                        matrix[j][i] = shared
                elif i > j and refB:
                    # the lower triangle shows the second cognate column
                    matrix[i][j] = _heatmap_shared_cognates(
                        wordlist, taxon_a, taxon_b, refB, normalized
                    )
                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxon_a,
                            flat=True,
                            entry=ref
                        )
                        matrix[i][j] = len(set(cogs))

    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )

    if keywords['distances']:
        for i, line in enumerate(matrix):
            for j in range(len(line)):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']]
    ]

    im = ax2.matshow(
        nmatrix,
        aspect='auto',
        origin='lower',
        interpolation='nearest',
        cmap=keywords['cmap'],
        vmax=keywords['vmax'],
        vmin=keywords['vmin']
    )

    # set the xticks; the interval must be at least 1, otherwise range()
    # fails for taxon sets that are small compared to keywords['steps']
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    # modify taxon names if this is specified
    selected_taxa = [
        keywords['labels'][taxa[idx]]
        if taxa[idx] in keywords['labels'] else taxa[idx]
        for idx in idxs
    ]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    # write the matrix: one row per taxon, cells separated by tabs
    with open(filename + '.matrix', 'w') as handle:
        for i, taxon in enumerate(taxa):
            handle.write('{0:20}'.format(taxon))
            for cell in matrix[i]:
                if not normalized:
                    handle.write('\t{0:3}'.format(int(cell)))
                else:
                    handle.write('\t{0:.2f}'.format(cell))
            handle.write('\n')
    log.file_written(filename + '.' + fileformat)