Пример #1
0
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # make lambda inline for data-check
            l = lambda x: ['-' if x == 0 else x][0]

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
Пример #2
0
def calculate_data(wordlist,
                   data,
                   taxa='taxa',
                   concepts='concepts',
                   ref='cogid',
                   **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(keywords,
                     distances=False,
                     tree_calc="upgma",
                     cluster="upgma",
                     force=False,
                     threshold=0.5,
                     cluster_method='upgma')

    # get taxa for current calculation
    these_taxa = eval('wordlist.' + taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(wordlist, taxa, concepts, ref,
                                             **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning("Reference tree has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning("Distance matrix has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
Пример #3
0
    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # make lambda inline for data-check
            l = lambda x: ['-' if x == 0 else x][0]

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
Пример #4
0
 def test_matrix2groups(self):
     for method in 'upgma mcl simple complete'.split():
         matrix2groups(0.5, self.matrix, self.taxa, cluster_method=method)
Пример #5
0
def calculate_data(
        wordlist,
        data,
        taxa='taxa',
        concepts='concepts',
        ref='cogid',
        **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods


    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for current calculation
    these_taxa = eval('wordlist.' + taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
                wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warn(
                    "Reference tree has already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warn(
                    "Distance matrix has already been calculated, "
                    "force overwrite by "
                    "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
Пример #6
0
    def test_matrix2groups(self):
        from lingpy.algorithm.clustering import matrix2groups

        for method in 'upgma mcl simple complete'.split():
            matrix2groups(0.5, [[1, 0], [0, 1]], ['a', 'b'],
                          cluster_method=method)
Пример #7
0
    def test_matrix2groups(self):
        from lingpy.algorithm.clustering import matrix2groups

        for method in 'upgma mcl simple complete'.split():
            matrix2groups(0.5, [[1, 0], [0, 1]], ['a', 'b'], cluster_method=method)
Пример #8
0
    def test_matrix2groups(self):
        from lingpy.algorithm.clustering import matrix2groups

        for method in 'upgma mcl simple complete'.split():
            matrix2groups(0.5, self.matrix, self.taxa, cluster_method=method)
Пример #9
0
    def test_matrix2groups(self):
        from lingpy.algorithm.clustering import matrix2groups

        for method in 'upgma mcl simple complete'.split():
            matrix2groups(0.5, self.matrix, self.taxa, cluster_method=method)
Пример #10
0
def test_matrix2groups(matrix, taxa):
    for method in 'upgma mcl simple complete'.split():
        matrix2groups(0.5, matrix, taxa, cluster_method=method)