Example #1
 def validate_vocabulary(self):
     vocabulary = self.vocabulary
     if vocabulary is not None:
         if isinstance(vocabulary, set):
             vocabulary = sorted(vocabulary)
         if not isinstance(vocabulary, Mapping):
             vocab = {}
             for i, t in enumerate(vocabulary):
                 if vocab.setdefault(t, i) != i:
                     msg = "Duplicate term in vocabulary: %r" % t
                     raise ValueError(msg)
             vocabulary = vocab
         else:
             indices = set(six.itervalues(vocabulary))
             if len(indices) != len(vocabulary):
                 raise ValueError("Vocabulary contains repeated indices.")
             for i in xrange(len(vocabulary)):
                 if i not in indices:
                     msg = ("Vocabulary of size %d doesn't contain index "
                            "%d." % (len(vocabulary), i))
                     raise ValueError(msg)
         if not vocabulary:
             raise ValueError("empty vocabulary passed to fit")
         self.fixed_vocabulary_ = True
         self.vocabulary_ = dict(vocabulary)
     else:
         self.fixed_vocabulary_ = False
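
A minimal usage sketch of the duplicate-term check above. It assumes, as in scikit-learn's CountVectorizer, that this validation runs when the vectorizer is fitted; the documents are illustrative.

# Hypothetical usage: a duplicated term in a list vocabulary is rejected.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(vocabulary=['spam', 'eggs', 'spam'])
try:
    vec.fit(['spam and eggs'])     # validation happens during fit
except ValueError as exc:
    print(exc)                     # Duplicate term in vocabulary: 'spam'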
Example #2
    def _term_counts_to_matrix(self, n_doc, i_indices, j_indices, values):
        """Construct COO matrix from indices and values.

        i_indices and j_indices should be constructed with _make_int_array.
        """
        # array("i") corresponds to np.intc, which is also what scipy.sparse
        # wants for indices, so they won't be copied by the coo_matrix ctor.
        # The length check works around a bug in old NumPy versions:
        # http://projects.scipy.org/numpy/ticket/1943
        if len(i_indices) > 0:
            i_indices = np.frombuffer(i_indices, dtype=np.intc)
        if len(j_indices) > 0:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)

        if self.dtype == np.intc and len(values) > 0:
            values = np.frombuffer(values, dtype=np.intc)
        else:
            # In Python 3.2, SciPy 0.10.1, the coo_matrix ctor won't accept an
            # array.array.
            values = np.asarray(values, dtype=self.dtype)

        shape = (n_doc, max(six.itervalues(self.vocabulary_)) + 1)
        spmatrix = sp.coo_matrix((values, (i_indices, j_indices)),
                                 shape=shape, dtype=self.dtype)
        if self.binary:
            spmatrix.data.fill(1)
        return spmatrix
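
The index arrays come from _make_int_array, which (judging from the array("i") comment above) is presumably a plain array.array of C ints. A self-contained sketch of the same COO construction, with _make_int_array re-implemented here under that assumption:

from array import array
import numpy as np
import scipy.sparse as sp

def _make_int_array():
    """array.array of C ints, matching np.intc for zero-copy frombuffer."""
    return array('i')

i_indices, j_indices, values = _make_int_array(), _make_int_array(), _make_int_array()
for row, col, val in [(0, 0, 1), (0, 2, 2), (1, 1, 1)]:   # (doc, term, count)
    i_indices.append(row)
    j_indices.append(col)
    values.append(val)

X = sp.coo_matrix((np.frombuffer(values, dtype=np.intc),
                   (np.frombuffer(i_indices, dtype=np.intc),
                    np.frombuffer(j_indices, dtype=np.intc))),
                  shape=(2, 3))
print(X.toarray())   # [[1 0 2]
                     #  [0 1 0]]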
Example #3
File: knn.py  Project: cimor/skl-groups
def topological_sort(deps):
    '''
    Topologically sort a DAG, represented by a dict of child => set of parents.
    The dependency dict is destroyed during operation.

    Uses the Kahn algorithm: http://en.wikipedia.org/wiki/Topological_sorting
    Not a particularly good implementation, but we're just running it on tiny
    graphs.
    '''
    order = []
    available = set()

    def _move_available():
        to_delete = []
        for n, parents in iteritems(deps):
            if not parents:
                available.add(n)
                to_delete.append(n)
        for n in to_delete:
            del deps[n]

    _move_available()
    while available:
        n = available.pop()
        order.append(n)
        for parents in itervalues(deps):
            parents.discard(n)
        _move_available()

    if deps:
        # anything still left in deps at this point is part of a cycle
        raise ValueError("dependency cycle found")
    return order
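
A small usage sketch (assuming the six iteritems/itervalues imports the module relies on): the input maps each node to the set of nodes it depends on, and it is consumed in place.

deps = {'c': {'a', 'b'}, 'b': {'a'}, 'a': set()}
print(topological_sort(deps))   # ['a', 'b', 'c']
assert not deps                 # the dependency dict was emptied during the sort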
Example #4
File: knn.py  Project: zshwuhan/skl-groups
def topological_sort(deps):
    '''
    Topologically sort a DAG, represented by a dict of child => set of parents.
    The dependency dict is destroyed during operation.

    Uses the Kahn algorithm: http://en.wikipedia.org/wiki/Topological_sorting
    Not a particularly good implementation, but we're just running it on tiny
    graphs.
    '''
    order = []
    available = set()

    def _move_available():
        to_delete = []
        for n, parents in iteritems(deps):
            if not parents:
                available.add(n)
                to_delete.append(n)
        for n in to_delete:
            del deps[n]

    _move_available()
    while available:
        n = available.pop()
        order.append(n)
        for parents in itervalues(deps):
            parents.discard(n)
        _move_available()

    if deps:
        # anything still left in deps at this point is part of a cycle
        raise ValueError("dependency cycle found")
    return order
Example #5
 def __init__(self, input='content', encoding='utf-8',
              decode_error='strict', strip_accents=None,
              lowercase=True, preprocessor=None, tokenizer=None,
              stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
              ngram_range=(1, 1), analyzer='word',
              max_df=1.0, min_df=1, max_features=None,
              vocabulary=None, binary=False, dtype=np.int64):
     self.input = input
     self.encoding = encoding
     self.decode_error = decode_error
     self.strip_accents = strip_accents
     self.preprocessor = preprocessor
     self.tokenizer = tokenizer
     self.analyzer = analyzer
     self.lowercase = lowercase
     self.token_pattern = token_pattern
     self.stop_words = stop_words
     self.max_df = max_df
     self.min_df = min_df
     if max_df < 0 or min_df < 0:
         raise ValueError("negative value for max_df of min_df")
     self.max_features = max_features
     if max_features is not None:
         if (not isinstance(max_features, numbers.Integral) or
                 max_features <= 0):
             raise ValueError(
                 "max_features=%r, neither a positive integer nor None"
                 % max_features)
     self.ngram_range = ngram_range
     if vocabulary is not None:
         if not isinstance(vocabulary, Mapping):
             vocab = {}
             for i, t in enumerate(vocabulary):
                 if vocab.setdefault(t, i) != i:
                     msg = "Duplicate term in vocabulary: %r" % t
                     raise ValueError(msg)
             vocabulary = vocab
         else:
             indices = set(six.itervalues(vocabulary))
             if len(indices) != len(vocabulary):
                 raise ValueError("Vocabulary contains repeated indices.")
             for i in xrange(len(vocabulary)):
                 if i not in indices:
                     msg = ("Vocabulary of size %d doesn't contain index "
                            "%d." % (len(vocabulary), i))
                     raise ValueError(msg)
         if not vocabulary:
             raise ValueError("empty vocabulary passed to fit")
         self.fixed_vocabulary = True
         self.vocabulary_ = dict(vocabulary)
     else:
         self.fixed_vocabulary = False
     self.binary = binary
     self.dtype = dtype
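
The vocabulary handling above, restated as a standalone helper (the name normalize_vocabulary is made up for this sketch; it is not part of the class): a sequence becomes a term-to-index dict, and a dict must cover every index from 0 to len - 1 exactly once.

def normalize_vocabulary(vocabulary):
    if not isinstance(vocabulary, dict):
        vocab = {}
        for i, t in enumerate(vocabulary):
            if vocab.setdefault(t, i) != i:
                raise ValueError("Duplicate term in vocabulary: %r" % t)
        return vocab
    indices = set(vocabulary.values())
    if len(indices) != len(vocabulary):
        raise ValueError("Vocabulary contains repeated indices.")
    for i in range(len(vocabulary)):
        if i not in indices:
            raise ValueError("Vocabulary of size %d doesn't contain index %d."
                             % (len(vocabulary), i))
    return dict(vocabulary)

print(normalize_vocabulary(['ham', 'spam']))   # {'ham': 0, 'spam': 1}
normalize_vocabulary({'ham': 0, 'spam': 2})    # ValueError: index 1 is missing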
Example #6
    def _term_count_dicts_to_matrix(self, term_count_dicts):
        i_indices = []
        j_indices = []
        values = []
        vocabulary = self.vocabulary_

        for i, term_count_dict in enumerate(term_count_dicts):
            for term, count in six.iteritems(term_count_dict):
                j = vocabulary.get(term)
                if j is not None:
                    i_indices.append(i)
                    j_indices.append(j)
                    values.append(count)
            # free memory as we go
            term_count_dict.clear()

        shape = (i + 1, max(six.itervalues(vocabulary)) + 1)
        spmatrix = sp.coo_matrix((values, (i_indices, j_indices)),
                                 shape=shape, dtype=self.dtype)
        if self.binary:
            spmatrix.data.fill(1)
        return spmatrix
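
A self-contained sketch of the same conversion with made-up data: each document's term counts become (row, column, value) triples for a COO matrix, and out-of-vocabulary terms are silently dropped.

import scipy.sparse as sp

vocabulary = {'ham': 0, 'spam': 1}
term_count_dicts = [{'ham': 2}, {'spam': 1, 'ham': 1, 'unknown': 3}]

i_indices, j_indices, values = [], [], []
for i, counts in enumerate(term_count_dicts):
    for term, count in counts.items():
        j = vocabulary.get(term)
        if j is not None:                  # skip terms not in the vocabulary
            i_indices.append(i)
            j_indices.append(j)
            values.append(count)

X = sp.coo_matrix((values, (i_indices, j_indices)),
                  shape=(len(term_count_dicts), len(vocabulary)))
print(X.toarray())                         # [[2 0]
                                           #  [1 1]]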
Example #7
    def __init__(self, input='content', encoding='utf-8', charset=None,
                 decode_error='strict', charset_error=None,
                 strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        if charset is not None:
            warnings.warn("The charset parameter is deprecated as of version "
                          "0.14 and will be removed in 0.16. Use encoding "
                          "instead.",
                          DeprecationWarning)
            self.encoding = charset
        if charset_error is not None:
            warnings.warn("The charset_error parameter is deprecated as of "
                          "version 0.14 and will be removed in 0.16. Use "
                          "decode_error instead.",
                          DeprecationWarning)
            self.decode_error = charset_error

        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        if max_df < 0 or min_df < 0:
            raise ValueError("negative value for max_df of min_df")
        self.max_features = max_features
        if max_features is not None:
            if (not isinstance(max_features, numbers.Integral) or
                    max_features <= 0):
                raise ValueError(
                    "max_features=%r, neither a positive integer nor None"
                    % max_features)
        self.ngram_range = ngram_range
        if vocabulary is not None:
            if not isinstance(vocabulary, Mapping):
                vocabulary = dict((t, i) for i, t in enumerate(vocabulary))
            if not vocabulary:
                raise ValueError("empty vocabulary passed to fit")
            indices = set(six.itervalues(vocabulary))
            if len(indices) != len(vocabulary):
                raise ValueError("Vocabulary contains repeated indices.")
            for i in xrange(len(vocabulary)):
                if i not in indices:
                    msg = "Vocabulary of size %d doesn't contain index %d."
                    raise ValueError(msg % (len(vocabulary), i))
            self.fixed_vocabulary = True
            self.vocabulary_ = dict(vocabulary)
        else:
            self.fixed_vocabulary = False
        self.binary = binary
        self.dtype = dtype
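
A brief sketch of the deprecation path, assuming CountVectorizer here is the 0.14-era class whose constructor is shown above (recent scikit-learn releases no longer accept these keywords): the old charset/charset_error arguments still work, but they warn and are copied onto encoding/decode_error.

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    vec = CountVectorizer(charset='latin-1', charset_error='ignore')

print(vec.encoding)        # 'latin-1'  (taken from charset)
print(vec.decode_error)    # 'ignore'   (taken from charset_error)
print(caught[0].category)  # <class 'DeprecationWarning'>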
Example #8
    def __init__(self,
                 input='content',
                 encoding='utf-8',
                 charset=None,
                 decode_error='strict',
                 charset_error=None,
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 analyzer='word',
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=np.int64):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        if charset is not None:
            warnings.warn(
                "The charset parameter is deprecated as of version "
                "0.14 and will be removed in 0.16. Use encoding "
                "instead.", DeprecationWarning)
            self.encoding = charset
        if charset_error is not None:
            warnings.warn(
                "The charset_error parameter is deprecated as of "
                "version 0.14 and will be removed in 0.16. Use "
                "decode_error instead.", DeprecationWarning)
            self.decode_error = charset_error

        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        if max_df < 0 or min_df < 0:
            raise ValueError("negative value for max_df of min_df")
        self.max_features = max_features
        if max_features is not None:
            if (not isinstance(max_features, numbers.Integral)
                    or max_features <= 0):
                raise ValueError(
                    "max_features=%r, neither a positive integer nor None" %
                    max_features)
        self.ngram_range = ngram_range
        if vocabulary is not None:
            if not isinstance(vocabulary, Mapping):
                vocab = {}
                for i, t in enumerate(vocabulary):
                    if vocab.setdefault(t, i) != i:
                        msg = "Duplicate term in vocabulary: %r" % t
                        raise ValueError(msg)
                vocabulary = vocab
            else:
                indices = set(six.itervalues(vocabulary))
                if len(indices) != len(vocabulary):
                    raise ValueError("Vocabulary contains repeated indices.")
                for i in xrange(len(vocabulary)):
                    if i not in indices:
                        msg = ("Vocabulary of size %d doesn't contain index "
                               "%d." % (len(vocabulary), i))
                        raise ValueError(msg)
            if not vocabulary:
                raise ValueError("empty vocabulary passed to fit")
            self.fixed_vocabulary = True
            self.vocabulary_ = dict(vocabulary)
        else:
            self.fixed_vocabulary = False
        self.binary = binary
        self.dtype = dtype
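
The remaining parameter checks in this constructor, isolated into a hypothetical helper for illustration (check_params is not part of the class):

import numbers

def check_params(max_df, min_df, max_features):
    if max_df < 0 or min_df < 0:
        raise ValueError("negative value for max_df or min_df")
    if max_features is not None:
        if not isinstance(max_features, numbers.Integral) or max_features <= 0:
            raise ValueError("max_features=%r, neither a positive integer nor None"
                             % max_features)

check_params(max_df=0.95, min_df=2, max_features=None)   # passes silently
check_params(max_df=1.0, min_df=1, max_features=0)       # raises ValueError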
Example #9
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return the count vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
        """
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        fixed_vocab = self.fixed_vocabulary

        if fixed_vocab:
            vocab = self.vocabulary_
            vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
        else:
            vocab = {}
            vocab_max_ind = 0

        # Result of document conversion to term count arrays.
        row_ind = _make_int_array()
        col_ind = _make_int_array()
        feature_values = _make_int_array()
        term_counts = Counter()

        # term counts across entire corpus (count each term maximum once per
        # document)
        document_counts = Counter()

        analyze = self.build_analyzer()

        for n_doc, doc in enumerate(raw_documents):
            term_count_current = Counter(analyze(doc))
            term_counts.update(term_count_current)

            if not fixed_vocab:
                for term in six.iterkeys(term_count_current):
                    if term not in vocab:
                        vocab[term] = vocab_max_ind
                        vocab_max_ind += 1

            document_counts.update(six.iterkeys(term_count_current))

            for term, count in six.iteritems(term_count_current):
                if term in vocab:
                    row_ind.append(n_doc)
                    col_ind.append(vocab[term])
                    feature_values.append(count)
        n_doc += 1

        if fixed_vocab:
            # XXX max_df, min_df and max_features have no effect
            # with a fixed vocabulary.
            i_indices = row_ind
            j_indices = col_ind
            values = feature_values
        else:
            max_features = self.max_features
            max_df = self.max_df
            min_df = self.min_df

            max_doc_count = (max_df if isinstance(max_df, numbers.Integral)
                                    else max_df * n_doc)
            min_doc_count = (min_df if isinstance(min_df, numbers.Integral)
                                    else min_df * n_doc)

            # filter out stop words: terms that occur in almost all documents
            if max_doc_count < n_doc or min_doc_count > 1:
                stop_words = set(t for t, dc in six.iteritems(document_counts)
                                   if not min_doc_count <= dc <= max_doc_count)
            else:
                stop_words = set()

            # list the terms that should be part of the vocabulary
            if max_features is None:
                terms = set(term_counts) - stop_words
            else:
                # extract the most frequent terms for the vocabulary
                terms = set()
                for t, tc in term_counts.most_common():
                    if t not in stop_words:
                        terms.add(t)
                    if len(terms) >= max_features:
                        break

            # store the learned stop words to make it easier to debug the value
            # of max_df
            self.stop_words_ = stop_words

            # free memory
            term_counts.clear()
            document_counts.clear()

            # store map from term name to feature integer index: we sort the
            # terms to have reproducible outcome for the vocabulary structure:
            # otherwise the mapping from feature name to indices might depend
            # on the memory layout of the machine. Furthermore sorted terms
            # might make it possible to perform binary search in the feature
            # names array.
            terms = sorted(terms)

            # reorder term indices
            reorder_indices = dict((vocab[term], i)
                                   for i, term in enumerate(terms))
            self.vocabulary_ = dict(((t, i) for i, t in enumerate(terms)))

            # create term count arrays with new vocabulary structure
            i_indices = _make_int_array()
            j_indices = _make_int_array()
            values = _make_int_array()
            for i, col in enumerate(col_ind):
                if col in reorder_indices:
                    i_indices.append(row_ind[i])
                    j_indices.append(reorder_indices[col_ind[i]])
                    values.append(feature_values[i])

            # free memory
            del reorder_indices
            del row_ind
            del col_ind
            del feature_values

        if not vocab:
            msg = "Empty vocabulary; "
            if fixed_vocab:
                msg += "%r passed to constructor." % vocab
            else:
                msg += "perhaps your documents contain stop words only?"
            raise ValueError(msg)

        # the term_counts and document_counts might be useful statistics; are
        # we really sure we want to drop them? They take some memory but
        # can be useful for corpus introspection.
        return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
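
A usage sketch of fit_transform, assuming (as in scikit-learn's CountVectorizer) the default word analyzer and a sorted learned vocabulary; the documents are illustrative.

from sklearn.feature_extraction.text import CountVectorizer

docs = ['spam spam eggs', 'eggs bacon', 'spam bacon bacon']
vec = CountVectorizer()
X = vec.fit_transform(docs)
print(vec.vocabulary_)   # {'bacon': 0, 'eggs': 1, 'spam': 2}
print(X.toarray())
# [[0 1 2]
#  [1 1 0]
#  [2 0 1]]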
Example #10
File: knn.py  Project: cimor/skl-groups
def _parse_specs(specs, Ks):
    '''
    Set up the different functions we need to call.

    Returns:
        - a dict mapping base estimator functions to _FuncInfo objects.
          If the function needs_alpha, then the alphas attribute is an array
          of alpha values and pos is a corresponding array of indices.
          Otherwise, alphas is None and pos is a list containing a single index.
          Indices are >= 0 if they correspond to something in a spec,
          and negative if they're just used for a meta estimator but not
          directly requested.
        - an OrderedDict mapping functions to _MetaFuncInfo objects.
          alphas and pos are like for _FuncInfo; deps is a list of indices
          which should be passed to the estimator. Note that these might be
          other meta functions; this list is guaranteed to be in an order
          such that all dependencies are resolved before calling that function.
          If no such order is possible, raise ValueError.
        - the number of meta-only results

    # TODO: update doctests for _parse_specs

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9'])
    ({<function alpha_div at 0x10954f848>:
            _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3])},
     OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[-4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[-4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 4)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2', 'linear'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)
    '''
    funcs = {}
    metas = {}
    meta_deps = defaultdict(set)

    def add_func(func, alpha=None, pos=None):
        needs_alpha = getattr(func, 'needs_alpha', False)
        is_meta = hasattr(func, 'needs_results')

        d = metas if is_meta else funcs
        if func not in d:
            if needs_alpha:
                args = {'alphas': [alpha], 'pos': [pos]}
            else:
                args = {'alphas': None, 'pos': [pos]}

            if not is_meta:
                d[func] = _FuncInfo(**args)
            else:
                d[func] = _MetaFuncInfo(deps=[], **args)
                for req in func.needs_results:
                    if callable(req.alpha):
                        req_alpha = req.alpha(alpha)
                    else:
                        req_alpha = req.alpha
                    add_func(req.func, alpha=req_alpha)
                    meta_deps[func].add(req.func)
                    meta_deps[req.func]  # make sure required func is in there

        else:
            # already have an entry for the func
            # need to give it this pos, if it's not None
            # and also make sure that the alpha is present
            info = d[func]
            if not needs_alpha:
                if pos is not None:
                    if info.pos != [None]:
                        msg = "{} passed more than once"
                        raise ValueError(msg.format(func_name))

                    info.pos[0] = pos
            else:  # needs alpha
                try:
                    idx = info.alphas.index(alpha)
                except ValueError:
                    # this is a new alpha value we haven't seen yet
                    info.alphas.append(alpha)
                    info.pos.append(pos)
                    if is_meta:
                        for req in func.needs_results:
                            if callable(req.alpha):
                                req_alpha = req.alpha(alpha)
                            else:
                                req_alpha = req.alpha
                            add_func(req.func, alpha=req_alpha)
                else:
                    # repeated alpha value
                    if pos is not None:
                        if info.pos[idx] is not None:
                            msg = "{} with alpha {} passed more than once"
                            raise ValueError(msg.format(func_name, alpha))
                        info.pos[idx] = pos

    # add functions for each spec
    for i, spec in enumerate(specs):
        func_name, alpha = (spec.split(':', 1) + [None])[:2]
        if alpha is not None:
            alpha = float(alpha)

        try:
            func = func_mapping[func_name]
        except KeyError:
            msg = "'{}' is not a known function type"
            raise ValueError(msg.format(func_name))

        needs_alpha = getattr(func, 'needs_alpha', False)
        if needs_alpha and alpha is None:
            msg = "{} needs alpha but not passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))
        elif not needs_alpha and alpha is not None:
            msg = "{} doesn't need alpha but is passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))

        add_func(func, alpha, i)

    # number things that are dependencies only
    meta_counter = itertools.count(-1, step=-1)
    for info in itertools.chain(itervalues(funcs), itervalues(metas)):
        for i, pos in enumerate(info.pos):
            if pos is None:
                info.pos[i] = next(meta_counter)

    # fill in the dependencies for metas
    for func, info in iteritems(metas):
        deps = info.deps
        assert deps == []

        for req in func.needs_results:
            f = req.func
            req_info = (metas if hasattr(f, 'needs_results') else funcs)[f]
            if req.alpha is not None:
                if callable(req.alpha):
                    req_alpha = req.alpha(info.alphas)
                else:
                    req_alpha = req.alpha

                find_alpha = np.vectorize(req_info.alphas.index, otypes=[int])
                pos = np.asarray(req_info.pos)[find_alpha(req_alpha)]
                if np.isscalar(pos):
                    deps.append(pos[()])
                else:
                    deps.extend(pos)
            else:
                pos, = req_info.pos
                deps.append(pos)

    # topological sort of metas
    meta_order = topological_sort(meta_deps)
    metas_ordered = OrderedDict(
        (f, metas[f]) for f in meta_order if hasattr(f, 'needs_results'))

    return funcs, metas_ordered, -next(meta_counter) - 1
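
Each spec string is split into a function name and an optional alpha value; a minimal standalone illustration of that parsing step (split_spec is a made-up name for this sketch):

def split_spec(spec):
    func_name, alpha = (spec.split(':', 1) + [None])[:2]
    return func_name, float(alpha) if alpha is not None else None

print(split_spec('renyi:.8'))    # ('renyi', 0.8)
print(split_spec('hellinger'))   # ('hellinger', None)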
Example #11
File: knn.py  Project: zshwuhan/skl-groups
def _parse_specs(specs, Ks):
    '''
    Set up the different functions we need to call.

    Returns:
        - a dict mapping base estimator functions to _FuncInfo objects.
          If the function needs_alpha, then the alphas attribute is an array
          of alpha values and pos is a corresponding array of indices.
          Otherwise, alphas is None and pos is a list containing a single index.
          Indices are >= 0 if they correspond to something in a spec,
          and negative if they're just used for a meta estimator but not
          directly requested.
        - an OrderedDict mapping functions to _MetaFuncInfo objects.
          alphas and pos are like for _FuncInfo; deps is a list of indices
          which should be passed to the estimator. Note that these might be
          other meta functions; this list is guaranteed to be in an order
          such that all dependencies are resolved before calling that function.
          If no such order is possible, raise ValueError.
        - the number of meta-only results

    # TODO: update doctests for _parse_specs

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9'])
    ({<function alpha_div at 0x10954f848>:
            _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3])},
     OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[-4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[-4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 4)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2', 'linear'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)
    '''
    funcs = {}
    metas = {}
    meta_deps = defaultdict(set)

    def add_func(func, alpha=None, pos=None):
        needs_alpha = getattr(func, 'needs_alpha', False)
        is_meta = hasattr(func, 'needs_results')

        d = metas if is_meta else funcs
        if func not in d:
            if needs_alpha:
                args = {'alphas': [alpha], 'pos': [pos]}
            else:
                args = {'alphas': None, 'pos': [pos]}

            if not is_meta:
                d[func] = _FuncInfo(**args)
            else:
                d[func] = _MetaFuncInfo(deps=[], **args)
                for req in func.needs_results:
                    if callable(req.alpha):
                        req_alpha = req.alpha(alpha)
                    else:
                        req_alpha = req.alpha
                    add_func(req.func, alpha=req_alpha)
                    meta_deps[func].add(req.func)
                    meta_deps[req.func]  # make sure required func is in there

        else:
            # already have an entry for the func
            # need to give it this pos, if it's not None
            # and also make sure that the alpha is present
            info = d[func]
            if not needs_alpha:
                if pos is not None:
                    if info.pos != [None]:
                        msg = "{} passed more than once"
                        raise ValueError(msg.format(func_name))

                    info.pos[0] = pos
            else:  # needs alpha
                try:
                    idx = info.alphas.index(alpha)
                except ValueError:
                    # this is a new alpha value we haven't seen yet
                    info.alphas.append(alpha)
                    info.pos.append(pos)
                    if is_meta:
                        for req in func.needs_results:
                            if callable(req.alpha):
                                req_alpha = req.alpha(alpha)
                            else:
                                req_alpha = req.alpha
                            add_func(req.func, alpha=req_alpha)
                else:
                    # repeated alpha value
                    if pos is not None:
                        if info.pos[idx] is not None:
                            msg = "{} with alpha {} passed more than once"
                            raise ValueError(msg.format(func_name, alpha))
                        info.pos[idx] = pos

    # add functions for each spec
    for i, spec in enumerate(specs):
        func_name, alpha = (spec.split(':', 1) + [None])[:2]
        if alpha is not None:
            alpha = float(alpha)

        try:
            func = func_mapping[func_name]
        except KeyError:
            msg = "'{}' is not a known function type"
            raise ValueError(msg.format(func_name))

        needs_alpha = getattr(func, 'needs_alpha', False)
        if needs_alpha and alpha is None:
            msg = "{} needs alpha but not passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))
        elif not needs_alpha and alpha is not None:
            msg = "{} doesn't need alpha but is passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))

        add_func(func, alpha, i)

    # number things that are dependencies only
    meta_counter = itertools.count(-1, step=-1)
    for info in itertools.chain(itervalues(funcs), itervalues(metas)):
        for i, pos in enumerate(info.pos):
            if pos is None:
                info.pos[i] = next(meta_counter)

    # fill in the dependencies for metas
    for func, info in iteritems(metas):
        deps = info.deps
        assert deps == []

        for req in func.needs_results:
            f = req.func
            req_info = (metas if hasattr(f, 'needs_results') else funcs)[f]
            if req.alpha is not None:
                if callable(req.alpha):
                    req_alpha = req.alpha(info.alphas)
                else:
                    req_alpha = req.alpha

                find_alpha = np.vectorize(req_info.alphas.index, otypes=[int])
                pos = np.asarray(req_info.pos)[find_alpha(req_alpha)]
                if np.isscalar(pos):
                    deps.append(pos[()])
                else:
                    deps.extend(pos)
            else:
                pos, = req_info.pos
                deps.append(pos)

    # topological sort of metas
    meta_order = topological_sort(meta_deps)
    metas_ordered = OrderedDict(
        (f, metas[f]) for f in meta_order if hasattr(f, 'needs_results'))

    return funcs, metas_ordered, -next(meta_counter) - 1