Exemplo n.º 1
0
class same_subfolder(web.Filter):
    '''Filter that rejects content sharing a subfolder with the query.

    The predicate built by :meth:`create_predicate` returns ``False``
    for every content id that is an item of one of the query's parent
    subfolders, and for every content id found on the other side of a
    label directly connected to such an item.

    NOTE(review): ``self.query_content_id`` is not assigned in this
    class; presumably the ``web.Filter`` machinery sets it before
    ``create_predicate`` is called --- confirm against the base class.
    '''
    def __init__(self, kvlclient, label_store):
        '''Build the filter.

        :param kvlclient: client handed to ``Folders`` to read the
                          folder structure.
        :param label_store: store queried with ``directly_connected``
                            for labels attached to folder items.
        '''
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        '''Return a predicate over ``(content_id, fc)`` pairs.

        The predicate is ``True`` when ``content_id`` is *not* among
        the ids collected from the query's parent subfolders (and the
        labels directly connected to their items).

        :rtype: callable taking one ``(content_id, fc)`` pair
        '''
        subfolders = self.folders.parent_subfolders(self.query_content_id)
        cids = set()
        for folder_id, subfolder_id in subfolders:
            for cid, subid in self.folders.items(folder_id, subfolder_id):
                cids.add(cid)
                # Also add directly connected labels too.
                for lab in self.label_store.directly_connected((cid, subid)):
                    cids.add(lab.other(cid))

        def predicate(content_obj):
            # Unpack explicitly: tuple parameters such as
            # ``lambda (content_id, fc): ...`` are a syntax error on
            # Python 3 (PEP 3113). Only the content id is consulted.
            content_id, _ = content_obj
            return content_id not in cids
        return predicate
Exemplo n.º 2
0
class same_subfolder(web.Filter):
    '''Filter that rejects content sharing a subfolder with the query.

    The predicate built by :meth:`create_predicate` returns ``False``
    for every content id that is an item of one of the query's parent
    subfolders, and for every content id found on the other side of a
    label directly connected to such an item.

    NOTE(review): ``self.query_content_id`` is not assigned in this
    class; presumably the ``web.Filter`` machinery sets it before
    ``create_predicate`` is called --- confirm against the base class.
    '''
    def __init__(self, kvlclient, label_store):
        '''Build the filter.

        :param kvlclient: client handed to ``Folders`` to read the
                          folder structure.
        :param label_store: store queried with ``directly_connected``
                            for labels attached to folder items.
        '''
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        '''Return a predicate over ``(content_id, fc)`` pairs.

        The predicate is ``True`` when ``content_id`` is *not* among
        the ids collected from the query's parent subfolders (and the
        labels directly connected to their items).

        :rtype: callable taking one ``(content_id, fc)`` pair
        '''
        subfolders = self.folders.parent_subfolders(self.query_content_id)
        cids = set()
        for folder_id, subfolder_id in subfolders:
            for cid, subid in self.folders.items(folder_id, subfolder_id):
                cids.add(cid)
                # Also add directly connected labels too.
                for lab in self.label_store.directly_connected((cid, subid)):
                    cids.add(lab.other(cid))

        def predicate(content_obj):
            # Unpack explicitly: tuple parameters such as
            # ``lambda (content_id, fc): ...`` are a syntax error on
            # Python 3 (PEP 3113). Only the content id is consulted.
            content_id, _ = content_obj
            return content_id not in cids
        return predicate
Exemplo n.º 3
0
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''
    def __init__(self,
                 store,
                 label_store,
                 content_id,
                 subtopic_id=None,
                 canopy_limit=None,
                 label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param subtopic_id: The query subtopic id; together with
                            ``content_id`` it identifies the query when
                            labels are gathered.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        # Filled in by `train`. Starting with an empty mapping means
        # `as_result` reports a weight of None instead of raising
        # AttributeError when no model has been trained yet.
        self.feature_weights = {}
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit

    def as_result(self, cid, fc, p):
        '''Bundle a scored candidate with per-feature diagnostics.

        For every feature name shared by the query FC and ``fc`` an
        entry is produced holding the learned weight (``None`` when
        unknown) and, when both features are ``StringCounter``
        instances, the sorted non-empty values they have in common and
        the value computed by ``cosine`` over their counts.

        ``self.query_fc`` must already be loaded (``probabilities``
        does this).

        :param cid: candidate content id
        :param fc: candidate feature collection
        :param p: probability assigned to the candidate
        :rtype: ``(cid, fc, dict)``
        '''
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = dict((n, {
            'kernel': 'cosine',
            'feature1': n,
            'feature2': n,
            'kernel_value': None,
            'weight': self.feature_weights.get(n),
            'common_feature_values': [],
        }) for n in fnames)
        for n in fnames:
            qfeat, cfeat = self.query_fc[n], fc[n]
            # Only string-count features get value-level diagnostics.
            if not (isinstance(qfeat, StringCounter)
                    and isinstance(cfeat, StringCounter)):
                continue
            qvals, cvals = set(qfeat.keys()), set(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(v for v in qvals & cvals if v)

            all_vals = sorted(qvals | cvals)
            if all_vals:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                # NOTE(review): `cosine` is defined elsewhere; confirm
                # whether it yields a similarity or a distance.
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            # Materialise so callers get a list on Python 3 as well.
            'intermediate_model_results': list(intermediates.values()),
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found, or the
        canopy query produces no candidates, an empty list is returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        :raises InsufficientTrainingData: when :meth:`train` cannot
          build a model from the available labels.
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if not candidates:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        scores = self.classify(
            feature_names, classifier, transformer, candidates)
        # `zip` is lazy on Python 3; the documented return type is a list.
        return list(zip(candidates, scores))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        On success ``self.feature_weights`` is replaced with a mapping
        from feature name to learned coefficient.

        This method returns ``None`` if there is insufficient
        training data to produce a model, i.e. when the labels contain
        at most one distinct coreference value.

        :param content_objs: ``[(content_id, FeatureCollection)]``
        :param idx_labels: Ground truth data.
        :type idx_labels: list of ``({-1, 1}, index1, index2)`` where
                          the indices refer to ``content_objs``.
        :rtype: ``(feature_names, classifier, transformer)`` or ``None``
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set(lab[0] for lab in idx_labels)) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dicts.append(
                dict((name, dis[name][i, j]) for name in feature_names))

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        # NOTE(review): newer scikit-learn releases replaced
        # class_weight='auto' with 'balanced'; confirm against the
        # version this project pins before changing it.
        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        # Assumes the transformer keeps columns in `feature_names`
        # order --- TODO confirm.
        self.feature_weights = dict(
            (name, model.coef_[0][i])
            for i, name in enumerate(feature_names))
        return feature_names, model, vec

    def classify(self,
                 feature_names,
                 classifier,
                 transformer,
                 candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG

        :param query_fc: feature collection to compare candidates
                         against; defaults to ``self.query_fc``.
        '''
        if query_fc is None:
            query_fc = self.query_fc
        dis = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            dis[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        # (`range` rather than the Python-2-only `xrange`).
        phi_dicts = transformer.transform([
            dict((name, dis[name][i]) for name in feature_names)
            for i in range(len(candidates))
        ])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        '''Sample candidate content objects from the index scan.

        Content ids produced by :meth:`canopy_ids` are sampled with
        ``web.streaming_sample``, fetched from the store, and entries
        whose feature collection is ``None`` are dropped.

        :param limit: passed to ``web.streaming_sample`` and, through
                      ``hard_limit``, used as the scan's limit hint.
        :rtype: ``list`` of ``(content_id, FeatureCollection)``
        '''
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)), limit,
            hard_limit(limit))
        # The query itself never shows up here: `canopy_ids` blacklists
        # it. A list comprehension keeps the result a real list (the
        # caller checks its emptiness) and avoids the Python-2-only
        # tuple-parameter lambda.
        return [obj for obj in self.store.get_many(ids)
                if obj[1] is not None]

    def canopy_ids(self, limit_hint=None):
        '''Generate candidate content ids by scanning the store's indexes.

        Each distinct content id is yielded at most once and the query
        content id is never yielded. ``self.query_fc`` must already be
        loaded.

        :param limit_hint: sizes the per-index batch (one tenth of the
                           hint, at least 1); defaults to 1000.
        '''
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        #
        # Materialise the names: they are iterated once per pass of the
        # `while` loop below.
        index_names = list(self.store.index_names())
        # Never let the batch size reach 0: with a small hint that would
        # make every scan stop before yielding anything.
        batch_size = max(1, limit_hint // 10)
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    # Resume after the ids this key already contributed.
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]

        Delegates to :meth:`infer_subtopic_labels`.
        '''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        '''Gather sampled positive and negative labels for the query.

        :param limit: sample size handed to ``web.streaming_sample``
                      for each of the two label classes.
        :return: the positive sample followed by the negative sample.
        '''
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component and add the labels inferred from
        # folder membership.
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand(
            (cid, subid)) + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(pos_labels,
                                          limit,
                                          limit=hard_limit(limit))
        neg_sample = web.streaming_sample(neg_labels,
                                          limit,
                                          limit=hard_limit(limit))
        # Diagnostics go through the logger rather than being printed
        # to stdout from library code.
        logger.debug('positive label sample: %r', pos_sample)
        logger.debug('negative label sample: %r', neg_sample)
        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        '''Yield positive labels implied by the query's folders.

        For every item in a parent subfolder of the query a positive
        label between the query and that item is manufactured; explicit
        positive labels directly connected to the item on the same
        subtopic are yielded as well.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2, Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive, subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        '''Yield the labels produced by the module-level
        ``negative_subtopic_labels`` helper for the query.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]

        Fetches the feature collection for every content id mentioned
        by ``labels`` and keeps those that exist and contain at least
        one mapping-valued feature.
        '''
        # `collections.Mapping` only exists on older interpreters; the
        # ABCs live in `collections.abc` on Python 3.
        try:
            from collections.abc import Mapping
        except ImportError:
            from collections import Mapping

        def is_valid_fc(content_obj):
            # Explicit unpacking instead of a Python-2-only tuple
            # parameter.
            _, fc = content_obj
            if fc is None:
                return False
            return any(isinstance(fc[name], Mapping) for name in fc)

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return [obj for obj in self.store.get_many(ids) if is_valid_fc(obj)]
Exemplo n.º 4
0
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''
    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param subtopic_id: The query subtopic id; together with
                            ``content_id`` it identifies the query when
                            labels are gathered.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        # Filled in by `train`. Starting with an empty mapping means
        # `as_result` reports a weight of None instead of raising
        # AttributeError when no model has been trained yet.
        self.feature_weights = {}
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit

    def as_result(self, cid, fc, p):
        '''Bundle a scored candidate with per-feature diagnostics.

        For every feature name shared by the query FC and ``fc`` an
        entry is produced holding the learned weight (``None`` when
        unknown) and, when both features are ``StringCounter``
        instances, the sorted non-empty values they have in common and
        the value computed by ``cosine`` over their counts.

        ``self.query_fc`` must already be loaded (``probabilities``
        does this).

        :param cid: candidate content id
        :param fc: candidate feature collection
        :param p: probability assigned to the candidate
        :rtype: ``(cid, fc, dict)``
        '''
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = dict((n, {'kernel': 'cosine',
                                  'feature1': n,
                                  'feature2': n,
                                  'kernel_value': None,
                                  'weight': self.feature_weights.get(n),
                                  'common_feature_values': []})
                             for n in fnames)
        for n in fnames:
            qfeat, cfeat = self.query_fc[n], fc[n]
            # Only string-count features get value-level diagnostics.
            if not (isinstance(qfeat, StringCounter)
                    and isinstance(cfeat, StringCounter)):
                continue
            qvals, cvals = set(qfeat.keys()), set(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(v for v in qvals & cvals if v)

            all_vals = sorted(qvals | cvals)
            if all_vals:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                # NOTE(review): `cosine` is defined elsewhere; confirm
                # whether it yields a similarity or a distance.
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            # Materialise so callers get a list on Python 3 as well.
            'intermediate_model_results': list(intermediates.values()),
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found, or the
        canopy query produces no candidates, an empty list is returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        :raises InsufficientTrainingData: when :meth:`train` cannot
          build a model from the available labels.
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if not candidates:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        scores = self.classify(
            feature_names, classifier, transformer, candidates)
        # `zip` is lazy on Python 3; the documented return type is a list.
        return list(zip(candidates, scores))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        On success ``self.feature_weights`` is replaced with a mapping
        from feature name to learned coefficient.

        This method returns ``None`` if there is insufficient
        training data to produce a model, i.e. when the labels contain
        at most one distinct coreference value.

        :param content_objs: ``[(content_id, FeatureCollection)]``
        :param idx_labels: Ground truth data.
        :type idx_labels: list of ``({-1, 1}, index1, index2)`` where
                          the indices refer to ``content_objs``.
        :rtype: ``(feature_names, classifier, transformer)`` or ``None``
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set(lab[0] for lab in idx_labels)) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dicts.append(
                dict((name, dis[name][i, j]) for name in feature_names))

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        # NOTE(review): newer scikit-learn releases replaced
        # class_weight='auto' with 'balanced'; confirm against the
        # version this project pins before changing it.
        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        # Assumes the transformer keeps columns in `feature_names`
        # order --- TODO confirm.
        self.feature_weights = dict(
            (name, model.coef_[0][i])
            for i, name in enumerate(feature_names))
        return feature_names, model, vec

    def classify(self, feature_names, classifier, transformer, candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG

        :param query_fc: feature collection to compare candidates
                         against; defaults to ``self.query_fc``.
        '''
        if query_fc is None:
            query_fc = self.query_fc
        dis = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            dis[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        # (`range` rather than the Python-2-only `xrange`).
        phi_dicts = transformer.transform(
            [dict((name, dis[name][i]) for name in feature_names)
             for i in range(len(candidates))])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        '''Sample candidate content objects from the index scan.

        Content ids produced by :meth:`canopy_ids` are sampled with
        ``web.streaming_sample``, fetched from the store, and entries
        whose feature collection is ``None`` are dropped.

        :param limit: passed to ``web.streaming_sample`` and, through
                      ``hard_limit``, used as the scan's limit hint.
        :rtype: ``list`` of ``(content_id, FeatureCollection)``
        '''
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # The query itself never shows up here: `canopy_ids` blacklists
        # it. A list comprehension keeps the result a real list (the
        # caller checks its emptiness) and avoids the Python-2-only
        # tuple-parameter lambda.
        return [obj for obj in self.store.get_many(ids)
                if obj[1] is not None]

    def canopy_ids(self, limit_hint=None):
        '''Generate candidate content ids by scanning the store's indexes.

        Each distinct content id is yielded at most once and the query
        content id is never yielded. ``self.query_fc`` must already be
        loaded.

        :param limit_hint: sizes the per-index batch (one tenth of the
                           hint, at least 1); defaults to 1000.
        '''
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        #
        # Materialise the names: they are iterated once per pass of the
        # `while` loop below.
        index_names = list(self.store.index_names())
        # Never let the batch size reach 0: with a small hint that would
        # make every scan stop before yielding anything.
        batch_size = max(1, limit_hint // 10)
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    # Resume after the ids this key already contributed.
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]

        Delegates to :meth:`infer_subtopic_labels`.
        '''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        '''Gather sampled positive and negative labels for the query.

        :param limit: sample size handed to ``web.streaming_sample``
                      for each of the two label classes.
        :return: the positive sample followed by the negative sample.
        '''
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component and add the labels inferred from
        # folder membership.
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))
        # Diagnostics go through the logger rather than being printed
        # to stdout from library code.
        logger.debug('positive label sample: %r', pos_sample)
        logger.debug('negative label sample: %r', neg_sample)
        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        '''Yield positive labels implied by the query's folders.

        For every item in a parent subfolder of the query a positive
        label between the query and that item is manufactured; explicit
        positive labels directly connected to the item on the same
        subtopic are yielded as well.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2,
                            Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive,
                            subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        '''Yield the labels produced by the module-level
        ``negative_subtopic_labels`` helper for the query.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]

        Fetches the feature collection for every content id mentioned
        by ``labels`` and keeps those that exist and contain at least
        one mapping-valued feature.
        '''
        # `collections.Mapping` only exists on older interpreters; the
        # ABCs live in `collections.abc` on Python 3.
        try:
            from collections.abc import Mapping
        except ImportError:
            from collections import Mapping

        def is_valid_fc(content_obj):
            # Explicit unpacking instead of a Python-2-only tuple
            # parameter.
            _, fc = content_obj
            if fc is None:
                return False
            return any(isinstance(fc[name], Mapping) for name in fc)

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return [obj for obj in self.store.get_many(ids) if is_valid_fc(obj)]
Exemplo n.º 5
0
def worker(work_unit, max_sample=1000):
    '''Expects a coordinate WorkUnit for DragNet and runs the following
    steps:

    1. scans all dossiers at the *folder* level and assembles feature
    vectors for each folder -- see `make_feature`

    2. trains a multinomial naive Bayes classifier that treats each
    *folder* as a classifier target.

    3. sample the corpus by scanning up to `max_sample` and applying
    the classifier to each item to get an approx "size" of the Folder

    4. Bootstrap by treating those classifier predictions as truth
    data and extract the learned features that are predictive as new
    query strings.

    5. Put the data in kvlayer for webservice end point to return to
    polling client -- see dossier.models.routes

    :param work_unit: work unit whose ``spec['config']`` holds the
      global configuration
    :param int max_sample: maximum number of items taken from
      ``store.scan()`` in step 3
    :returns: dict mapping folder id to the number of sampled items
      the classifier assigned to that folder
    :raises coordinate.exceptions.ProgrammerError: if the work unit
      spec has no ``'config'`` entry

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run dragnet without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):

        labels = []
        D = list()

        label2fid = dict()

        # 1. make a classifier target for each *folder*, ignoring
        # subfolder structure
        FT = Folders(web_conf.kvlclient)
        for idx, fid in enumerate(FT.folders()):
            label2fid[idx] = fid
            for sid in FT.subfolders(fid):
                for cid, subtopic_id in FT.items(fid, sid):
                    fc = web_conf.store.get(cid)
                    if not fc:
                        # Without a feature collection there is no
                        # feature vector to train on.  Skip the item
                        # rather than pairing this label with a stale
                        # (or undefined) vector from an earlier item.
                        logger.warning(
                            'fid=%r, no feature collection for %r; '
                            'skipping', fid, cid)
                        continue
                    # NB: first call to make_feature
                    feat, _rejects, _keepers = make_feature(fc)
                    D.append(feat)
                    labels.append(idx)
                    logger.info('fid=%r, observation: %r', fid, cid)

        # 2. Convert the StringCounters into an sklearn format and
        # train MultinomialNB
        logger.info('transforming...')
        v = DictVectorizer(sparse=False)
        X_train = v.fit_transform(D)
        logger.info('transform fit done.')

        labels = np.array(labels)

        # Fit the sklearn multinomial Naive Bayes classifier
        clf = MultinomialNB()
        clf.fit(X_train, labels)
        logger.info('fit MultinomialNB')

        # 3. Scan the corpus up to max_sample putting the items into
        # each target to get an approx "size" of the Folder
        counts = Counter()
        for cid, fc in islice(web_conf.store.scan(), max_sample):
            # build the same feature vector as the training process
            feat, _rejects, _keepers = make_feature(fc)
            # predict which folder it belongs in; predict() takes the
            # 2-D (one row per sample) array that transform() returns
            target = clf.predict(v.transform([feat]))[0]
            # count the effective size of that folder in this sample
            counts[label2fid[target]] += 1

        logger.info('counts done')

        ## 4. Bootstrap by treating those classifier predictions as
        ## truth data and extract the learned features that are
        ## predictive as new query strings.
        clusters = []
        # Rows of feature_log_prob_ are ordered like clf.classes_ (the
        # sorted distinct training labels), which differs from the raw
        # folder index whenever a folder contributed no observations.
        for row, idx in enumerate(clf.classes_):
            logger.debug('considering cluster: %d', idx)
            try:
                log_probs = clf.feature_log_prob_[row].reshape(1, -1)
                all_features = v.inverse_transform(log_probs)[0]
            except Exception:
                # best effort: skip this cluster but keep the traceback
                logger.warning('beyond edge on cluster %d', idx,
                               exc_info=True)
                continue
            ordered = sorted(all_features.items(),
                             key=operator.itemgetter(1), reverse=True)
            filtered = []
            for it in ordered:
                if is_bad_token(it[0]):
                    continue

                # usernames are only reported, not dropped
                if is_username(it[0]):
                    logger.debug('%r is_username', it[0])
                filtered.append(it)
                if len(filtered) > 100:  # hard cutoff
                    break

            folder_id = label2fid[idx]
            folder_size = counts[folder_id]

            # `filtered` is empty when every token was rejected; the
            # cluster then carries only its caption entry.
            if filtered:
                # normalize cluster size exponentially
                biggest = exp(filtered[0][1])
                # rescale all by biggest
                filtered = [
                    (key, int(round(folder_size * exp(w) / biggest)))
                    for key, w in filtered]
            # describe what we just figured out
            logger.info('%s --> %r', folder_id,
                        ['%s: %d' % it for it in filtered[:10]])

            # build the JSON-serializable format for the
            # DragNet UI embedded inside SortingDesk
            cluster = [{'caption': folder_id,
                        'weight': folder_size,
                        'folder_id': None,
                        }]
            cluster += [{'caption': caption, 'weight': weight,
                         'folder_id': folder_id}
                        for caption, weight in filtered if weight > 0]
            clusters.append(cluster)

        # 5. Put the data in kvlayer for webservice end point to
        # return to polling client
        web_conf.kvlclient.setup_namespace({'dragnet': (str,)})
        web_conf.kvlclient.put('dragnet', (('dragnet',), json.dumps({'clusters': clusters})))
        return dict(counts)