def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` starts with
    ``text/html``, then a feature collection is generated from the
    HTML, stored, and returned as a JSON payload. In that case
    everything after the first ``|`` in ``content_id`` is URL-unquoted
    and used as the URL of the page.

    For a JSON body, a ``keywords`` feature is rebuilt before storage
    from the cleansed text of every ``text``/``manual`` subtopic and
    from the ids of the folders and subfolders that contain
    ``content_id``. The response status is then set to ``201`` and
    nothing is returned.

    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    # Collapse any falsy ``tfidf`` value to ``None`` before passing it on.
    tfidf = tfidf or None

    content_type = request.headers.get('content-type', '')
    if content_type.startswith('text/html'):
        # NOTE(review): raises IndexError when ``cid`` contains no ``|``;
        # presumably ids always look like ``<prefix>|<quoted url>`` here.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)

    fc = FeatureCollection.from_dict(json.load(request.body))
    keywords = set()
    for subid in fc:
        if not subid.startswith('subtopic'):
            continue
        if subtopic_type(subid) not in ('text', 'manual'):
            continue
        # The user selected string: index each of its words as well as
        # the whole cleansed string.
        cleansed = cleanse(typed_subtopic_data(fc, subid))
        keywords.update(cleansed.split())
        keywords.add(cleansed)

    # Folder and subfolder ids that contain this item are keywords too.
    folders = Folders(kvlclient)
    for fid, sid in folders.parent_subfolders(cid):
        if not isinstance(fid, unicode):
            fid = fid.decode('utf8')
        if not isinstance(sid, unicode):
            sid = sid.decode('utf8')
        keywords.add(cleanse(fid))
        keywords.add(cleanse(sid))

    fc[u'keywords'] = StringCounter(keywords)
    store.put([(cid, fc)])
    response.status = 201
class same_subfolder(web.Filter):
    '''Filter that drops results already filed next to the query.

    The predicate rejects every content id that is an item of a
    subfolder containing the query content id, along with every
    content id directly connected by a label to one of those items.
    '''

    def __init__(self, kvlclient, label_store):
        '''Remember the label store and open folders on ``kvlclient``.'''
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        '''Return a function over ``(content_id, fc)`` pairs.

        The function is true only for content ids that were *not*
        collected from the query's subfolders.
        '''
        # ``query_content_id`` is not assigned in this class; presumably
        # the ``web.Filter`` machinery sets it before this is called.
        parents = self.folders.parent_subfolders(self.query_content_id)
        excluded = set()
        for fid, sfid in parents:
            for item_cid, item_subid in self.folders.items(fid, sfid):
                excluded.add(item_cid)
                # Also add directly connected labels too.
                connected = self.label_store.directly_connected(
                    (item_cid, item_subid))
                excluded.update(lab.other(item_cid) for lab in connected)

        def keep(content_obj):
            content_id, _ = content_obj
            return content_id not in excluded

        return keep
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''

    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should
                               correspond to a feature collection in
                               the ``store``). If it doesn't, no
                               results are returned.
        :param subtopic_id: The query subtopic id. Together with
                            ``content_id`` it identifies the query
                            when labels are gathered.
        :param int canopy_limit: A limit on the number of results to
                                 return in the canopy (the initial
                                 index scan). This is meant to be a
                                 mechanism for resource control.
        :param int label_limit: A limit on the number of labels to use
                                in training. This is meant to be a
                                mechanism for resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit
        # Filled in by `train`. Starting with an empty dict lets
        # `as_result` report a weight of ``None`` instead of raising
        # AttributeError when no model has been trained yet.
        self.feature_weights = {}

    def as_result(self, cid, fc, p):
        '''Bundle a candidate with its probability and feature details.

        For every feature name shared by the query and ``fc`` a dict
        is produced holding the learned weight (``None`` if unknown)
        and, when both features are ``StringCounter`` values, their
        common non-empty keys and the value returned by ``cosine`` on
        their aligned counts (left as ``None`` when that value is NaN).

        ``self.query_fc`` must have been loaded (see `probabilities`).

        :param cid: candidate content id
        :param fc: candidate feature collection
        :param p: probability assigned to the candidate
        :rtype: ``(cid, fc, dict)`` where the dict has the keys
                ``probability`` and ``intermediate_model_results``
                (a list ordered by feature name)
        '''
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = {}
        for n in fnames:
            intermediates[n] = {
                'kernel': 'cosine',
                'feature1': n,
                'feature2': n,
                'kernel_value': None,
                'weight': self.feature_weights.get(n),
                'common_feature_values': [],
            }

        for n in fnames:
            qfeat, cfeat = self.query_fc[n], fc[n]
            if not isinstance(qfeat, StringCounter) \
                    or not isinstance(cfeat, StringCounter):
                continue
            vals = set(qfeat.keys()).intersection(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(filter(None, vals))

            # Align both counters on the union of their keys so the two
            # count vectors are comparable position by position.
            all_vals = sorted(set(qfeat.keys()).union(cfeat.keys()))
            if len(all_vals) > 0:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            # Ordered by feature name so the output is deterministic.
            'intermediate_model_results': [intermediates[n] for n in fnames],
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found or if the
        canopy query produces no candidates, an empty list is
        returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        :raises InsufficientTrainingData: if `train` could not build a
          model from the available labels
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if len(candidates) == 0:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        scores = self.classify(
            feature_names, classifier, transformer, candidates)
        return list(zip(candidates, scores))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        Returns a ``(feature_names, classifier, vectorizer)`` triple
        that `classify` accepts, and records the per-feature
        coefficients in ``self.feature_weights``.

        This method returns ``None`` if there is insufficient training
        data to produce a model, i.e. when ``idx_labels`` contains
        fewer than two distinct label values.

        :param content_objs: ``[(content_id, FeatureCollection)]``
        :param idx_labels: Ground truth data.
        :type idx_labels: list of ``({-1, 1}, index1, index2)`` where
                          the indices point into ``content_objs``
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set([lab[0] for lab in idx_labels])) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dicts.append(
                dict([(name, dis[name][i, j]) for name in feature_names]))

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        # NOTE(review): ``class_weight='auto'`` is the spelling accepted by
        # older scikit-learn releases (newer ones call it 'balanced');
        # confirm against the pinned version before changing it.
        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        # Assumes the vectorizer's column order matches `feature_names`
        # -- TODO confirm against `dict_vector`/`vectorizable_features`.
        self.feature_weights = dict([(name, model.coef_[0][i])
                                     for i, name in enumerate(feature_names)])
        return feature_names, model, vec

    def classify(self, feature_names, classifier, transformer, candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given. ``candidates`` must be a sized sequence of
        ``(content_id, FeatureCollection)`` because it is traversed
        once per feature. ``query_fc`` defaults to ``self.query_fc``.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG
        '''
        if query_fc is None:
            query_fc = self.query_fc

        sims = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            # One minus the cosine distance of every candidate to the query.
            sims[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        phi_dicts = transformer.transform(
            [dict([(name, sims[name][i]) for name in feature_names])
             for i in range(len(candidates))])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        '''Return candidate ``(content_id, fc)`` pairs for the query.

        Content ids are sampled from `canopy_ids` and fetched from the
        store; ids whose feature collection is missing are dropped.
        '''
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # The query itself never shows up here: `canopy_ids` blacklists
        # it. Only entries without a feature collection are removed.
        found = []
        for content_obj in self.store.get_many(ids):
            _, fc = content_obj
            if fc is not None:
                found.append(content_obj)
        return found

    def canopy_ids(self, limit_hint=None):
        '''Generate candidate content ids from the store's indexes.

        Every ``(index name, value)`` pair taken from the query's
        ``StringCounter`` features is scanned in turn, a small batch at
        a time, until none of them produces a new id. Each id is
        yielded at most once and the query content id is never
        yielded.

        :param limit_hint: used only to size the batches (one tenth of
                           it, rounded up); defaults to ``1000``
        '''
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        index_names = self.store.index_names()
        # Explicit integer ceiling, never below one: the result no longer
        # depends on which division semantics are active, and a small
        # hint can no longer produce a zero batch that yields nothing.
        batch_size = max(1, int(math.ceil(limit_hint / 10.0)))
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    # Resume after the ids this key has already produced.
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]'''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        '''Return sampled positive labels followed by negative ones.

        Both groups are passed through ``web.streaming_sample`` with
        ``limit`` and ``hard_limit(limit)``. The samples are reported
        through ``logger.debug`` rather than printed.
        '''
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component. They are all collected up front
        # (there are probably very few of them) and sampled afterwards.
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))

        logger.debug('POSITIVES\n%s', '\n'.join(map(repr, pos_sample)))
        logger.debug('NEGATIVES\n%s', '\n'.join(map(repr, neg_sample)))

        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        '''Generate positive labels implied by the query's subfolders.

        For each item filed in a subfolder that contains the query, a
        positive label between the query and that item is
        manufactured, followed by any explicit positive labels
        attached directly to that item's subtopic.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2,
                            Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive, subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        '''Generate negative labels for the query.

        Delegates to the module-level ``negative_subtopic_labels``.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]

        Fetches every content id mentioned by ``labels`` and keeps
        only those whose feature collection exists and has at least
        one mapping-valued feature.
        '''
        def is_valid_fc(content_obj):
            _, fc = content_obj
            if fc is None:
                return False
            return any(isinstance(fc[name], collections.Mapping)
                       for name in fc)

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return [content_obj for content_obj in self.store.get_many(ids)
                if is_valid_fc(content_obj)]
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''

    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should
                               correspond to a feature collection in
                               the ``store``). If it doesn't, no
                               results are returned.
        :param subtopic_id: The query subtopic id. Together with
                            ``content_id`` it identifies the query
                            when labels are gathered.
        :param int canopy_limit: A limit on the number of results to
                                 return in the canopy (the initial
                                 index scan). This is meant to be a
                                 mechanism for resource control.
        :param int label_limit: A limit on the number of labels to use
                                in training. This is meant to be a
                                mechanism for resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit
        # Filled in by `train`. Starting with an empty dict lets
        # `as_result` report a weight of ``None`` instead of raising
        # AttributeError when no model has been trained yet.
        self.feature_weights = {}

    def as_result(self, cid, fc, p):
        '''Bundle a candidate with its probability and feature details.

        For every feature name shared by the query and ``fc`` a dict
        is produced holding the learned weight (``None`` if unknown)
        and, when both features are ``StringCounter`` values, their
        common non-empty keys and the value returned by ``cosine`` on
        their aligned counts (left as ``None`` when that value is NaN).

        ``self.query_fc`` must have been loaded (see `probabilities`).

        :param cid: candidate content id
        :param fc: candidate feature collection
        :param p: probability assigned to the candidate
        :rtype: ``(cid, fc, dict)`` where the dict has the keys
                ``probability`` and ``intermediate_model_results``
                (a list ordered by feature name)
        '''
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = {}
        for n in fnames:
            intermediates[n] = {
                'kernel': 'cosine',
                'feature1': n,
                'feature2': n,
                'kernel_value': None,
                'weight': self.feature_weights.get(n),
                'common_feature_values': [],
            }

        for n in fnames:
            qfeat, cfeat = self.query_fc[n], fc[n]
            if not isinstance(qfeat, StringCounter) \
                    or not isinstance(cfeat, StringCounter):
                continue
            vals = set(qfeat.keys()).intersection(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(filter(None, vals))

            # Align both counters on the union of their keys so the two
            # count vectors are comparable position by position.
            all_vals = sorted(set(qfeat.keys()).union(cfeat.keys()))
            if len(all_vals) > 0:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            # Ordered by feature name so the output is deterministic.
            'intermediate_model_results': [intermediates[n] for n in fnames],
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found or if the
        canopy query produces no candidates, an empty list is
        returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        :raises InsufficientTrainingData: if `train` could not build a
          model from the available labels
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if len(candidates) == 0:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        scores = self.classify(
            feature_names, classifier, transformer, candidates)
        return list(zip(candidates, scores))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        Returns a ``(feature_names, classifier, vectorizer)`` triple
        that `classify` accepts, and records the per-feature
        coefficients in ``self.feature_weights``.

        This method returns ``None`` if there is insufficient training
        data to produce a model, i.e. when ``idx_labels`` contains
        fewer than two distinct label values.

        :param content_objs: ``[(content_id, FeatureCollection)]``
        :param idx_labels: Ground truth data.
        :type idx_labels: list of ``({-1, 1}, index1, index2)`` where
                          the indices point into ``content_objs``
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set([lab[0] for lab in idx_labels])) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dicts.append(
                dict([(name, dis[name][i, j]) for name in feature_names]))

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        # NOTE(review): ``class_weight='auto'`` is the spelling accepted by
        # older scikit-learn releases (newer ones call it 'balanced');
        # confirm against the pinned version before changing it.
        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        # Assumes the vectorizer's column order matches `feature_names`
        # -- TODO confirm against `dict_vector`/`vectorizable_features`.
        self.feature_weights = dict([(name, model.coef_[0][i])
                                     for i, name in enumerate(feature_names)])
        return feature_names, model, vec

    def classify(self, feature_names, classifier, transformer, candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given. ``candidates`` must be a sized sequence of
        ``(content_id, FeatureCollection)`` because it is traversed
        once per feature. ``query_fc`` defaults to ``self.query_fc``.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG
        '''
        if query_fc is None:
            query_fc = self.query_fc

        sims = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            # One minus the cosine distance of every candidate to the query.
            sims[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        phi_dicts = transformer.transform(
            [dict([(name, sims[name][i]) for name in feature_names])
             for i in range(len(candidates))])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        '''Return candidate ``(content_id, fc)`` pairs for the query.

        Content ids are sampled from `canopy_ids` and fetched from the
        store; ids whose feature collection is missing are dropped.
        '''
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # The query itself never shows up here: `canopy_ids` blacklists
        # it. Only entries without a feature collection are removed.
        found = []
        for content_obj in self.store.get_many(ids):
            _, fc = content_obj
            if fc is not None:
                found.append(content_obj)
        return found

    def canopy_ids(self, limit_hint=None):
        '''Generate candidate content ids from the store's indexes.

        Every ``(index name, value)`` pair taken from the query's
        ``StringCounter`` features is scanned in turn, a small batch at
        a time, until none of them produces a new id. Each id is
        yielded at most once and the query content id is never
        yielded.

        :param limit_hint: used only to size the batches (one tenth of
                           it, rounded up); defaults to ``1000``
        '''
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        index_names = self.store.index_names()
        # Explicit integer ceiling, never below one: the result no longer
        # depends on which division semantics are active, and a small
        # hint can no longer produce a zero batch that yields nothing.
        batch_size = max(1, int(math.ceil(limit_hint / 10.0)))
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    # Resume after the ids this key has already produced.
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]'''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        '''Return sampled positive labels followed by negative ones.

        Both groups are passed through ``web.streaming_sample`` with
        ``limit`` and ``hard_limit(limit)``. The samples are reported
        through ``logger.debug`` rather than printed.
        '''
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component. They are all collected up front
        # (there are probably very few of them) and sampled afterwards.
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))

        logger.debug('POSITIVES\n%s', '\n'.join(map(repr, pos_sample)))
        logger.debug('NEGATIVES\n%s', '\n'.join(map(repr, neg_sample)))

        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        '''Generate positive labels implied by the query's subfolders.

        For each item filed in a subfolder that contains the query, a
        positive label between the query and that item is
        manufactured, followed by any explicit positive labels
        attached directly to that item's subtopic.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2,
                            Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive, subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        '''Generate negative labels for the query.

        Delegates to the module-level ``negative_subtopic_labels``.
        '''
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]

        Fetches every content id mentioned by ``labels`` and keeps
        only those whose feature collection exists and has at least
        one mapping-valued feature.
        '''
        def is_valid_fc(content_obj):
            _, fc = content_obj
            if fc is None:
                return False
            return any(isinstance(fc[name], collections.Mapping)
                       for name in fc)

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return [content_obj for content_obj in self.store.get_many(ids)
                if is_valid_fc(content_obj)]