def test_add_facets():
    '''add_facets inverts results into a facet -> [content_id] map.'''
    fc_one = FeatureCollection()
    fc_one['bowNP_sip'] = StringCounter([u'elephant', u'car'])
    fc_two = FeatureCollection()
    fc_two['bowNP_sip'] = StringCounter([u'car', u'green'])
    results = {'results': [('cid1', fc_one), ('cid2', fc_two)]}

    augmented = mod_pairwise.add_facets(results)

    assert 'facets' in augmented
    expected = {
        'elephant': ['cid1'],
        'car': ['cid1', 'cid2'],
        'green': ['cid2'],
    }
    assert augmented['facets'] == expected
def add_folder(self, folder_id, ann_id=None):
    '''Create a new folder.

    When ``ann_id`` is given, the folder belongs to that user.
    Otherwise the folder is owned by, and visible to, all anonymous
    users.

    :param str folder_id: Folder id
    :param str ann_id: Username
    '''
    self.assert_valid_folder_id(folder_id)
    owner = self._annotator(ann_id)
    content_id = self.wrap_folder_content_id(owner, folder_id)
    # A folder is represented as an empty feature collection keyed by
    # its wrapped content id.
    self.store.put([(content_id, FeatureCollection())])
    logger.info('Added folder %r with content id %r', folder_id, content_id)
def forum_post_features(row):
    '''Build a :class:`FeatureCollection` from a forum post ``row`` dict.'''
    fc = FeatureCollection()
    # Copy every author attribute under a 'post_author_' prefix.
    author = row['author']
    for key in author:
        fc['post_author_' + key] = author[key]
    # Count image URL occurrences, if any were extracted.
    if 'image_urls' in row:
        counts = StringCounter()
        for image_url in row['image_urls']:
            counts[image_url] += 1
        fc['image_url'] = counts
    # Remaining simple string fields are copied with a 'post_' prefix.
    for key in ('parent_id', 'thread_id', 'thread_link',
                'thread_name', 'title'):
        if key in row:
            fc['post_' + key] = uni(row[key])
    return fc
def test_vectorizable_features():
    '''Only Mapping-valued features are eligible for learning.

    sklearn can only vectorize features whose values are instances of
    ``collections.Mapping``; anything else must be filtered out.
    '''
    fc = FeatureCollection({
        u'yes': {'fubar': 1},
        u'no': u'haha',
    })
    assert mod_pairwise.vectorizable_features([fc]) == ['yes']
def test_fc_get(store):  # noqa
    '''A stored feature collection round-trips through the v1 GET route.'''
    dbid = visid_to_dbid('abc')
    store.put([(dbid, FeatureCollection({'foo': {'a': 1}}))])
    fetched = routes.v1_fc_get(dbid_to_visid, store, 'abc')
    assert fetched['foo']['a'] == 1
def test_random_no_name_index(store):  # noqa
    '''Smoke test: random search must not blow up without a NAME index.'''
    fc = FeatureCollection({u'NAME': {'bar': 1}})
    store.put([('foo', fc)])
    # We only care that this call completes without raising.
    search_engines.random(store).set_query_id('foo').results()
def html_to_fc(html=None, clean_html=None, clean_visible=None, encoding=None,
               url=None, timestamp=None, other_features=None):
    '''Convert raw or cleaned HTML into a :class:`FeatureCollection`.

    `html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    :param str html: raw HTML bytes, possibly in ``encoding``
    :param unicode clean_html: pre-cleaned HTML; skips the cleaning step
    :param clean_visible: pre-extracted visible text
    :param str encoding: character encoding of ``html``
    :param url: source URL, stored as the ``meta_url`` feature
    :param int timestamp: epoch milliseconds; defaults to "now"
    :param dict other_features: extra feature name -> counter mappings
        merged into the result
    :returns: a :class:`FeatureCollection`, or ``None`` if the document
        could not be cleaned (best-effort drop, logged).
    '''
    def add_feature(name, xs):
        # Lazily create the counter so repeated calls accumulate.
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            # Was a bare ``except:``; narrowed to Exception so that
            # SystemExit/KeyboardInterrupt still propagate.  The
            # deliberate best-effort behavior (log and drop the doc)
            # is preserved.
            except Exception:
                logger.warning('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    # Conditional expression instead of the fragile ``x and a or b``
    # idiom, which returns ``b`` whenever ``a`` is falsy.
    fc[u'meta_raw'] = uni(html, encoding) if html else u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    # Contact-style features extracted from the visible text.
    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))

    # Noun phrases, both normalized and unnormalized.
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    # Merge caller-supplied features last so they accumulate on top of
    # anything extracted above.
    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
def example_fc():
    '''Return a minimal feature collection holding the example text.'''
    initial = {u'meta_clean_visible': example_text}
    return FeatureCollection(initial)
# !!! IMPORTANT !!! # Define features that you want to index. This will let you quickly scan # for feature collections in the database with matching values. # # You don't have to index everything, but it's probably a good idea to index # the most prominent features. e.g., phone or email or website. # # These should correspond to the names of the corresponding features. feature_indexes = [u'phone', u'email', u'website', u'rate'] # Create a "store," which knows how to store and index feature collections. store = Store(conn, feature_indexes=feature_indexes) # Create a fresh feature collection and add a 'rate' feature. fc = FeatureCollection() fc['rate'] = StringCounter({ u'5per30': 5, u'5per60': 1, u'10per20': 2, }) # Content ids are the unique identifier for each feature collection. # It's probably sufficient to use whatever you have for "ad id." content_id = 'some_unique_value' store.put([(content_id, fc)]) print store.get(content_id) # Use the index scan! print list(store.index_scan_prefix(u'rate', '10'))
def counter_fc(bow):
    '''Wrap a bag-of-words counter as a single-feature collection.'''
    payload = {u'feature': bow}
    return FeatureCollection(payload)
def make_fc(text):
    '''Build a feature collection whose sole feature is the nilsimsa
    hash of ``text``.'''
    digest = StringCounter([nilsimsa_hash(text)])
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = digest
    return fc