def test_usernames(url_or_path, username, count):
    """Check username extraction from a single counted URL or path.

    When `username` is None the case is a negative fixture and no
    assertion is made.
    """
    counted = StringCounter()
    counted[url_or_path] += count
    if username is None:
        return
    extracted = usernames(counted)
    assert extracted == StringCounter({username: count})
def test_a_urls():
    """Anchor hrefs are normalized (host case-folded, %XX decoded) and tallied."""
    html = '''
    <a href="http://ExAmPle.com/My Page.html">
    <a href="http://example.com/My%20Page.html">
    '''
    expected = StringCounter({'http://example.com/My Page.html': 2})
    assert StringCounter(features.a_urls(html)) == expected
def test_image_urls():
    """Image srcs are normalized (host case-folded, %XX decoded) and tallied."""
    html = '''
    <img src="http://ExAmPle.com/My Image.jpg">
    <img src="http://example.com/My%20Image.jpg">
    '''
    expected = StringCounter({'http://example.com/My Image.jpg': 2})
    assert StringCounter(features.image_urls(html)) == expected
def test_extract_emails():
    """Duplicate email addresses in free text are counted together."""
    txt = '''
    email: [email protected]
    email: [email protected]
    '''
    expected = StringCounter({'*****@*****.**': 2})
    assert StringCounter(features.emails(txt)) == expected
def test_host_names():
    """host_names aggregates per-URL counts by hostname."""
    counts = StringCounter()
    counts['http://www.example.com/folder1'] = 3
    counts['http://www.example.com/folder2'] = 2
    counts['http://www.different.com/folder2'] = 7
    expected = StringCounter({
        'www.example.com': 5,
        'www.different.com': 7,
    })
    assert features.host_names(counts) == expected
def test_path_dirs():
    """path_dirs sums counts per path segment, including the file name,
    and ignores the query string."""
    counts = StringCounter()
    counts['http://www.example.com/folder1/folder3/index.html?source=dummy'] = 3
    counts['http://www.example.com/folder2/folder1'] = 2
    counts['http://www.different.com/folder2'] = 7
    expected = StringCounter({
        'folder1': 5,
        'folder2': 9,
        'folder3': 3,
        'index.html': 3,
    })
    assert features.path_dirs(counts) == expected
def test_string_counter():
    """A StringCounter's (id, generation) pair changes on mutation, so
    the pair is usable as a cache key that goes stale when the counter
    is updated.
    """
    hc = StringCounter('this sentence has been parsed')
    h1 = (id(hc), hc.generation)
    # BUG FIX: `dict(h1='hi')` created the literal string key 'h1', not
    # the (id, generation) tuple, so the cache never actually held the
    # key under test.  Use the tuple itself as the key.
    cache = {h1: 'hi'}
    assert len(cache) == 1
    hc.update('more text')
    h2 = (id(hc), hc.generation)
    cache[h2] = 'hi again?'
    # Two distinct keys now: the generation must have advanced.
    assert len(cache) == 2
    assert h1 != h2
def test_extract_phones():
    """Phone numbers are normalized to digit-only strings before counting."""
    txt = '''
    Phone: 111-222-3333
    Phone: 1112223333
    Phone: 1-111-222-3333
    Phone: 11112223333
    Phone: 222-3333
    Phone: 2223333
    '''
    expected = StringCounter({
        '1112223333': 2,
        '11112223333': 2,
        '2223333': 2,
    })
    assert StringCounter(features.phones(txt)) == expected
def test_json_serializer():
    """A FeatureCollection round-trips through the registered JSON serializer."""
    with registry:
        registry.add('StringCounter', JsonSerializer)
        fc = FeatureCollection()
        fc['thing2'] = StringCounter(dict(hello='people'))
        fc['thing2']['another'] = 5
        fc['thing3'] = StringCounter(dict(hello='people2'))
        serialized = fc.dumps()
        restored = FeatureCollection.loads(serialized)
        assert restored['thing2']['another'] == 5
        assert restored['thing2']['hello'] == 'people'
        assert restored['thing3']['hello'] == 'people2'
def test_readonly(counter_type):
    """Every mutating operation on a read-only FeatureCollection must
    raise, and clearing the flag must restore in-place arithmetic."""
    def build():
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye')),
        })

    fc = build()
    other = build()
    fc.read_only = True
    with pytest.raises(ReadOnlyException):
        fc += other
    with pytest.raises(ReadOnlyException):
        fc -= other
    with pytest.raises(ReadOnlyException):
        fc *= 2
    with pytest.raises(ReadOnlyException):
        fc['woof'] = StringCounter()
    # Only counter types that themselves expose read_only propagate the
    # flag down to item-level mutation.
    if hasattr(counter_type, 'read_only'):
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] += 3
    fc.read_only = False
    fc += other
    assert Counter(map(abs, fc['hello'].values())) == Counter({2: 3, 4: 1})
    fc -= other
    fc -= other
    assert Counter(map(abs, fc['hello'].values())) == Counter()
def test_string_counter_serialize():
    """A StringCounter feature survives a dumps/loads round trip."""
    fc = FeatureCollection()
    fc['thing1'] = StringCounter()
    fc['thing1']['foo'] += 1
    blob = fc.dumps()
    restored = FeatureCollection.loads(blob)
    assert restored['thing1']['foo'] == 1
def build_feature(self, si):
    """Count tokens that the classifier flags as usernames.

    Walks every token of the item's nltk-tokenized sentences; tokens of
    three characters or fewer are skipped outright.
    """
    counts = StringCounter()
    for sentence in si.body.sentences['nltk_tokenizer']:
        for token in sentence.tokens:
            raw = token.token
            if len(raw) <= 3:
                # too short to be a meaningful username
                continue
            if self.classify(raw.decode('utf8')):
                # keyed by the raw (undecoded) token, as stored upstream
                counts[raw] += 1
    return counts
def add_sip_to_fc(fc, tfidf, limit=40):
    '''add "bowNP_sip" to `fc` using `tfidf` data

    No-op when `tfidf` is missing or `fc` has no "bowNP" feature.
    '''
    if tfidf is None or 'bowNP' not in fc:
        return
    sips = features.sip_noun_phrases(tfidf, fc['bowNP'].keys(), limit=limit)
    fc[u'bowNP_sip'] = StringCounter(sips)
def path_dirs(urls):
    '''
    Takes a StringCounter of normalized URL and parses them into a
    list of path directories. The file name is included in the path
    directory list.
    '''
    counts = StringCounter()
    for url, n in urls.items():
        # split drops empty segments from leading/trailing/double slashes
        for segment in urlparse(url).path.split('/'):
            if segment:
                counts[segment] += n
    return counts
def test_read_only_features():
    """Nested counters must reject writes while the collection is read-only."""
    fc = FeatureCollection({'feat': StringCounter({'foo': 1})})
    fc['feat']['foo'] += 1
    fc.read_only = True

    def bump():
        fc['feat']['foo'] += 1

    def remove():
        fc['feat'].pop('foo')

    def delete():
        del fc['feat']['foo']

    for mutation in (bump, remove, delete):
        with pytest.raises(ReadOnlyException):
            mutation()
def create_fc_from_html(url, html, encoding='utf-8', tfidf=None,
                        other_features=None):
    """Parse `html`, add title features, and build a FeatureCollection.

    NOTE(review): assumes soup_get yields a usable title string; verify
    behavior for documents without a <title> tag.
    """
    if encoding is not None:
        html = unicode(html, encoding)
    soup = BeautifulSoup(html, "lxml")
    title = soup_get(soup, 'title', lambda v: v.get_text())
    body = soup_get(soup, 'body', lambda v: v.prettify())
    if other_features is None:
        other_features = {}
    # Intentionally mutates a caller-supplied dict (original behavior).
    other_features[u'title'] = StringCounter([title])
    other_features[u'titleBow'] = StringCounter(title.split())
    fc = html_to_fc(body, url=url, other_features=other_features)
    if fc is None:
        return None
    if tfidf is not None:
        add_sip_to_fc(fc, tfidf)
    return fc
class JsonSerializer(StringCounter):
    """Serializer hooks that read/write StringCounter values as JSON.

    Registered against the 'StringCounter' feature type; only the static
    `loads`/`dumps`/`constructor` hooks are used.
    """

    def __init__(self):
        # Never instantiated directly; the class exists only for its hooks.
        raise NotImplementedError

    @staticmethod
    def loads(bytes):
        # Decode a JSON object string into a StringCounter.
        return StringCounter(json.loads(bytes))

    @staticmethod
    def dumps(sc):
        # A StringCounter is a dict subclass, so json can encode it directly.
        return json.dumps(sc)

    # Factory for an empty value of this feature type.
    constructor = staticmethod(lambda: StringCounter())
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    This endpoint returns status ``201`` upon successful storage
    otherwise. An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    # Normalize falsy tfidf values (e.g. False from config) to None.
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        # cid is expected to look like "<prefix>|<url>"; take the URL part.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        # Derive a keyword index from user-entered subtopic text plus the
        # names of any folders/subfolders containing this item.
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    # add each cleansed word and the whole cleansed phrase
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))
        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            # folder ids may come back as bytes; normalize to unicode
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))
        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
def host_names(urls):
    '''
    Takes a StringCounter of normalized URL and parses their hostnames

    N.B. this assumes that absolute URLs will begin with http:// in
    order to accurately resolve the host name. Relative URLs will not
    have host names.
    '''
    tally = StringCounter()
    for url, n in urls.items():
        tally[urlparse(url).netloc] += n
    return tally
def forum_post_features(row):
    """Map a forum-post row (dict) onto a FeatureCollection.

    Author attributes become `post_author_*` features, image URLs are
    counted under `image_url`, and a fixed set of scalar fields become
    `post_*` unicode features.
    """
    fc = FeatureCollection()
    author = row['author']
    for key in author:
        fc['post_author_' + key] = author[key]
    if 'image_urls' in row:
        counts = StringCounter()
        for image_url in row['image_urls']:
            counts[image_url] += 1
        fc['image_url'] = counts
    scalar_fields = ['parent_id', 'thread_id', 'thread_link',
                     'thread_name', 'title']
    for key in scalar_fields:
        if key in row:
            fc['post_' + key] = uni(row[key])
    return fc
def test_string_counter2():
    """In-place addition with a plain Counter must preserve the
    StringCounter type."""
    counter = StringCounter()
    counter['hi'] += 1
    assert isinstance(counter, StringCounter)
    counter += Counter('hi')
    assert isinstance(counter, StringCounter)
def loads(bytes):
    """Deserialize a JSON object string into a StringCounter."""
    decoded = json.loads(bytes)
    return StringCounter(decoded)
def make_fc(text):
    """Build a FeatureCollection holding only the nilsimsa hash of `text`."""
    digest = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([digest])
    return fc
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and negative)
    and the features wanted. And trains a Naive Bayes classifier on
    the underlying keys of the set of selected features features. If
    no features are selected, all are used.

    Returns two list of (keywords, strength) tuples ordered by
    strength. The first are feature keys that were predictive of the
    positive label and the second are the feature keys are were
    predictive of the negative label.

    ``*_fcs`` is the list of feature collections, positive label and
    negative label respectively.

    ``features`` designates which specific feature gets vectorized the
    other features are ignored. When ``None``, every feature of each
    collection is used (NOTE(review): this assumes all features are
    counter-valued; string-valued metadata features would fail to add).
    '''
    # Vector of labels: 1 for positives, 0 for negatives.
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()

        if not fc:
            logger.warn('how did we get an empty fc? %r', fc)
        else:
            # BUG FIX: the docstring promises "if no features are
            # selected, all are used", but iterating over None raised
            # TypeError; fall back to all of this collection's features.
            selected = features if features is not None else fc
            # The features used to pull the keys for the classifier
            for f in selected:
                feat += fc[f]

        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifer
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned features that are predictive of the positive
    # and negative class
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    # make lists ordered by their weight
    pos_ordered = sorted(pos_words.items(),
                         key=operator.itemgetter(1), reverse=True)
    neg_ordered = sorted(neg_words.items(),
                         key=operator.itemgetter(1), reverse=True)

    return pos_ordered, neg_ordered
def html_to_fc(html=None, clean_html=None, clean_visible=None, encoding=None,
               url=None, timestamp=None, other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it. Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    Returns a FeatureCollection, or None when cleaning the HTML fails.
    '''
    def add_feature(name, xs):
        # Fold the counts of xs into fc[name], creating it on demand.
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except:
                # best-effort ETL: any cleaning failure drops the document
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        # caller supplied clean_html already decoded; no utf-8 bytes kept
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        # Py2: promote byte strings to unicode
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    # contact-like features come from the visible text
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    # URL features come from the cleaned markup
    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
def add_feature(name, xs):
    """Fold the counts of `xs` into ``fc[name]``, creating the feature
    on demand.  Relies on `fc` from the enclosing scope."""
    additions = StringCounter(xs)
    if name not in fc:
        fc[name] = StringCounter()
    fc[name] += additions