def make_feature(fc):
    '''Builds a new `StringCounter` from the many `StringCounter`s in the
    input `fc`. This `StringCounter` will define one of the targets for
    the `MultinomialNB` classifier.

    This crucial function decides the relative importance of features
    extracted by the ETL pipeline. It is essentially a form of domain
    fitting that allows us to tune the extraction to the fields that are
    important to a domain. However, if the NER for a domain is
    inadequate, then the primary purpose of these relative weightings is
    to remove bogus NER extractions.
    '''
    feat = StringCounter()
    rejects = set()
    keepers = set()
    #keepers_keys = ['GPE', 'PERSON', 'ORGANIZATION', 'usernames']
    keepers_keys = ['phone', 'email']
    #['usernames', 'phone', 'email', 'ORGANIZATION', 'PERSON']
    rejects_keys = ['keywords', 'usernames', 'ORGANIZATION', 'PERSON']
    # The features used to pull the keys for the classifier, and the
    # relative strength assigned to each.
    for f, strength in [('keywords', 10**4), ('GPE', 1), ('bow', 1),
                        ('bowNP_sip', 10**8), ('phone', 10**12),
                        ('email', 10**12), ('bowNP', 10**3),
                        ('PERSON', 10**8), ('ORGANIZATION', 10**6),
                        ('usernames', 10**12)]:
        if strength == 1:
            feat += fc[f]
        else:
            feat += StringCounter({key: strength * count
                                   for key, count in fc[f].items()})
        if f in rejects_keys:
            rejects.update(fc[f])
        if f in keepers_keys:
            keepers.update(fc[f])
    if u'' in feat:
        feat.pop(u'')
    return feat, rejects, keepers
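# A minimal usage sketch for `make_feature`, in the same test style as
# the rest of this file. It assumes `FeatureCollection` yields an empty
# StringCounter for features that are absent (which is how the other
# functions here use it); the expected weights follow the strength
# table above.
def test_make_feature_sketch():
    fc = FeatureCollection()
    fc['phone'] = StringCounter({u'1112223333': 1})
    fc['keywords'] = StringCounter({u'carpet': 2})
    feat, rejects, keepers = make_feature(fc)
    assert feat[u'1112223333'] == 10**12    # 'phone' strength is 10**12
    assert feat[u'carpet'] == 2 * 10**4     # 'keywords' strength is 10**4
    assert keepers == set([u'1112223333'])  # 'phone' is in keepers_keys
    assert rejects == set([u'carpet'])      # 'keywords' is in rejects_keys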
def test_usernames(url_or_path, username, count):
    urls = StringCounter()
    urls[url_or_path] += count
    if username is not None:
        results = usernames(urls)
        assert results == StringCounter({username: count})
def test_image_urls():
    html = '''
    <img src="http://ExAmPle.com/My Image.jpg">
    <img src="http://example.com/My%20Image.jpg">
    '''
    assert StringCounter(features.image_urls(html)) == StringCounter({
        'http://example.com/My Image.jpg': 2,
    })
def test_extract_emails():
    txt = '''
    email: [email protected]
    email: [email protected]
    '''
    assert StringCounter(features.emails(txt)) == StringCounter({
        '*****@*****.**': 2,
    })
def test_a_urls():
    html = '''
    <a href="http://ExAmPle.com/My Page.html">
    <a href="http://example.com/My%20Page.html">
    '''
    assert StringCounter(features.a_urls(html)) == StringCounter({
        'http://example.com/My Page.html': 2,
    })
def test_host_names():
    urls = StringCounter()
    urls['http://www.example.com/folder1'] = 3
    urls['http://www.example.com/folder2'] = 2
    urls['http://www.different.com/folder2'] = 7
    assert features.host_names(urls) == StringCounter({
        'www.example.com': 5,
        'www.different.com': 7,
    })
def test_path_dirs():
    urls = StringCounter()
    urls['http://www.example.com/folder1/folder3/index.html?source=dummy'] = 3
    urls['http://www.example.com/folder2/folder1'] = 2
    urls['http://www.different.com/folder2'] = 7
    assert features.path_dirs(urls) == StringCounter({
        'folder1': 5,
        'folder2': 9,
        'folder3': 3,
        'index.html': 3,
    })
def test_string_counter():
    hc = StringCounter('this sentence has been parsed')
    # Key the cache on (id, generation) so that mutating the counter
    # invalidates the old cache entry.
    h1 = (id(hc), hc.generation)
    cache = {h1: 'hi'}
    assert len(cache) == 1
    hc.update('more text')
    h2 = (id(hc), hc.generation)
    cache[h2] = 'hi again?'
    assert len(cache) == 2
    assert h1 != h2
def test_extract_phones():
    txt = '''
    Phone: 111-222-3333
    Phone: 1112223333
    Phone: 1-111-222-3333
    Phone: 11112223333
    Phone: 222-3333
    Phone: 2223333
    '''
    assert StringCounter(features.phones(txt)) == StringCounter({
        '1112223333': 2,
        '11112223333': 2,
        '2223333': 2,
    })
def test_add_facets():
    cid1 = 'cid1'
    fc1 = FeatureCollection()
    fc1['bowNP_sip'] = StringCounter([u'elephant', u'car'])
    cid2 = 'cid2'
    fc2 = FeatureCollection()
    fc2['bowNP_sip'] = StringCounter([u'car', u'green'])
    fake_results = {'results': [(cid1, fc1), (cid2, fc2)]}
    new_results = mod_pairwise.add_facets(fake_results)
    assert 'facets' in new_results
    assert new_results['facets'] == {
        'elephant': [cid1],
        'car': [cid1, cid2],
        'green': [cid2],
    }
def add_sip_to_fc(fc, tfidf, limit=40):
    '''Add a "bowNP_sip" feature to `fc` using the `tfidf` data.'''
    if 'bowNP' not in fc:
        return
    if tfidf is None:
        return
    sips = features.sip_noun_phrases(tfidf, fc['bowNP'].keys(), limit=limit)
    fc[u'bowNP_sip'] = StringCounter(sips)
def path_dirs(urls):
    '''Takes a StringCounter of normalized URLs and parses each into a
    list of path directories. The file name is included in the path
    directory list.
    '''
    path_dirs = StringCounter()
    for url in urls:
        for path_dir in filter(None, urlparse(url).path.split('/')):
            path_dirs[path_dir] += urls[url]
    return path_dirs
def create_fc_from_html(url, html, encoding='utf-8', tfidf=None,
                        other_features=None):
    if encoding is not None:
        html = unicode(html, encoding)
    soup = BeautifulSoup(html, 'lxml')
    title = soup_get(soup, 'title', lambda v: v.get_text())
    body = soup_get(soup, 'body', lambda v: v.prettify())
    if other_features is None:
        other_features = {}
    other_features.update({
        u'title': StringCounter([title]),
        u'titleBow': StringCounter(title.split()),
    })
    fc = html_to_fc(body, url=url, other_features=other_features)
    if fc is None:
        return None
    if tfidf is not None:
        add_sip_to_fc(fc, tfidf)
    return fc
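# A usage sketch for `create_fc_from_html`, assuming a small raw UTF-8
# HTML byte string and no tf-idf model (so `add_sip_to_fc` is skipped):
def example_create_fc_from_html():
    html = ('<html><head><title>Example</title></head>'
            '<body>Phone: 111-222-3333</body></html>')
    fc = create_fc_from_html('http://example.com/', html)
    if fc is not None:
        print fc[u'title']
        print fc[u'phone']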
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body,
    serialized as JSON.

    Alternatively, if the request's ``Content-Type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    Otherwise, this endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    keywords.update(cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
    response.status = 201
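# A client-side usage sketch for this endpoint. It assumes the service
# listens on http://localhost:8080 (host, port, and the content id are
# illustrative only) and that `FeatureCollection.to_dict` is the
# counterpart of the `from_dict` used above. Uses the third-party
# `requests` library.
def example_put_fc(fc, cid='some_unique_value'):
    import json
    import requests
    url = 'http://localhost:8080/dossier/v1/feature-collections/' + cid
    resp = requests.put(url, data=json.dumps(fc.to_dict()),
                        headers={'Content-Type': 'application/json'})
    assert resp.status_code == 201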
def forum_post_features(row):
    fc = FeatureCollection()
    for k in row['author']:
        fc['post_author_' + k] = row['author'][k]
    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1
    others = ['parent_id', 'thread_id', 'thread_link', 'thread_name',
              'title']
    for k in others:
        if k in row:
            fc['post_' + k] = uni(row[k])
    return fc
def host_names(urls):
    '''Takes a StringCounter of normalized URLs and parses out their
    host names.

    N.B. This assumes that absolute URLs begin with http:// in order to
    accurately resolve the host name. Relative URLs will not have host
    names.
    '''
    host_names = StringCounter()
    for url in urls:
        host_names[urlparse(url).netloc] += urls[url]
    return host_names
def usernames(urls):
    '''Takes a StringCounter of normalized URLs or file paths and
    attempts to extract usernames. Returns a StringCounter.
    '''
    usernames = StringCounter()
    for url, count in urls.items():
        uparse = urlparse(url)
        path = uparse.path
        hostname = uparse.hostname
        m = username_re.match(path)
        if m:
            usernames[m.group('username')] += count
        elif hostname in ['twitter.com', 'www.facebook.com']:
            usernames[path.lstrip('/')] += count
    return usernames
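# A usage sketch for `usernames`, exercising the hostname branch above.
# It assumes the module-level `username_re` does not already match a
# bare '/alice' path, so extraction falls through to the twitter.com
# rule:
def test_usernames_twitter_sketch():
    urls = StringCounter({'https://twitter.com/alice': 3})
    assert usernames(urls) == StringCounter({'alice': 3})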
def callback(si, link):
    if si is None:
        return
    cid_url = hashlib.md5(str(link)).hexdigest()
    cid = etl.interface.mk_content_id(cid_url)
    content_ids.append(cid)

    # hack alert!
    # We currently use FCs to store subtopic text data, which means we
    # cannot overwrite existing FCs with reckless abandon. So we adopt
    # a heuristic: check if an FC already exists, and if it does, check
    # whether it is being used to store user data. If so, don't
    # overwrite it and move on.
    fc = config.store.get(cid)
    if fc is not None and any(k.startswith('subtopic|')
                              for k in fc.iterkeys()):
        logger.info('skipping ingest for %r (abs url: %r) because '
                    'an FC with user data already exists.', cid, link)
        return

    other_features = {
        u'keywords': StringCounter(keywords),  #list(queries)),
    }
    try:
        fc = etl.create_fc_from_html(
            link, si.body.raw,
            encoding=si.body.encoding or 'utf-8',
            tfidf=tfidf,
            other_features=other_features,
        )
        if not fc:
            logger.info('failed to get an FC, moving on')
            return
        logger.info('created FC for %r (abs url: %r)', cid, link)
        config.store.put([(cid, fc)])
    except Exception:
        logger.info('trapped ingest failure on %r (abs url: %r)',
                    cid, link, exc_info=True)
def get_feat(fc, name):
    if len(fc[name]) == 0:
        return StringCounter({'': 0})
    else:
        return fc[name]
# !!! IMPORTANT !!!
# Define the features that you want to index. This will let you quickly
# scan for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to
# index the most prominent features, e.g., phone, email or website.
#
# These should correspond to the names of the features you store.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature
# collections.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# Content ids are the unique identifiers for feature collections.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
print store.get(content_id)

# Use the index scan!
print list(store.index_scan_prefix(u'rate', '10'))
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and
    negative) and trains a Naive Bayes classifier on the underlying
    keys of the selected features. If no features are selected, all
    are used.

    Returns two lists of (keyword, strength) tuples ordered by
    strength. The first contains the feature keys that were predictive
    of the positive label, and the second contains the feature keys
    that were predictive of the negative label.

    ``*_fcs`` are the lists of feature collections, positive label and
    negative label respectively.

    ``features`` designates which specific features get vectorized;
    the other features are ignored.
    '''
    # Vector of labels
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()
        if not fc:
            logger.warn('how did we get an empty fc? %r', fc)
        else:
            # The features used to pull the keys for the classifier.
            # Per the docstring, fall back to all features when none
            # are selected.
            for f in (features if features is not None else fc.keys()):
                feat += fc[f]
        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned features that are predictive of the positive
    # and negative class
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    ## make lists ordered by their weight
    pos_ordered = sorted(pos_words.items(), key=operator.itemgetter(1),
                         reverse=True)
    neg_ordered = sorted(neg_words.items(), key=operator.itemgetter(1),
                         reverse=True)
    return pos_ordered, neg_ordered
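# A usage sketch for `extract`, using two tiny hand-built feature
# collections per class (real inputs would come from the store):
def example_extract():
    pos = FeatureCollection()
    pos[u'bow'] = StringCounter({u'buy': 3, u'now': 1})
    neg = FeatureCollection()
    neg[u'bow'] = StringCounter({u'hello': 2})
    pos_kw, neg_kw = extract([pos, pos], [neg, neg], features=[u'bow'])
    # Each list holds (keyword, weight) pairs, strongest first.
    print pos_kw[:3]
    print neg_kw[:3]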
def add_feature(name, xs):
    # N.B. `fc` is taken from the enclosing scope.
    if name not in fc:
        fc[name] = StringCounter()
    fc[name] += StringCounter(xs)
def make_fc(text):
    nhash = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([nhash])
    return fc
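# A usage sketch for `make_fc`. It assumes `nilsimsa_hash` returns the
# hex digest of the Nilsimsa locality-sensitive hash, so near-duplicate
# texts produce hashes that differ in only a few bits:
def example_make_fc():
    fc1 = make_fc(u'the quick brown fox jumps over the lazy dog')
    fc2 = make_fc(u'the quick brown fox jumped over the lazy dog')
    # Each FC holds exactly one hash key under '#nilsimsa_all'.
    print fc1['#nilsimsa_all'].keys()[0]
    print fc2['#nilsimsa_all'].keys()[0]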
def html_to_fc(html=None, clean_html=None, clean_visible=None,
               encoding=None, url=None, timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire
    from a remote webserver, and `encoding`, if provided, is used to
    decode it. Typically, the encoding comes from the Content-Type
    header field. The
    :func:`~streamcorpus_pipeline._clean_html.make_clean_html` function
    handles character encodings.
    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
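# A usage sketch for `html_to_fc`, assuming a small raw HTML string;
# the resulting FC carries both metadata and extracted features:
def example_html_to_fc():
    html = ('<html><body>Phone: 111-222-3333 '
            '<a href="http://example.com/a">a</a></body></html>')
    fc = html_to_fc(html=html, encoding='utf-8', url='http://example.com/')
    if fc is not None:
        print fc[u'phone']
        print fc[u'a_url_hostnames']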