def test_target_parsing():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    hyperlink_labels = _init_stage(
        'hyperlink_labels',
        dict(offset_types=['LINES'],
             require_abs_url=True,
             all_domains=True,
             ))
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    hyperlink_labels(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)
    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_handles(eval_data):
    for text, expected in eval_data:
        text = make_clean_html(text)
        text = make_clean_visible(text)
        sc = extract_user_names(text)
        assert set(sc) == set(expected)
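# A minimal sketch of the fixture shape test_handles expects: an iterable of
# (raw_html, expected_usernames) pairs. The sample markup and username below
# are hypothetical, not drawn from the real eval corpus.
import pytest

@pytest.fixture
def eval_data():
    return [
        (u'<p>contact <b>@alice_89</b> on twitter</p>', [u'alice_89']),
    ]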
def test_target_parsing(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)
    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_make_clean_html_nyt(test_data_dir, tmpdir):
    path = os.path.join(test_data_dir, 'test')
    generated_path = tmpdir.join('nytimes-index-clean.html')
    generated_path.open('wb').write(
        make_clean_html(
            open(os.path.join(path, 'nytimes-index.html')).read().decode('utf8')))
    # read back the file we just wrote into tmpdir (not the test-data dir)
    # and compare it against the checked-in stable copy
    generated = generated_path.open('rb').read()
    stable = open(os.path.join(path, 'nytimes-index-clean-stable.html')).read()
    assert generated == stable
    assert '<script' not in generated
def test_make_clean_html_nyt():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    open(os.path.join(path, 'nytimes-index-clean.html'), 'wb').write(
        make_clean_html(
            open(os.path.join(path, 'nytimes-index.html')).read().decode('utf8')))
    generated = open(os.path.join(path, 'nytimes-index-clean.html')).read()
    stable = open(os.path.join(path, 'nytimes-index-clean-stable.html')).read()
    assert generated == stable
    assert '<script' not in generated
def fill_more_slots(self, slots, text, save=False, phone=True, skype=True,
                    twitter=True, si=None):
    clean_html = make_clean_html(text, stream_item=si)
    text = make_clean_visible(clean_html)
    if save:
        open('foobar-%s.txt' % md5(text).hexdigest(), 'wb').write(text)

    # make sure every slot we might fill exists as a set
    for key in ['phone', 'phone_raw', 'Skype', 'Twitter', 'email', 'keywords']:
        if key not in slots:
            slots[key] = set()

    email_matches = list(email_matcher(text))
    if email_matches:
        email_match = email_matches[0][CANONICAL]
        slots['email'].add(email_match)

    if phone:
        phone_matches = list(phonenumber_matcher(text, country='US'))
        for phone_match in phone_matches:
            slots['phone'].add(phone_match[CANONICAL])
            slots['phone_raw'].add(phone_match[RAW])

    if skype:
        skype_matches = list(skype_matcher(text))
        for skype_match in skype_matches:
            slots['Skype'].add(skype_match[CANONICAL])

    if twitter:
        twitter_matches = list(twitter_matcher(text))
        for twitter_match in twitter_matches:
            slots['Twitter'].add(twitter_match[CANONICAL])

    for tok in text.split():  # assume non-CJK
        if prob_username(tok.lower(), self.char_unigrams,
                         self.char_bigrams) > 0.5:
            slots['keywords'].add(tok)

    # drop any slot that ended up empty; .items() returns a list in
    # Python 2, so popping while iterating is safe here
    for key, val in slots.items():
        if not val:
            slots.pop(key)

    return slots
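# Hypothetical usage sketch of fill_more_slots: `extractor` stands in for an
# instance of the enclosing class with char_unigrams/char_bigrams models
# already loaded, and the exact canonical forms returned depend on the
# matcher implementations.
#
# slots = extractor.fill_more_slots(
#     {}, u'<p>reach me at alice@example.com or (555) 123-4567</p>')
# assert 'email' in slots and 'phone' in slots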
def test_unicode_conversion(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)
    print unicode(html)

    visible = make_clean_visible(html)
    print type(visible)
    print visible.decode('utf8')
def test_unicode_conversion():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)
    print unicode(html)

    visible = make_clean_visible(html)
    print type(visible)
    print visible.decode('utf8')
def test_make_clean_html():
    ## the stray character before the "+" is intentionally malformed input
    test_bad_html = '''
<a href="http://birdingblogs.com/author/daleforbes">birdingblogs.com</a></div><div id="comments-template"><h3 id="comments">4 Responses to 󈬢+ Years of Digiscoping History.”</h3>'''

    correct_test_bad_html = '''<html><body>
<a href="http://birdingblogs.com/author/daleforbes">birdingblogs.com</a><div id="comments-template"><h3 id="comments">4 Responses to + Years of Digiscoping History.”</h3></div>
</body></html>
'''
    ## split things up around the "+" because different versions of
    ## lxml insert different numbers of spaces!
    correct_first_half, correct_second_half = correct_test_bad_html.split('+')
    correct_first_half = correct_first_half.strip()
    correct_second_half = correct_second_half.strip()

    cleaned = make_clean_html(test_bad_html)
    cleaned_first_half, cleaned_second_half = cleaned.split('+')
    cleaned_first_half = cleaned_first_half.strip()
    cleaned_second_half = cleaned_second_half.strip()

    #assert cleaned == correct_test_bad_html, cleaned
    assert cleaned_first_half == correct_first_half \
        and cleaned_second_half == correct_second_half, cleaned
def html_to_fc(html=None, clean_html=None, clean_visible=None, encoding=None,
               url=None, timestamp=None, other_features=None):
    '''`html` is expected to be a raw string received over the wire from
    a remote webserver, and `encoding`, if provided, is used to decode
    it. Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.
    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = ''
            clean_html = u''
    else:
        # keep a UTF-8 copy so clean_visible can be derived from it below
        clean_html_utf8 = clean_html.encode('utf-8')

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
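# A minimal usage sketch for html_to_fc (the URL and markup below are
# hypothetical). Note that it returns None when make_clean_html raises,
# so callers should check before indexing into the FeatureCollection.
#
# fc = html_to_fc(
#     html='<html><body><a href="http://example.com/~alice">alice</a>'
#          '</body></html>',
#     encoding='utf-8', url=u'http://example.com/')
# if fc is not None:
#     print fc[u'a_url_hostnames']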
# nb = Classifier('naivebayes')
# print 'is a (simply) a username? %r' % simple.classify(username)
# print 'is a (nb) a username? %r' % nb.classify(username)

parser = argparse.ArgumentParser()
parser.add_argument('positive',
                    help='File containing the positive test examples.')
parser.add_argument('--negative', default=None,
                    help='File with negative examples. '
                         'If omitted, will generate randomly.')
parser.add_argument('--test-text',
                    help=('name of entity for whom data has been saved: %r'
                          % usernames_with_saved_data))
args = yakonfig.parse_args(parser, [yakonfig, dblogger])

if args.test_text:
    for eg, tr in load_eval_data(args.test_text):
        eg = make_clean_html(eg)
        eg = make_clean_visible(eg)
        sc = extract_user_names(eg)
        found = set(sc)
        expected = set(tr)
        TP = found.intersection(expected)
        FN = expected - found
        FP = found - expected
        print('TP: \n\t%s' % '\n\t'.join(TP))
        print('\n\nFN: \n\t%s' % '\n\t'.join(FN))
        print('\n\nFP: \n\t%s' % '\n\t'.join(FP))
        # use float division: these counts are ints, and Python 2's `/`
        # truncates when both operands are integers
        P = float(len(TP)) / (len(TP) + len(FP))
        R = float(len(TP)) / (len(TP) + len(FN))
        F = 2 * P * R / (P + R)
        print('F=%.4f, P=%.4f, R=%.4f, TP=%d, FN=%d, FP=%d'
              % (F, P, R, len(TP), len(FN), len(FP)))
        #print sc