def test_target_parsing():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    hyperlink_labels = _init_stage(
        'hyperlink_labels',
        dict(offset_types=['LINES'],
             require_abs_url=True,
             all_domains=True,
             ))
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    hyperlink_labels(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)
    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_handles(eval_data):
    for text, expected in eval_data:
        text = make_clean_html(text)
        text = make_clean_visible(text)
        sc = extract_user_names(text)
        assert set(sc) == set(expected)
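# A minimal sketch of the fixture shape test_handles expects: an iterable of
# (raw_html, expected_usernames) pairs. The sample markup and username below
# are hypothetical, not drawn from the real eval corpus.
import pytest

@pytest.fixture
def eval_data():
    return [
        (u'<p>contact <b>@alice_89</b> on twitter</p>', [u'alice_89']),
    ]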
def test_target_parsing(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)
    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_make_clean_html_nyt(test_data_dir, tmpdir):
    path = os.path.join(test_data_dir, 'test')
    generated_path = tmpdir.join('nytimes-index-clean.html')
    generated_path.open('wb').write(
        make_clean_html(
            open(os.path.join(path, 'nytimes-index.html')).read().decode('utf8')))
    # read back the file we just wrote into tmpdir (not the test-data dir)
    # and compare it against the checked-in stable copy
    generated = generated_path.open('rb').read()
    stable = open(os.path.join(path, 'nytimes-index-clean-stable.html')).read()
    assert generated == stable
    assert '<script' not in generated
def test_make_clean_html_nyt():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    open(os.path.join(path, 'nytimes-index-clean.html'), 'wb').write(
        make_clean_html(
            open(os.path.join(path, 'nytimes-index.html')).read().decode('utf8')))
    generated = open(os.path.join(path, 'nytimes-index-clean.html')).read()
    stable = open(os.path.join(path, 'nytimes-index-clean-stable.html')).read()
    assert generated == stable
    assert '<script' not in generated
def fill_more_slots(self, slots, text, save=False, phone=True, skype=True,
                    twitter=True, si=None):
    clean_html = make_clean_html(text, stream_item=si)
    text = make_clean_visible(clean_html)
    if save:
        open('foobar-%s.txt' % md5(text).hexdigest(), 'wb').write(text)

    # make sure every slot we might fill exists as a set
    for key in ['phone', 'phone_raw', 'Skype', 'Twitter', 'email', 'keywords']:
        if key not in slots:
            slots[key] = set()

    email_matches = list(email_matcher(text))
    if email_matches:
        email_match = email_matches[0][CANONICAL]
        slots['email'].add(email_match)

    if phone:
        phone_matches = list(phonenumber_matcher(text, country='US'))
        for phone_match in phone_matches:
            slots['phone'].add(phone_match[CANONICAL])
            slots['phone_raw'].add(phone_match[RAW])

    if skype:
        skype_matches = list(skype_matcher(text))
        for skype_match in skype_matches:
            slots['Skype'].add(skype_match[CANONICAL])

    if twitter:
        twitter_matches = list(twitter_matcher(text))
        for twitter_match in twitter_matches:
            slots['Twitter'].add(twitter_match[CANONICAL])

    for tok in text.split():  # assume non-CJK
        if prob_username(tok.lower(), self.char_unigrams,
                         self.char_bigrams) > 0.5:
            slots['keywords'].add(tok)

    # drop any slot that ended up empty; .items() returns a list in
    # Python 2, so popping while iterating is safe here
    for key, val in slots.items():
        if not val:
            slots.pop(key)

    return slots
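# Hypothetical usage sketch of fill_more_slots: `extractor` stands in for an
# instance of the enclosing class with char_unigrams/char_bigrams models
# already loaded, and the exact canonical forms returned depend on the
# matcher implementations.
#
# slots = extractor.fill_more_slots(
#     {}, u'<p>reach me at alice@example.com or (555) 123-4567</p>')
# assert 'email' in slots and 'phone' in slots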
def test_unicode_conversion(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)
    print unicode(html)

    visible = make_clean_visible(html)
    print type(visible)
    print visible.decode('utf8')
def test_unicode_conversion():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)
    print unicode(html)

    visible = make_clean_visible(html)
    print type(visible)
    print visible.decode('utf8')
def test_make_clean_html():
    ## the stray character before the "+" is intentionally malformed input
    test_bad_html = '''
<a href="http://birdingblogs.com/author/daleforbes">birdingblogs.com</a></div><div id="comments-template"><h3 id="comments">4 Responses to 󈬢+ Years of Digiscoping History.”</h3>'''

    correct_test_bad_html = '''<html><body>
<a href="http://birdingblogs.com/author/daleforbes">birdingblogs.com</a><div id="comments-template"><h3 id="comments">4 Responses to + Years of Digiscoping History.”</h3></div>
</body></html>
'''
    ## split things up around the "+" because different versions of
    ## lxml insert different numbers of spaces!
    correct_first_half, correct_second_half = correct_test_bad_html.split('+')
    correct_first_half = correct_first_half.strip()
    correct_second_half = correct_second_half.strip()

    cleaned = make_clean_html(test_bad_html)
    cleaned_first_half, cleaned_second_half = cleaned.split('+')
    cleaned_first_half = cleaned_first_half.strip()
    cleaned_second_half = cleaned_second_half.strip()

    #assert cleaned == correct_test_bad_html, cleaned
    assert cleaned_first_half == correct_first_half \
        and cleaned_second_half == correct_second_half, cleaned
def html_to_fc(html=None, clean_html=None, clean_visible=None, encoding=None,
               url=None, timestamp=None, other_features=None):
    '''`html` is expected to be a raw string received over the wire from
    a remote webserver, and `encoding`, if provided, is used to decode
    it. Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.
    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = ''
            clean_html = u''
    else:
        # keep a UTF-8 copy so clean_visible can be derived from it below
        clean_html_utf8 = clean_html.encode('utf-8')

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
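# A minimal usage sketch for html_to_fc (the URL and markup below are
# hypothetical). Note that it returns None when make_clean_html raises,
# so callers should check before indexing into the FeatureCollection.
#
# fc = html_to_fc(
#     html='<html><body><a href="http://example.com/~alice">alice</a>'
#          '</body></html>',
#     encoding='utf-8', url=u'http://example.com/')
# if fc is not None:
#     print fc[u'a_url_hostnames']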
# nb = Classifier('naivebayes')
# print 'is a (simply) a username? %r' % simple.classify(username)
# print 'is a (nb) a username? %r' % nb.classify(username)

parser = argparse.ArgumentParser()
parser.add_argument('positive',
                    help='File containing the positive test examples.')
parser.add_argument('--negative', default=None,
                    help='File with negative examples. '
                         'If omitted, will generate randomly.')
parser.add_argument('--test-text',
                    help=('name of entity for whom data has been saved: %r'
                          % usernames_with_saved_data))
args = yakonfig.parse_args(parser, [yakonfig, dblogger])

if args.test_text:
    for eg, tr in load_eval_data(args.test_text):
        eg = make_clean_html(eg)
        eg = make_clean_visible(eg)
        sc = extract_user_names(eg)
        found = set(sc)
        expected = set(tr)
        TP = found.intersection(expected)
        FN = expected - found
        FP = found - expected
        print('TP: \n\t%s' % '\n\t'.join(TP))
        print('\n\nFN: \n\t%s' % '\n\t'.join(FN))
        print('\n\nFP: \n\t%s' % '\n\t'.join(FP))
        # use float division: these counts are ints, and Python 2's `/`
        # truncates when both operands are integers
        P = float(len(TP)) / (len(TP) + len(FP))
        R = float(len(TP)) / (len(TP) + len(FN))
        F = 2 * P * R / (P + R)
        print('F=%.4f, P=%.4f, R=%.4f, TP=%d, FN=%d, FP=%d'
              % (F, P, R, len(TP), len(FN), len(FP)))
        #print sc