def make_hyperlink_labeled_test_stream_item():
    '''
    returns the test stream item after the hyperlink_labels stage and
    then the clean_visible stage have been run over it
    '''
    stage_context = {}
    stream_item = make_test_stream_item()
    assert len(stream_item.body.clean_html) > 200

    ## hyperlink_labels configured with require_abs_url, all_domains
    ## and BYTES offsets
    labeler_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    }
    labeler = hyperlink_labels(labeler_config)
    labeler(stream_item, stage_context)

    ## clean_visible runs after labeling, sharing the same context
    visible_stage = _init_stage('clean_visible', {})
    visible_stage(stream_item, stage_context)
    assert len(stream_item.body.clean_visible) > 200

    return stream_item
def test_basics(test_data_dir):
    """Check that BYTES and LINES offset modes label the same targets.

    The hyperlink_labels stage is run twice over freshly built test
    stream items, once per offset type.  Each run must produce "author"
    labels carrying only the requested offset type, and the set of
    target ids found by both runs must be equal.  The wall-clock time
    of each run is logged.

    :param test_data_dir: passed through to make_test_stream_item
    """
    start = time.time()
    ## run it with a byte regex
    si1 = make_test_stream_item(test_data_dir)
    context = {}
    hl1 = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": ["BYTES"],
        }
    )
    hl1(si1, context)
    elapsed_bytes = time.time() - start

    ## list() is required: on Python 3 keys() is a view object that
    ## never compares equal to a list
    assert list(si1.body.labels["author"][0].offsets.keys()) == [OffsetType.BYTES]

    ## run it with regex
    start = time.time()
    si2 = make_test_stream_item(test_data_dir)
    hl2 = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": ["LINES"],
        }
    )
    ## the same context dict is deliberately reused for the second run
    hl2(si2, context)
    elapsed_lines = time.time() - start

    assert list(si2.body.labels["author"][0].offsets.keys()) == [OffsetType.LINES]

    byte_labels = set()
    for annotator_id in si1.body.labels:
        for label in si1.body.labels[annotator_id]:
            assert OffsetType.BYTES in label.offsets
            byte_labels.add(label.target.target_id)

    line_labels = set()
    for annotator_id in si2.body.labels:
        for label in si2.body.labels[annotator_id]:
            assert OffsetType.LINES in label.offsets
            line_labels.add(label.target.target_id)

    assert line_labels == byte_labels
    logger.info("{:.5f} bytes, {:.5f} lines".format(elapsed_bytes, elapsed_lines))
def test_speed(parser_type, test_data_dir):
    """Log how many stream items per second hyperlink_labels processes.

    Ten stream items are built from the nytimes-index-clean.html
    fixture and pushed through one hyperlink_labels stage configured
    with ``parser_type`` as its only offset type.

    :param parser_type: value placed in the stage's ``offset_types`` list
    :param test_data_dir: directory containing the ``test`` fixture folder
    """
    path = os.path.join(test_data_dir, "test")
    ## read the fixture once and close the handle deterministically
    with open(os.path.join(path, "nytimes-index-clean.html")) as fixture:
        clean_html = fixture.read()

    stream_items = []
    for _ in range(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": [parser_type],
        }
    )
    for si in stream_items:
        hl(si, context)
    ## measured once, after the whole batch has been processed
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug("OffsetType: {}".format(OffsetType))
    logger.info("{:.1f} per second for {}".format(rate, parser_type))
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled

    Every stream item of the WEBLOG-100 test chunk has its labels and
    sentences cleared, is run through hyperlink_labels and then
    clean_visible, and is written to a new uniquely named ``.sc`` file
    in the system temporary directory.
    '''
    ## local import so this helper does not depend on the file's
    ## import block; gettempdir() replaces the hard-coded '/tmp'
    import tempfile

    tpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    try:
        dpath = os.path.dirname(__file__)
        ipath = os.path.join( dpath, _TEST_DATA_ROOT, 'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc' )

        cv = _init_stage('clean_visible', {})
        hl = hyperlink_labels(
            {'require_abs_url': True,
             'all_domains': True,
             'offset_types': ['BYTES']}
            )
        for si in Chunk(path=ipath):
            ## clear out existing labels and tokens
            si.body.labels = {}
            si.body.sentences = {}
            ## each stream item gets a fresh context
            context = {}
            hl(si, context)
            cv(si, context)
            o_chunk.add(si)
    finally:
        ## close the output chunk even if a stage raises
        o_chunk.close()

    return tpath
def test_long_doc(parser_type):
    '''
    runs hyperlink_labels over the company-test.html fixture using
    parser_type as the only offset type; passes if the stage does not
    raise
    '''
    path = os.path.dirname(__file__)
    path = os.path.join( path, _TEST_DATA_ROOT, 'test' )
    ## context manager closes the fixture file as soon as it is read
    with open(os.path.join(path, 'company-test.html')) as fixture:
        clean_html = fixture.read()

    stream_item = StreamItem()
    stream_item.body = ContentItem()
    stream_item.body.clean_html = clean_html

    context = {}
    ## run it with a byte state machine
    hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         ## will fail if set to bytes
         'offset_types': [parser_type]}
        )(stream_item, context)
Пример #6
0
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled

    Every stream item of the chunk at get_test_chunk_path() has its
    labels and sentences cleared, is run through hyperlink_labels and
    then the clean-visible stage, and is added to a new uniquely named
    ``.sc`` chunk created under ``tmpdir``.

    :param tmpdir: object with a ``join`` method used to build the
        output path (presumably pytest's ``tmpdir`` fixture)
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    try:
        ipath = get_test_chunk_path()

        hl = hyperlink_labels(config={
            'require_abs_url': True,
            'all_domains': True,
            'offset_types': [BYTES],
        })
        cv = make_clean_visible(config={})
        for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
            ## clear out existing labels and tokens
            si.body.labels = {}
            si.body.sentences = {}
            ## each stream item gets a fresh context
            context = {}
            hl(si, context)
            cv(si, context)
            o_chunk.add(si)
    finally:
        ## close once, after the loop: closing and returning inside the
        ## loop body wrote only the first item and returned None (with
        ## the chunk left open) when the input chunk was empty
        o_chunk.close()

    return tpath
def test_speed(parser_type):
    '''
    prints how many stream items per second hyperlink_labels processes
    when parser_type is its only offset type, using ten stream items
    built from the nytimes-index-clean.html fixture
    '''
    path = os.path.dirname(__file__)
    path = os.path.join( path, _TEST_DATA_ROOT, 'test' )
    ## read the fixture once and close the handle deterministically
    with open(os.path.join(path, 'nytimes-index-clean.html')) as fixture:
        clean_html = fixture.read()

    stream_items = []
    for _ in range(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append( stream_item )

    context = {}
    start = time.time()
    ## run it with a byte state machine; the stage is built once
    ## instead of being rebuilt for every stream item
    hl = hyperlink_labels(
        {'require_abs_url': True,
         'domain_substrings': ['nytimes.com'],
         'all_domains': False,
         'offset_types': [parser_type]}
        )
    for si in stream_items:
        hl(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    ## single-argument print(...) gives the same output on Python 2
    ## and Python 3
    print(OffsetType)
    print('\n\n%.1f per second for %s' % (rate, parser_type))
Пример #8
0
def test_speed(parser_type, test_data_dir):
    '''
    logs how many stream items per second hyperlink_labels processes
    when parser_type is its only offset type

    Ten stream items are built from the nytimes-index-clean.html
    fixture found in the ``test`` folder of test_data_dir.
    '''
    fixture_path = os.path.join(
        os.path.join(test_data_dir, 'test' ), 'nytimes-index-clean.html')
    ## one read, explicitly closed, instead of ten leaked handles
    with open(fixture_path) as fixture:
        clean_html = fixture.read()

    stream_items = []
    for _ in range(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append( stream_item )

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        hl(si, context)
    ## total elapsed time is taken once, after the loop finishes
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
Пример #9
0
def test_target_parsing(test_data_dir):
    '''
    checks that the strings 'logo' and 'target' are present in the
    clean_html built from target-test.html but absent from its
    clean_visible form, both before and after the hyperlink_labels
    stage (LINES offsets) has processed the clean_html
    '''
    path = os.path.join(test_data_dir, 'test')
    ## context manager closes the fixture file as soon as it is read
    with open(os.path.join(path, 'target-test.html')) as fixture:
        test_html = fixture.read()

    html = make_clean_html( test_html )

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible( html )

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage( si, context )
    ## re-derive the visible text from whatever the stage left in
    ## clean_html
    html2 = si.body.clean_html

    visible2 = make_clean_visible( html2 )

    assert 'target' not in visible2
    assert 'logo' not in visible2
Пример #10
0
def test_basics(test_data_dir):
    '''
    checks that hyperlink_labels finds the same target ids whether it
    records BYTES offsets or LINES offsets, and that the "author"
    labels of each run carry only the requested offset type; the
    wall-clock time of each run is logged
    '''
    def collect_target_ids(si, offset_type):
        ## gather target ids across all annotators, asserting that
        ## every label carries the expected offset type
        target_ids = set()
        for annotator_id in si.body.labels:
            for label in si.body.labels[annotator_id]:
                assert offset_type in label.offsets
                target_ids.add(label.target.target_id)
        return target_ids

    start = time.time()
    ## run it with a byte regex
    si1 = make_test_stream_item(test_data_dir)
    context = {}
    hl1 = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': ['BYTES'],
    })
    hl1(si1,context)
    elapsed_bytes = time.time() - start

    ## list() is required: on Python 3 keys() is a view object that
    ## never compares equal to a list
    assert list(si1.body.labels['author'][0].offsets.keys()) == [OffsetType.BYTES]

    ## run it with regex
    start = time.time()
    si2 = make_test_stream_item(test_data_dir)
    hl2 = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': ['LINES'],
    })
    hl2(si2,context)
    elapsed_lines = time.time() - start

    assert list(si2.body.labels['author'][0].offsets.keys()) == [OffsetType.LINES]

    byte_labels = collect_target_ids(si1, OffsetType.BYTES)
    line_labels = collect_target_ids(si2, OffsetType.LINES)

    assert line_labels == byte_labels
    logger.info('{:.5f} bytes, {:.5f} lines'
                .format(elapsed_bytes, elapsed_lines))
def test_long_doc(parser_type, test_data_dir):
    """Run hyperlink_labels over the company-test.html fixture.

    The test passes if the stage, configured with ``parser_type`` as
    its only offset type, processes the document without raising.

    :param parser_type: value placed in the stage's ``offset_types`` list
    :param test_data_dir: directory containing the ``test`` fixture folder
    """
    path = os.path.join(test_data_dir, "test")
    # Context manager closes the fixture file as soon as it is read.
    with open(os.path.join(path, "company-test.html")) as fixture:
        clean_html = fixture.read()

    stream_item = StreamItem()
    stream_item.body = ContentItem()
    stream_item.body.clean_html = clean_html

    context = {}
    hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [parser_type]})
    hl(stream_item, context)
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    """Return the test stream item after hyperlink labeling and clean_visible.

    :param test_data_dir: passed through to make_test_stream_item
    """
    stream_item = make_test_stream_item(test_data_dir)
    assert len(stream_item.body.clean_html) > 200

    # One context dict is shared by both stages.
    stage_context = {}

    labeler_config = {
        "require_abs_url": True,
        "all_domains": True,
        "offset_types": ["BYTES"],
    }
    labeler = hyperlink_labels(config=labeler_config)
    labeler(stream_item, stage_context)

    visible_stage = clean_visible(config={})
    visible_stage(stream_item, stage_context)
    assert len(stream_item.body.clean_visible) > 200

    return stream_item
def test_basics():
    '''
    checks that hyperlink_labels finds the same target ids whether it
    records BYTES offsets or LINES offsets, and that the "author"
    labels of each run carry only the requested offset type; the
    wall-clock time of each run is printed
    '''
    start = time.time()
    ## run it with a byte regex
    si1 = make_test_stream_item()
    context = {}
    hyperlink_labels(
        {'require_abs_url': True,
         'domain_substrings': ['nytimes.com'],
         'all_domains': False,
         'offset_types': ['BYTES']}
        )(si1, context)
    elapsed_bytes = time.time() - start

    ## list() is required: on Python 3 keys() is a view object that
    ## never compares equal to a list
    assert list(si1.body.labels['author'][0].offsets.keys()) == [OffsetType.BYTES]

    start = time.time()
    si2 = make_test_stream_item()
    ## run it with regex
    hyperlink_labels(
        {'require_abs_url': True,
         'domain_substrings': ['nytimes.com'],
         'all_domains': False,
         'offset_types': ['LINES']}
        )(si2, context)
    elapsed_lines = time.time() - start

    assert list(si2.body.labels['author'][0].offsets.keys()) == [OffsetType.LINES]

    byte_labels = set()
    for annotator_id in si1.body.labels:
        for label in si1.body.labels[annotator_id]:
            assert OffsetType.BYTES in label.offsets
            byte_labels.add(label.target.target_id)

    line_labels = set()
    for annotator_id in si2.body.labels:
        for label in si2.body.labels[annotator_id]:
            assert OffsetType.LINES in label.offsets
            line_labels.add(label.target.target_id)

    assert line_labels == byte_labels
    ## single-argument print(...) gives the same output on Python 2
    ## and Python 3
    print('\n\n%.5f bytes,\n %.5f lines' % (elapsed_bytes, elapsed_lines))
Пример #14
0
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    '''
    returns the test stream item built from test_data_dir after the
    hyperlink_labels stage and then the clean_visible stage have been
    run over it
    '''
    shared_context = {}
    item = make_test_stream_item(test_data_dir)
    assert len(item.body.clean_html) > 200

    ## hyperlink_labels configured with require_abs_url, all_domains
    ## and BYTES offsets
    label_stage = hyperlink_labels(
        config=dict([
            ('require_abs_url', True),
            ('all_domains', True),
            ('offset_types', ['BYTES']),
        ])
    )
    label_stage(item, shared_context)

    ## clean_visible runs second, on the same context
    visible_stage = clean_visible(config={})
    visible_stage(item, shared_context)
    assert len(item.body.clean_visible) > 200

    return item
Пример #15
0
def test_long_doc(parser_type, test_data_dir):
    '''
    runs hyperlink_labels over the company-test.html fixture using
    parser_type as the only offset type; passes if the stage does not
    raise
    '''
    path = os.path.join(test_data_dir, 'test' )
    ## context manager closes the fixture file as soon as it is read
    with open(os.path.join(path, 'company-test.html')) as fixture:
        clean_html = fixture.read()

    stream_item = StreamItem()
    stream_item.body = ContentItem()
    stream_item.body.clean_html = clean_html

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def make_hyperlink_labeled_test_chunk(tmpdir):
    """
    returns a path to a temporary chunk that has been hyperlink labeled

    Every stream item of the chunk at get_test_chunk_path() has its
    labels and sentences cleared, is run through hyperlink_labels and
    then the clean-visible stage, and is added to a new uniquely named
    ``.sc`` chunk created under ``tmpdir``.

    :param tmpdir: object with a ``join`` method used to build the
        output path (presumably pytest's ``tmpdir`` fixture)
    """
    tpath = tmpdir.join(str(uuid.uuid1()) + ".sc")
    o_chunk = Chunk(tpath, mode="wb")
    try:
        ipath = get_test_chunk_path()

        hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [BYTES]})
        cv = make_clean_visible(config={})
        for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
            ## clear out existing labels and tokens
            si.body.labels = {}
            si.body.sentences = {}
            ## each stream item gets a fresh context
            context = {}
            hl(si, context)
            cv(si, context)
            o_chunk.add(si)
    finally:
        ## close once, after the loop: closing and returning inside the
        ## loop body wrote only the first item and returned None (with
        ## the chunk left open) when the input chunk was empty
        o_chunk.close()

    return tpath