def make_hyperlink_labeled_test_stream_item():
    '''build a test stream item, run hyperlink labeling over it, and
    then extract its clean_visible text'''
    context = {}
    si = make_test_stream_item()
    assert len(si.body.clean_html) > 200
    labeler = hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': ['BYTES']}
    )
    labeler(si, context)
    visible_stage = _init_stage('clean_visible', {})
    visible_stage(si, context)
    assert len(si.body.clean_visible) > 200
    return si
def test_basics(test_data_dir):
    '''hyperlink labeling with BYTES and with LINES offsets must find
    the same set of link targets in the test document'''
    base_config = {
        "require_abs_url": True,
        "all_domains": False,
        "domain_substrings": ["nytimes.com"],
    }
    context = {}

    ## run it with a byte regex
    t0 = time.time()
    si1 = make_test_stream_item(test_data_dir)
    hyperlink_labels(config=dict(base_config, offset_types=["BYTES"]))(si1, context)
    elapsed_bytes = time.time() - t0
    assert si1.body.labels["author"][0].offsets.keys() == [OffsetType.BYTES]

    ## run it with regex
    t0 = time.time()
    si2 = make_test_stream_item(test_data_dir)
    hyperlink_labels(config=dict(base_config, offset_types=["LINES"]))(si2, context)
    elapsed_lines = time.time() - t0
    assert si2.body.labels["author"][0].offsets.keys() == [OffsetType.LINES]

    ## both passes must agree on the set of labeled targets
    byte_labels = set()
    for labels in si1.body.labels.values():
        for label in labels:
            assert OffsetType.BYTES in label.offsets
            byte_labels.add(label.target.target_id)

    line_labels = set()
    for labels in si2.body.labels.values():
        for label in labels:
            assert OffsetType.LINES in label.offsets
            line_labels.add(label.target.target_id)

    assert line_labels == byte_labels
    logger.info("{:.5f} bytes, {:.5f} lines".format(elapsed_bytes, elapsed_lines))
def test_speed(parser_type, test_data_dir):
    '''measure hyperlink labeling throughput over ten copies of a
    cached nytimes index page'''
    path = os.path.join(test_data_dir, "test")
    ## read the fixture once with the handle closed promptly, instead
    ## of leaking ten open file objects via open(...).read()
    with open(os.path.join(path, "nytimes-index-clean.html")) as f:
        clean_html = f.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": [parser_type],
        }
    )
    for si in stream_items:
        si = hl(si, context)
    elapsed = time.time() - start
    rate = len(stream_items) / elapsed
    logger.debug("OffsetType: {}".format(OffsetType))
    logger.info("{:.1f} per second for {}".format(rate, parser_type))
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    dpath = os.path.dirname(__file__)
    ipath = os.path.join(
        dpath, _TEST_DATA_ROOT,
        'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc'
    )
    visible_stage = _init_stage('clean_visible', {})
    labeler = hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': ['BYTES']}
    )
    for si in Chunk(path=ipath):
        ## drop any labels/tokens left over from earlier processing
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        labeler(si, context)
        visible_stage(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def test_long_doc(parser_type):
    '''hyperlink labeling must process a long document without error'''
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    ## use a context manager so the fixture file handle is closed
    ## instead of leaked by open(...).read()
    with open(os.path.join(path, 'company-test.html')) as f:
        stream_item.body.clean_html = f.read()

    context = {}
    ## run it with a byte state machine
    hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         ## will fail if set to bytes
         'offset_types': [parser_type]}
    )(stream_item, context)
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    ipath = get_test_chunk_path()
    label_links = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    extract_visible = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## drop any labels/tokens left over from earlier processing
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        label_links(si, context)
        extract_visible(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def test_speed(parser_type): stream_items = [] for i in xrange(10): stream_item = StreamItem() stream_item.body = ContentItem() path = os.path.dirname(__file__) path = os.path.join( path, _TEST_DATA_ROOT, 'test' ) stream_item.body.clean_html = open( os.path.join(path, 'nytimes-index-clean.html')).read() stream_items.append( stream_item ) context = {} start = time.time() ## run it with a byte state machine for si in stream_items: si = hyperlink_labels( {'require_abs_url': True, 'domain_substrings': ['nytimes.com'], 'all_domains': False, 'offset_types': [parser_type]} )(si, context) elapsed = time.time() - start rate = len(stream_items) / elapsed print OffsetType print '\n\n%.1f per second for %s' % (rate, parser_type)
def test_speed(parser_type, test_data_dir):
    '''measure hyperlink labeling throughput over ten copies of a
    cached nytimes index page'''
    path = os.path.join(test_data_dir, 'test')
    ## read the fixture once with the handle closed promptly, instead
    ## of leaking ten open file objects via open(...).read()
    with open(os.path.join(path, 'nytimes-index-clean.html')) as f:
        clean_html = f.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        si = hl(si, context)
    elapsed = time.time() - start
    rate = len(stream_items) / elapsed
    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
def test_target_parsing(test_data_dir):
    '''anchor href/target markup must be removed from clean_visible
    output, both before and after hyperlink labeling'''
    path = os.path.join(test_data_dir, 'test')
    ## use a context manager so the fixture file handle is closed
    ## instead of leaked by open(...).read()
    with open(os.path.join(path, 'target-test.html')) as f:
        test_html = f.read()

    html = make_clean_html( test_html )
    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible( html )
    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage( si, context )
    ## the labeled clean_html must still reduce to link-free visible text
    html2 = si.body.clean_html
    visible2 = make_clean_visible( html2 )
    assert 'target' not in visible2
    assert 'logo' not in visible2
def test_basics(test_data_dir):
    '''BYTES and LINES hyperlink labeling must agree on the set of
    link targets found in the test document'''
    base_config = {
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
    }
    context = {}

    ## run it with a byte regex
    started = time.time()
    si1 = make_test_stream_item(test_data_dir)
    hyperlink_labels(config=dict(base_config, offset_types=['BYTES']))(si1, context)
    elapsed_bytes = time.time() - started
    assert si1.body.labels['author'][0].offsets.keys() == [OffsetType.BYTES]

    ## run it with regex
    started = time.time()
    si2 = make_test_stream_item(test_data_dir)
    hyperlink_labels(config=dict(base_config, offset_types=['LINES']))(si2, context)
    elapsed_lines = time.time() - started
    assert si2.body.labels['author'][0].offsets.keys() == [OffsetType.LINES]

    ## both passes must have labeled the same targets
    byte_labels = set()
    for labels in si1.body.labels.values():
        for label in labels:
            assert OffsetType.BYTES in label.offsets
            byte_labels.add(label.target.target_id)

    line_labels = set()
    for labels in si2.body.labels.values():
        for label in labels:
            assert OffsetType.LINES in label.offsets
            line_labels.add(label.target.target_id)

    assert line_labels == byte_labels
    logger.info('{:.5f} bytes, {:.5f} lines'
                .format(elapsed_bytes, elapsed_lines))
def test_long_doc(parser_type, test_data_dir):
    """hyperlink labeling must process a long document without error"""
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, "test")
    ## use a context manager so the fixture file handle is closed
    ## instead of leaked by open(...).read()
    with open(os.path.join(path, "company-test.html")) as f:
        stream_item.body.clean_html = f.read()
    context = {}
    hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [parser_type]})
    hl(stream_item, context)
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    """build a test stream item, hyperlink-label it, and extract its
    clean_visible text"""
    si = make_test_stream_item(test_data_dir)
    assert len(si.body.clean_html) > 200
    context = {}
    labeler = hyperlink_labels(config={
        "require_abs_url": True,
        "all_domains": True,
        "offset_types": ["BYTES"],
    })
    labeler(si, context)
    visible_stage = clean_visible(config={})
    visible_stage(si, context)
    assert len(si.body.clean_visible) > 200
    return si
def test_basics(): start = time.time() ## run it with a byte regex si1 = make_test_stream_item() context = {} hyperlink_labels( {'require_abs_url': True, 'domain_substrings': ['nytimes.com'], 'all_domains': False, 'offset_types': ['BYTES']} )(si1, context) elapsed_bytes = time.time() - start assert si1.body.labels['author'][0].offsets.keys() == [OffsetType.BYTES] start = time.time() si2 = make_test_stream_item() ## run it with regex hyperlink_labels( {'require_abs_url': True, 'domain_substrings': ['nytimes.com'], 'all_domains': False, 'offset_types': ['LINES']} )(si2, context) elapsed_lines = time.time() - start assert si2.body.labels['author'][0].offsets.keys() == [OffsetType.LINES] byte_labels = set() for annotator_id in si1.body.labels: for label in si1.body.labels[annotator_id]: assert OffsetType.BYTES in label.offsets byte_labels.add(label.target.target_id) line_labels = set() for annotator_id in si2.body.labels: for label in si2.body.labels[annotator_id]: assert OffsetType.LINES in label.offsets line_labels.add(label.target.target_id) assert line_labels == byte_labels print '\n\n%.5f bytes,\n %.5f lines' % (elapsed_bytes, elapsed_lines)
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    '''build a test stream item, hyperlink-label it, and extract its
    clean_visible text'''
    si = make_test_stream_item(test_data_dir)
    assert len(si.body.clean_html) > 200
    context = {}
    labeler = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    })
    labeler(si, context)
    visible_stage = clean_visible(config={})
    visible_stage(si, context)
    assert len(si.body.clean_visible) > 200
    return si
def test_long_doc(parser_type, test_data_dir):
    '''hyperlink labeling must process a long document without error'''
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test')
    ## use a context manager so the fixture file handle is closed
    ## instead of leaked by open(...).read()
    with open(os.path.join(path, 'company-test.html')) as f:
        stream_item.body.clean_html = f.read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def make_hyperlink_labeled_test_chunk(tmpdir):
    """
    returns a path to a temporary chunk that has been hyperlink labeled
    """
    tpath = tmpdir.join(str(uuid.uuid1()) + ".sc")
    o_chunk = Chunk(tpath, mode="wb")
    ipath = get_test_chunk_path()
    label_links = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [BYTES]})
    extract_visible = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## drop any labels/tokens left over from earlier processing
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        label_links(si, context)
        extract_visible(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath