def test_title():
    '''Exercise the title stage on clean_html that contains a <title> tag.

    A short title is expected to land unchanged in
    other_content['title'].clean_visible; a title of 'TITLE HERE'
    followed by 80 asterisks is expected to come back as its first 60
    characters followed by '...'.
    '''
    title_stage = title({})
    visible_stage = clean_visible({})

    def extracted_title(html):
        # Build a fresh stream item, run clean_visible and then the
        # title stage over it, and hand back what the title stage stored.
        item = make_stream_item(0, '')
        item.body.clean_html = html
        item = visible_stage(item, {})
        item = title_stage(item)
        return item.other_content['title'].clean_visible

    short_html = '''Then there was a <tag> ... <title>TITLE HERE </title> '''
    assert extracted_title(short_html) == 'TITLE HERE'

    long_html = '''Then there was a that went <tag> ... <title>TITLE HERE%s </title> ''' % ('*' * 80)
    assert extracted_title(long_html) == 'TITLE HERE' + '*' * 50 + '...'
def test_title():
    '''Check what the title stage writes to other_content['title'].

    Two stream items are pushed through clean_visible and then the
    title stage: one with a short <title>, which must come back as-is,
    and one whose <title> is 'TITLE HERE' plus 80 asterisks, which must
    come back as 'TITLE HERE', 50 asterisks and a trailing '...'.
    '''
    extract_title = title({})
    make_visible = clean_visible({})

    # Case 1: a short title is kept verbatim.
    short_item = make_stream_item(0, '')
    short_item.body.clean_html = '''Then there was a <tag> ... <title>TITLE HERE </title> '''
    short_item = make_visible(short_item, {})
    short_item = extract_title(short_item)
    short_title = short_item.other_content['title'].clean_visible
    assert short_title == 'TITLE HERE'

    # Case 2: an over-long title is shortened and suffixed with '...'.
    padding = '*' * 80
    long_item = make_stream_item(0, '')
    long_item.body.clean_html = '''Then there was a that went <tag> ... <title>TITLE HERE%s </title> ''' % (padding)
    long_item = make_visible(long_item, {})
    long_item = extract_title(long_item)
    long_title = long_item.other_content['title'].clean_visible
    assert long_title == 'TITLE HERE' + '*' * 50 + '...'
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''Load a streamcorpus.Chunk file into the structure that the
    search engine passes to find_soft_selectors.

    Returns a list of ``(stream_id, clean_visible, {})`` tuples, one
    per stream item kept, where clean_visible has been decoded from
    UTF-8.  Items that arrive without clean_visible are run through
    the clean_html and clean_visible stages first; items for which
    that cannot be done are logged at critical level and left out.
    '''
    html_stage = clean_html(clean_html.default_config)
    visible_stage = clean_visible(clean_visible.default_config)

    records = []
    for item in streamcorpus.Chunk(path=corpus_path):
        if not item.body.clean_visible:
            # No visible text yet: try to derive it from the raw body.
            if not item.body.raw:
                logger.critical('no raw content, so skipping: %r',
                                item.abs_url)
                continue

            # Remember the URL for the log messages below, because a
            # stage result that is falsy cannot be asked for it.
            url = item.abs_url

            item = html_stage(item, {})
            if not item:
                logger.critical(
                    'failed to make clean_html, so skipping: %r', url)
                continue

            item = visible_stage(item, {})
            if not (item and item.body.clean_visible):
                logger.critical(
                    'failed to make clean_visible, so skipping: %r', url)
                continue

        records.append(
            (item.stream_id, item.body.clean_visible.decode('utf8'), {}))

    return records
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    '''Return the test stream item for test_data_dir after running the
    hyperlink_labels and clean_visible stages over it.

    Asserts that clean_html is longer than 200 before the stages run
    and that clean_visible is longer than 200 afterwards.
    '''
    labeler_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    }
    shared_context = {}

    stream_item = make_test_stream_item(test_data_dir)
    assert len(stream_item.body.clean_html) > 200

    # Both stages are called for their effect on stream_item; their
    # return values are not used.
    labeler = hyperlink_labels(config=labeler_config)
    labeler(stream_item, shared_context)

    visible_stage = clean_visible(config={})
    visible_stage(stream_item, shared_context)

    assert len(stream_item.body.clean_visible) > 200
    return stream_item
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    '''Build a test stream item and apply hyperlink_labels followed by
    clean_visible to it, sharing one context dict between the stages.

    The item must have more than 200 units of clean_html going in and
    more than 200 units of clean_visible coming out, otherwise an
    AssertionError is raised.  The same item object is returned.
    '''
    ctx = {}
    item = make_test_stream_item(test_data_dir)
    assert len(item.body.clean_html) > 200

    # Label hyperlinks first, then generate the visible text; each
    # stage is invoked purely for what it does to ``item``.
    label_links = hyperlink_labels(config=dict(
        require_abs_url=True,
        all_domains=True,
        offset_types=['BYTES'],
    ))
    label_links(item, ctx)

    to_visible = clean_visible(config={})
    to_visible(item, ctx)

    assert len(item.body.clean_visible) > 200
    return item