def test_arias_model(self):
    """Check the text Arias extracts from ``big_html_doc``.

    The reference text is built from blocks 1, 2 and 3 of the blockified
    document, joined with single spaces, and must equal what
    ``Arias.analyze`` returns.
    """
    # Model parameters, passed positionally: cutoff percent, then window
    percent_cutoff, window_size = 60, 2
    model = Arias(percent_cutoff, window_size)
    extracted = model.analyze(big_html_doc)

    # Build the reference content from the hand-picked block indices
    doc_blocks = Blockifier.blockify(big_html_doc)
    reference_indices = (1, 2, 3)
    reference = ' '.join(doc_blocks[i].text for i in reference_indices)

    self.assertEqual(reference, extracted)
def test_arias_model(self):
    """Compare Arias output for ``big_html_doc`` with the known content.

    The known content is the text of blocks 1, 2 and 3 produced by
    ``Blockifier.blockify``, separated by single spaces.
    """
    extractor = Arias(60, 2)  # cutoff percent, window
    arias_text = extractor.analyze(big_html_doc)

    # Gather the text of the blocks that make up the real content
    all_blocks = Blockifier.blockify(big_html_doc)
    wanted_texts = []
    for idx in [1, 2, 3]:
        wanted_texts.append(all_blocks[idx].text)

    self.assertEqual(' '.join(wanted_texts), arias_text)
#! /usr/bin/env python
# Run a particular algorithm on the entire set of documents.
#
# Layout: every file at documents/<site>/<document> is read, passed to
# technique.analyze together with its path, and the returned value is
# written to output/<site>/<document>.

from dragnet import Arias as technique
import os

for site in os.listdir('documents'):
    sitepath = os.path.join('documents', site)
    documents = os.listdir(sitepath)
    if not documents:
        # Nothing to write for this site, so do not create an empty
        # output directory for it.
        continue

    # Make sure the output directory exists (once per site rather than
    # once per document).
    outdir = os.path.join('output', site)
    try:
        os.makedirs(outdir)
    except OSError:
        # An already-existing directory is expected on re-runs; any other
        # failure (permissions, a file in the way, ...) is a real error.
        if not os.path.isdir(outdir):
            raise

    for document in documents:
        # Read in, analyze, write out
        inpath = os.path.join(sitepath, document)
        outpath = os.path.join(outdir, document)
        # Single-argument print() emits the same text on Python 2 and 3.
        print('Working on %s' % inpath)
        with open(inpath) as inf, open(outpath, 'w+') as outf:
            # NOTE(review): analyze is called on the imported name itself
            # with (content, path) -- confirm this matches the dragnet
            # version in use.
            outf.write(technique.analyze(inf.read(), inpath))