def handle_noargs(self, **options):
    """Parse every source that is flagged as requiring processing.

    Each source returned by
    ``Source.objects.all().requires_processing()`` has its
    ``last_processing_attempt`` stamped and saved, then its file is run
    through the ``KenyaParser`` steps (PDF to HTML, HTML to data) and
    entries are created from the resulting data.

    Args:
        **options: Command options. Only ``verbosity`` is read: at 2 or
            above one progress line is printed per source. A missing or
            ``None`` verbosity is treated as quiet.
    """
    # int(None) raises TypeError, so only convert a value that is present.
    # The option does not change between sources, so evaluate it once.
    verbosity = options.get('verbosity')
    verbose = verbosity is not None and int(verbosity) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            # A single parenthesised argument prints the same text under
            # Python 2's print statement and Python 3's print function.
            print("Looking at %s" % source)

        # Stamp and save the attempt time before any parsing starts.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def handle_noargs(self, **options):
    """Parse every source that is flagged as requiring processing.

    Each source returned by
    ``Source.objects.all().requires_processing()`` has its
    ``last_processing_attempt`` stamped and saved, then its file is run
    through the ``KenyaParser`` steps (PDF to HTML, HTML to data) and
    entries are created from the resulting data.

    Args:
        **options: Command options. Only ``verbosity`` is read: at 2 or
            above one progress line is printed per source. A missing or
            ``None`` verbosity is treated as quiet.
    """
    # int(None) raises TypeError, so only convert a value that is present.
    # The option does not change between sources, so evaluate it once.
    verbosity = options.get('verbosity')
    verbose = verbosity is not None and int(verbosity) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            # A single parenthesised argument prints the same text under
            # Python 2's print statement and Python 3's print function.
            print("Looking at %s" % source)

        # Stamp and save the attempt time before any parsing starts.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def test_converting_html_to_data(self):
    """Check convert_html_to_data output against the stored expected JSON.

    Reads the HTML at ``self.sample_html``, converts it with
    ``KenyaParser.convert_html_to_data`` and compares the ``transcript``
    and ``meta`` parts of the result with the JSON document at
    ``self.expected_data_json``.
    """
    # Use context managers so the fixture files are closed even if
    # reading or parsing fails.
    with open(self.sample_html, 'r') as html_file:
        html = html_file.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful (on a mac at least)
    # tmp = tempfile.NamedTemporaryFile( delete=False, suffix=".json" )
    # tmp = open( '/tmp/mzalend_hansard_parse.json', 'w')
    # tmp.write( json.dumps( data, sort_keys=True, indent=4 ) )
    # tmp.close()
    # subprocess.call(['open', tmp.name ])

    with open(self.expected_data_json, 'r') as expected_file:
        expected = json.load(expected_file)

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME (marker kept from the original; what needs fixing is not stated)
    self.assertEqual(data['meta'], expected['meta'])
def test_converting_html_to_data(self):
    """Check convert_html_to_data output against the stored expected JSON.

    Reads the HTML at ``self.sample_html``, converts it with
    ``KenyaParser.convert_html_to_data`` and compares the ``transcript``
    and ``meta`` parts of the result with the JSON document at
    ``self.expected_data_json``.
    """
    # Use context managers so the fixture files are closed even if
    # reading or parsing fails.
    with open(self.sample_html, 'r') as html_file:
        html = html_file.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful (on a mac at least)
    # tmp = tempfile.NamedTemporaryFile( delete=False, suffix=".json" )
    # tmp = open( '/tmp/mzalend_hansard_parse.json', 'w')
    # tmp.write( json.dumps( data, sort_keys=True, indent=4 ) )
    # tmp.close()
    # subprocess.call(['open', tmp.name ])

    with open(self.expected_data_json, 'r') as expected_file:
        expected = json.load(expected_file)

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME (marker kept from the original; what needs fixing is not stated)
    self.assertEqual(data['meta'], expected['meta'])