def _rate_per_second(count, elapsed):
    """Return ``count`` divided by the seconds in the timedelta ``elapsed``.

    ``total_seconds()`` is used so that fractions of a second and spans longer
    than a day are both counted.  ``inf`` is returned instead of dividing by
    zero when no measurable time has passed.
    """
    seconds = elapsed.total_seconds()
    if seconds <= 0:
        return float("inf")
    return count / seconds


def load_commons_speeches(root_dir, writer, num_workers, rate):
    """Run the commons speech files through a worker pool, tracking progress.

    Every item produced by ``commons_speech_feeder`` is handed to
    ``commons_speech_saver`` in a pool of ``num_workers`` processes.  Each
    result is a ``(filename, speeches)`` pair: the filename is appended to the
    ``tracker`` file inside the working directory, and ``speeches`` is added
    to a running total that drives the progress log lines.

    Args:
        root_dir: Directory passed to ``commons_speech_working_dir``.
        writer: Object published to every worker process as the module
            global ``_writer``.
        num_workers: Number of worker processes in the pool.
        rate: Value published to every worker process as the module global
            ``rate``.
    """
    log.debug("starting loader")
    working_dir = commons_speech_working_dir(root_dir)
    log.debug(working_dir)
    tracker = os.path.join(working_dir, "tracker")

    # The initializer runs once per worker and receives the (name, value)
    # pairs as positional arguments, copying them into this module's globals.
    pool = mp.Pool(
        num_workers,
        lambda *args: globals().update(dict(args)),
        {"_writer": writer, "rate": rate}.items(),
    )

    # TODO: move reporting out
    day_mod = 100
    day_count = 0
    day_time = datetime.datetime.now()
    speech_mod = 10000
    speech_count = 0
    speech_time = datetime.datetime.now()

    try:
        with open(tracker, "a+") as tracker_file:
            results = pool.imap(
                commons_speech_saver,
                commons_speech_feeder(working_dir, tracker),
            )
            for filename, speeches in results:
                day_count += 1
                # Compare by value; ``is`` only tests object identity.
                if day_count % day_mod == 0:
                    delta = datetime.datetime.now() - day_time
                    log.info("%10s Days in %s (%s/s)" % (
                        day_count, delta, _rate_per_second(day_count, delta)))
                    log.info("Latest file %s" % filename)

                # Emit one line for every multiple of ``speech_mod`` that this
                # file's speeches carried the running total up to or past.
                # Floor division keeps the arithmetic integral on Python 3.
                first_milestone = (speech_count // speech_mod + 1) * speech_mod
                for sp_count in range(first_milestone,
                                      speech_count + speeches + 1,
                                      speech_mod):
                    delta = datetime.datetime.now() - speech_time
                    log.info("%10s Speeches in %s (%s/s)" % (
                        sp_count, delta, _rate_per_second(sp_count, delta)))

                speech_count += speeches
                tracker_file.write("%s\n" % filename)
                # Flush per file so the record survives an abrupt stop.
                tracker_file.flush()
    except BaseException:
        # Abandon queued work rather than waiting for it after a failure.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
def test_feeder(self):
    """Test source urls are generated correctly and quickly"""
    began = datetime.datetime.now()
    working_dir = commons_speech_working_dir(self.rootdir)
    urls = list(commons_speech_feeder(working_dir))

    self.assertEqual(len(urls), 995)

    # The elapsed time is taken after the length check, as a timedelta.
    elapsed = datetime.datetime.now() - began
    budget = datetime.timedelta(seconds=0.1)
    self.assertLess(elapsed, budget)

    first_expected = "http://www.hansard-archive.parliament.uk/Parliamentary_Debates_1803_to_1820/S1V0001P0.zip"
    last_expected = "http://www.hansard-archive.parliament.uk/Parliamentary_Debates_1909_to_1981/S5V0199P0.zip"
    self.assertEqual(urls[0], first_expected)
    self.assertEqual(urls[-1], last_expected)
def setUp(self):
    """Prepare the fixture attributes used by the tests.

    Stores the scratch root directory, a mock writer, and the working
    directory that ``commons_speech_working_dir`` returns for that root.
    """
    root = "/tmp/hanalytics-test"
    self.rootdir = root
    self.writer = mock.Mock()
    self.working_dir = commons_speech_working_dir(root)
def test_create_working_dir(self):
    """Check the working directory has the expected path and exists on disk"""
    expected_path = "/tmp/hanalytics-test/hansardarchive"
    created = commons_speech_working_dir(self.rootdir)
    self.assertEqual(created, expected_path)
    self.assertTrue(os.path.exists(created))