Пример #1
0
 def __init__(self, collection_runs, debug_enabled):
     """Store the run settings and build the helper objects.

     collection_runs -- kept as-is on the instance
     debug_enabled   -- kept on the instance and also handed to the updater
     """
     # Plain settings first.
     self.collection_runs = collection_runs
     self.debug_enabled = debug_enabled

     # Helper objects, created in the same order as before.
     self.collector = CamaraFederalCollector()
     self.parser = CamaraFederalParser()
     self.updater = CamaraFederalUpdater(debug_enabled)

     # Lock object kept on the instance as ``stdout_mutex``.
     self.stdout_mutex = Lock()
Пример #2
0
class CamaraFederal(object):
    def __init__(self, collection_runs, debug_enabled):
        """Store the run settings and build the collector, parser and updater.

        collection_runs -- container that create_collection_run() appends to
        debug_enabled   -- gates debug() output; also passed to the updater
        """
        # Plain settings first.
        self.collection_runs = collection_runs
        self.debug_enabled = debug_enabled

        # Helper objects, created in the same order as before.
        self.collector = CamaraFederalCollector()
        self.parser = CamaraFederalParser()
        self.updater = CamaraFederalUpdater(debug_enabled)

        # Held around print calls in the worker methods of this class.
        self.stdout_mutex = Lock()

    # Parallel Collector helpers
    @staticmethod
    def _fill_queue_from_list(items, queue):
        for i in items:
            queue.put(i)

    @staticmethod
    def _start_collectors(function, input_queue, output_queue=None):
        """Start the collector processes and return them as a list.

        One process is created per index in ``range(max_collectors)``
        (``max_collectors`` is defined outside this block).  Each process is
        named ``'Collector <index>'`` and runs
        ``function(input_queue, output_queue)``.
        """
        workers = []
        worker_args = (input_queue, output_queue,)

        for index in range(max_collectors):
            worker = Process(
                name='%s %d' % ('Collector', index),
                target=function,
                args=worker_args,
            )
            workers.append(worker)
            worker.start()

        return workers

    @staticmethod
    def _start_updaters(function, input_queue, finished_event):
        """Start the updater processes and return them as a list.

        One process is created per index in ``range(max_updaters)``
        (``max_updaters`` is defined outside this block).  Each process is
        named ``'Updater <index>'``, runs
        ``function(input_queue, finished_event)`` and is flagged as a daemon
        before it is started.
        """
        workers = []
        worker_args = (input_queue, finished_event)

        for index in range(max_updaters):
            worker = Process(
                name='%s %d' % ('Updater', index),
                target=function,
                args=worker_args,
            )
            worker.daemon = True
            workers.append(worker)
            worker.start()

        return workers

    @staticmethod
    def _wait(collectors, finished_event):
        for p in collectors:
            p.join()

        finished_event.wait()

    # Parallel Collector implementation
    def _collect_pictures(self, picture_uri_queue, picture_queue):
        myname = current_process().name
        with self.stdout_mutex:
            print '[%s] started' % (myname)

        items = 0
        while True:
            try:
                legislator = picture_uri_queue.get(block=False)
                uri = legislator['picture_uri']
            except Empty:
                # Empty is also raised if queue item is not available, so checking...
                if not picture_uri_queue.empty():
                    continue
                break
            else:
                items += 1
                legislator['picture'] = self.collector.retrieve_legislator_picture(legislator)
                picture_queue.put(legislator)

        with self.stdout_mutex:
            print '[%s] finished. %d items processed.' % (myname, items)

    def _update_legislators(self, picture_queue, finished):
        myname = current_process().name
        informed_empty = False
        total_legislators = self.total_legislators

        with self.stdout_mutex:
            print '[%s] started' % (myname)

        items = 0

        informed = False
        while items < total_legislators:
            try:
                legislator = picture_queue.get(block=False)
                informed = False
            except Empty:
                pass
            else:
                items += 1
                self.updater.update_legislator(legislator)
                if not informed and (items % 100 == 0 or items > (total_legislators - 10)):
                    with self.stdout_mutex:
                        print('[%s] %d items processed.' % (myname, items))
                    informed = True

        finished.set()

    # Collector api used by Collect command
    def collect_legislatures(self):
        print '[CamaraFederal] Retrieving legislatures'

        content = self.collector.retrieve_legislatures()
        legislatures = self.parser.parse_legislatures(content)
        self.updater.update_legislatures(legislatures)

        print '[CamaraFederal] Retrieved %d legislatures' % len(legislatures)

    def collect_legislators(self, legislature_id=None):
        """Collect the legislators of one legislature, pictures included.

        With ``legislature_id`` omitted the updater's last legislature is
        used; otherwise the legislature with that id is looked up.  The
        legislator list is fetched and parsed sequentially, then pictures are
        retrieved by collector processes while updater processes store the
        results.  Returns once the collectors are joined and the updater has
        signalled completion.
        """
        # --- Sequential phase -------------------------------------------
        if legislature_id is not None:
            legislature = self.updater.get_legislature(legislature_id)
        else:
            legislature = self.updater.last_legislature()

        # The updater needs to know which legislature it is working on.
        self.updater.legislature = legislature

        with self.stdout_mutex:
            print('[CamaraFederal] Retrieving legislators')

        raw_listing = self.collector.retrieve_legislators(legislature)
        legislators = self.parser.parse_legislators(raw_listing)

        # Read by _update_legislators to know when it has seen everything.
        self.total_legislators = len(legislators)

        # --- Parallel phase ---------------------------------------------
        picture_uri_queue = Queue()
        picture_queue = Queue()
        updater_finished = Event()

        self._fill_queue_from_list(legislators, picture_uri_queue)

        collectors = self._start_collectors(
            self._collect_pictures, picture_uri_queue, picture_queue)

        self._start_updaters(
            self._update_legislators, picture_queue, updater_finished)

        self._wait(collectors, updater_finished)

        print('[CamaraFederal] Collected %s legislators' % self.total_legislators)

    # FIXME: copied from BaseCollector, needs to be shared!
    def debug(self, message):
        if self.debug_enabled:
            print message

    def create_collection_run(self, legislature):
        """Return today's CollectionRun for ``legislature``, creating it if needed.

        The run is appended to ``self.collection_runs``.  Only one run is kept
        per day: if one already existed, its ArchivedExpense rows are deleted
        so that this collection starts from a clean slate.
        """
        # Take the date once so the lookup and the log message cannot
        # disagree if the call straddles midnight.
        today = date.today()
        collection_run, created = CollectionRun.objects.get_or_create(date=today,
                                                                      legislature=legislature)
        self.collection_runs.append(collection_run)

        # Keep only one run for a day. If one exists, we delete the existing collection data
        # before we start this one.
        if not created:
            # isoformat() yields YYYY-MM-DD on every platform, whereas the
            # "%F" strftime directive is not portable.
            self.debug("Collection run for %s already exists for legislature %s, clearing."
                       % (today.isoformat(), legislature))
            ArchivedExpense.objects.filter(collection_run=collection_run).delete()

        return collection_run
    # end copied block

    def collect_expenses(self, legislature_id=None):
        """Collect expenses for every mandate of a legislature, resumably.

        With ``legislature_id`` omitted the updater's last legislature is
        used.  Progress is checkpointed in the cache under
        ``'pending-cdf-collection'`` after each mandate, so an interrupted
        collection resumes with the same collection run and skips mandates
        already processed.  The checkpoint is removed once all mandates are
        done.

        For each mandate, every month of every year from the legislature's
        start year through its end year is queried: first the expense totals
        per nature, then the individual expenses of each nature.
        """
        if legislature_id is None:
            legislature = self.updater.last_legislature()
        else:
            legislature = self.updater.get_legislature(legislature_id)

        pending_collection = cache.get('pending-cdf-collection')
        if pending_collection:
            # NOTE(review): pickle.loads trusts whatever is stored in the
            # cache; acceptable only if the cache backend is not writable by
            # untrusted parties -- confirm.
            pending_collection = pickle.loads(pending_collection)
            self.updater.collection_run = CollectionRun.objects.get(id=pending_collection['collection_run_id'])
        else:
            self.updater.collection_run = self.create_collection_run(legislature)
        self.updater.legislature = legislature

        with self.stdout_mutex:
            print('[CamaraFederal] Retrieving expenses')

        mandates = self.updater.get_mandates()

        if pending_collection:
            # Build the lookup set once so each membership test is O(1)
            # rather than a scan of the processed list per mandate.
            already_processed = set(pending_collection['processed_mandates'])
            mandates = [m for m in mandates if m.id not in already_processed]
        else:
            pending_collection = dict(collection_run_id=self.updater.collection_run.id, processed_mandates=[])

        for mandate in mandates:
            for year in range(legislature.date_start.year, legislature.date_end.year + 1):
                for month in range(1, 13):
                    content = self.collector.retrieve_total_expenses_per_nature(mandate.legislator, year, month)
                    natures = self.parser.parse_total_expenses_per_nature(content)
                    self.updater.update_expense_natures(natures)

                    for nature in natures:
                        print('[CamaraFederal] Retrieving expenses with %s by %s on %d-%d' % (nature['name'], unicode(mandate.legislator), year, month))
                        content = self.collector.retrieve_nature_expenses(mandate.legislator, nature['original_id'], year, month)
                        expenses = self.parser.parse_nature_expenses(content, nature, year, month)
                        self.updater.update_nature_expenses(mandate, nature['original_id'], expenses)

            # Checkpoint after each mandate: commit its data, then record it
            # as processed so a restart does not repeat it.
            transaction.commit()
            pending_collection['processed_mandates'].append(mandate.id)
            cache.set('pending-cdf-collection', pickle.dumps(pending_collection))

        # Everything finished; no resume information is needed any more.
        cache.delete('pending-cdf-collection')