Example #1
def _enhance(entry, selected_enhancers):
    """Runs the entry through each selected enhancer in turn.

    Returns the enhanced entry, or None as soon as an enhancer drops it.
    """
    for enhancer, args in selected_enhancers:
        dotlog('.')
        entry = enhancer(entry, *args)
        if entry is None:
            # the enhancer rejected this entry; stop the chain
            return None
    return entry
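A minimal usage sketch, assuming _enhance and dotlog from the snippet above are in scope; uppercase_title and the dict-shaped entry are hypothetical stand-ins for real enhancers and parsed entries:

def uppercase_title(entry, field='title'):
    # toy enhancer: receives the entry plus its configured args
    entry[field] = entry.get(field, '').upper()
    return entry

enhanced = _enhance({'title': 'hello'}, [(uppercase_title, ('title',))])
# enhanced == {'title': 'HELLO'}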
Example #2
    def load_data(self):
        """Loads existing data, applying the configured socket timeout."""
        dotlog('Loading existing data.')
        # temporarily override the process-wide socket timeout
        old_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(self.parser.timeout)
        try:
            self._load_data()
        finally:
            # always restore the previous timeout
            socket.setdefaulttimeout(old_timeout)
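The save/restore dance around the global socket timeout is a reusable pattern; a minimal sketch of it as a context manager (default_socket_timeout is a hypothetical helper, not part of the original module):

import socket
from contextlib import contextmanager

@contextmanager
def default_socket_timeout(seconds):
    # temporarily override the process-wide default, then restore it
    old = socket.getdefaulttimeout()
    socket.setdefaulttimeout(seconds)
    try:
        yield
    finally:
        socket.setdefaulttimeout(old)

with default_socket_timeout(30.0):
    pass  # network calls here use the temporary timeout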
Example #3
    def _process_entries(self, entries):
        """Callback invoked by a worker once a source has been read."""
        # apply the filters, then store the surviving entries
        psession = create_session(self.parser.database,
                                  global_=False)
        for entry in entries:
            dotlog('.')
            entry = _apply_filters(entry, self.existing_entries,
                                   self.filter_chain)
            if entry is None:
                # the entry was rejected by a filter
                continue
            new_entry = create_entry(entry, commit=False,
                                     session=psession)
            self.existing_entries.append(new_entry[1])
        psession.commit()
Example #4
def _prepare_enhancer(enhancer, entries):
    """Calls the enhancer's optional prepare() hook, if it defines one."""
    if hasattr(enhancer, 'prepare'):
        dotlog('.')
        # give the enhancer a chance to precompute state from all entries
        enhancer.prepare(entries)
    return enhancer
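A sketch of an enhancer exposing the optional prepare() hook that _prepare_enhancer looks for; DedupeEnhancer is hypothetical and assumes entries are dicts with a 'link' key:

class DedupeEnhancer(object):
    def prepare(self, entries):
        # precompute state from the full entry list
        self.seen = set(e.get('link') for e in entries)

    def __call__(self, entry):
        # drop entries whose link was already seen
        return None if entry.get('link') in self.seen else entry

enhancer = _prepare_enhancer(DedupeEnhancer(), [{'link': 'http://a'}])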
Example #5
    def generate_data(self):
        """Generates the output"""
        selected_enhancers = _select_enhancers(self.parser.enhancers)
        selected_outputs = _select_outputs(self.parser.outputs)
        processes = self.parser.processes

        # XXX TODO: limit the size of the data
        # processed, by number of items, or by date
        entries = get_entries().all()
        if selected_enhancers:
            # Enhancement is a two-phase process.
            # 1. Preparing entries for enhancement
            dotlog('Preparing enhancers.')
            if processes == 1:
                # Wrap mimics the .get() API of multiprocessing's
                # AsyncResult, so both branches can be consumed alike
                class Wrap(object):
                    def __init__(self, res):
                        self.res = res
                    def get(self):
                        return self.res

                results = [Wrap(_prepare_enhancer(enhancer, entries))
                           for enhancer, args in selected_enhancers]
            else:
                pool = Pool(self.parser.processes)
                results = [pool.apply_async(_prepare_enhancer,
                                            (enhancer, entries))
                           for enhancer, args in selected_enhancers]
                pool.close()
                pool.join()

            # push the prepared enhancers back into ENHANCERS and
            # selected_enhancers: the workers ran in other processes,
            # so the original objects were not updated in place
            # XXX maybe multiprocessing has something cleaner for this
            for result in results:
                enhancer = result.get()
                enhancer_type = type(enhancer)
                for name, ob in ENHANCERS.items():
                    if type(ob) == enhancer_type:
                        ENHANCERS[name] = enhancer

                def _replace(s, args, new):
                    if type(s) == type(new):
                        s = new
                    return s, args

                selected_enhancers = [_replace(s, args, enhancer)
                                      for s, args in selected_enhancers]

            dotlog('\n')

            # 2. Now enhancing them
            dotlog('Enhancing: %s' % ', '.join([e for e, args
                                                in self.parser.enhancers]))

            if processes == 1:
                # keep only the entries that survive the enhancer chain
                entries = [e for e in [_enhance(entry, selected_enhancers)
                                       for entry in entries] if e is not None]
            else:
                # creating a processing pool
                pool = Pool(self.parser.processes)
                results = [pool.apply_async(_enhance,
                           (entry, selected_enhancers))
                           for entry in entries]

                pool.close()
                pool.join()
                entries = [r for r in [result.get() for result in results]
                           if r is not None]
            dotlog('\n')

        if selected_outputs:
            dotlog('Writing outputs.')
            dotlog('Rendering: %s' % ', '.join([output[0] for output
                                                in self.parser.outputs]))
            for output, args in selected_outputs:
                dotlog('.')
                output(entries, args)
            dotlog('\n')

            dotlog('Output ready.')
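For reference, a self-contained sketch of the apply_async / close / join / get pattern that generate_data relies on; the square worker is hypothetical:

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    pool = Pool(2)
    results = [pool.apply_async(square, (n,)) for n in range(5)]
    pool.close()
    pool.join()
    print([r.get() for r in results])  # [0, 1, 4, 9, 16]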
Example #6
    def _load_data(self):
        """Loads the data"""
        # remove the temp db if it exists
        if os.path.exists('_temp_.db'):
            os.remove('_temp_.db')

        # cleanup old entries
        dotlog('\tRemoving old entries.')
        self._cleanup_entries()

        # load the initial entries; TODO: check whether this call is optimal
        dotlog('\tReading existing entries.')
        self.existing_entries = get_entries().all()

        # build the filter chain once
        filter_names = [f[0] for f in self.parser.filters]
        dotlog('Loading filter plugins: %s' % ', '.join(filter_names))
        self.filter_chain = set([(load_plugin(name, FILTERS), args)
                                 for name, args in self.parser.filters
                                 if name in FILTERS])

        # build the source chain once
        source_names = [s[0] for s in self.parser.sources]
        dotlog('Loading source plugins: %s' % ', '.join(source_names))
        sources = [(reader_name, load_plugin(reader_name, READERS), args)
                   for reader_name, args in self.parser.sources]

        processes = self.parser.processes
        dotlog('Reading sources.')

        if processes == 1:
            dotlog('\tWorking in the same process')
            for source in sources:
                entries = _process_source(*source)
                self._process_entries(entries)
        else:
            # creating a processing pool
            pool = Pool(self.parser.processes)
            for source in sources:
                dotlog('\tLaunching worker for %s - %s' % (source[0], source[-1]))
                pool.apply_async(_process_source, source,
                                 callback=self._process_entries)

            pool.close()
            pool.join()
        dotlog('\n')
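A self-contained sketch of the callback variant used above: each worker's return value is passed to the callback, which multiprocessing runs in the parent process, so it may safely touch parent-side state. The work function and collected list are hypothetical:

from multiprocessing import Pool

collected = []

def collect(entries):
    # runs in the parent process, like _process_entries above
    collected.extend(entries)

def work(n):
    return [n, n * 10]

if __name__ == '__main__':
    pool = Pool(2)
    for n in range(3):
        pool.apply_async(work, (n,), callback=collect)
    pool.close()
    pool.join()
    # collected now holds every worker's result (order not guaranteed)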