示例#1
0
class PlanetaLudicoScrap(object):
    """Scrape entries from a list of URLs into a collection.

    Each URL is read through ``RESTReader``; every item found under
    ``['results']['titles']`` of the response is turned into an ``Entry``
    and saved, unless the collection already returns a record for the
    entry's link. A listener is notified of every entry built and of
    every entry that is skipped.
    """

    def __init__(self, col):
        """Keep *col* as the target collection; start with a ``VoidListener``."""
        self._collection = col
        self._listener = VoidListener()

    def setListener(self, listener):
        """Replace the listener that receives scraping notifications."""
        self._listener = listener

    def scrapListOfURL(self, list_url):
        """Process every URL of *list_url*, in iteration order."""
        for target in list_url:
            self._process_url(target)

    def _process_url(self, url):
        """Read *url*, then build and save its entries one at a time."""
        payload = self._read_entries(url)
        # Build and save are interleaved per item so listener callbacks
        # fire in the same order as the items appear in the response.
        for raw in payload['results']['titles']:
            self._save(self._build_entry(raw))

    def _save(self, entry):
        """Store *entry* unless the collection already has its link.

        When ``find_one('link', entry.link)`` returns something other than
        ``None``, nothing is written and the listener is told the entry
        was skipped.
        """
        existing = self._collection.find_one('link', entry.link)
        if existing is not None:
            self._listener.skippingUnmodifiedThread(existing, entry)
            return
        self._collection.save(entry.json())

    def _read_entries(self, url_obj):
        """Return whatever ``RESTReader.read`` yields for *url_obj*."""
        fetched = RESTReader.read(url_obj)
        return fetched

    def _build_entry(self, json_entry):
        """Create an ``Entry`` from *json_entry* and announce it to the listener.

        Reads ``title.text``, ``date``, ``title.href`` and ``source.text``
        from *json_entry*, in that order.
        """
        headline = json_entry['title']['text']
        when = json_entry['date']
        link = json_entry['title']['href']
        origin = json_entry['source']['text']
        built = Entry(headline, when, link, origin)
        self._listener.enteringThread(built)
        return built
示例#2
0
 def __init__(self, col):
     """Start with a fresh ``VoidListener`` and keep *col* as the collection."""
     self._listener = VoidListener()
     self._collection = col