def seed(self):
    # Seed the frontier with one document per start URL defined in sitedefs.
    for sitedef in self.db.sitedefs.find():
        hostname = None
        try:
            # A usable site definition needs allowed_links, syntax and a non-empty start_url.
            if u'allowed_links' not in sitedef or \
               u'syntax' not in sitedef or \
               u'start_url' not in sitedef or \
               not sitedef[u'start_url']:
                logger.log_error('Bad site definition: ' + str(sitedef))
                continue
            hostname = urlparse(sitedef[u'start_url']).netloc
            self.collection.insert({
                'timestamp'  : datetime.datetime.utcnow(),
                'urls'       : [sitedef[u'start_url']],
                'hostname'   : hostname,
                'sitedef_id' : sitedef[u'_id']
            })
        except pymongo.errors.DuplicateKeyError:
            # The hostname is already queued; merge the start URL into its URL set instead.
            logger.log_warning('Duplicate key error : %s' % (str(sitedef)))
            if not hostname:
                continue
            self.collection.update(
                { 'hostname' : hostname },
                { '$addToSet' : { 'urls' : sitedef[u'start_url'] },
                  '$set'      : { 'timestamp' : datetime.datetime.utcnow() } })
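# Illustrative only: a site definition shaped like the documents seed() reads from
# self.db.sitedefs. Only _id, start_url, allowed_links and syntax are implied by the
# code above; the concrete values here are made-up examples, not a real schema.
#
#   { '_id'           : ObjectId('...'),
#     'start_url'     : 'http://example.com/news',
#     'allowed_links' : ['^http://example\\.com/news/'],
#     'syntax'        : { 'title' : 'h1.headline' } }
#
# seed() inserts one frontier document per sitedef; the DuplicateKeyError path suggests
# a unique index on 'hostname', in which case an existing hostname simply gains the new
# start URL via $addToSet rather than a second frontier entry.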
def release(self, id, next_start=None, start_offset=None):
    # Reschedule a claimed frontier document by moving its timestamp forward.
    if id is None:
        return None
    if next_start is None and start_offset is None:
        logger.log_warning('Release invoked without valid start time, noop')
        return
    start_time = datetime.datetime.utcnow()
    if next_start:
        start_time = next_start
    if start_offset:
        start_time = start_time + datetime.timedelta(seconds=start_offset)
    with MongoScopeGuard():
        self.collection.update({ '_id' : id },
                               { '$set' : { 'timestamp' : start_time } })
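# Minimal usage sketch, assuming a hypothetical `frontier` instance of this class and a
# previously claimed frontier document `doc` (names invented for illustration):
#
#   frontier.release(doc['_id'], start_offset=3600)                       # retry in an hour
#   frontier.release(doc['_id'], next_start=datetime.datetime(2016, 1, 1))  # retry at a fixed time
#
# Calling release() with neither argument is deliberately a no-op, so a caller cannot
# accidentally reschedule a document for "now".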
def parse_syntax(body, syntax, url, container=None, soup=None):
    # Walk a syntax map of result keys -> CSS selectors (or nested maps) and
    # extract text/href values from the document body.
    if not syntax:
        return None
    if not soup:
        soup = BeautifulSoup(body, 'lxml')
    result = {}
    for key in syntax:
        #keyword is reserved
        if key == 'container':
            pass
        elif isinstance(syntax[key], (str, unicode)) and len(syntax[key].strip()):
            selector = '%s' % (syntax[key])
            try:
                nodes = soup.select(selector)
            except IndexError as e:
                logger.log_error('[%s] IndexError with selector %s %s' % (url, selector, e))
                continue
            if len(nodes) == 0:
                logger.log_warning('[%s] Could not find selector %s' % (url, str(selector)))
                continue
            node = nodes[0]
            val = node.get_text(' ', strip=True)
            if not val:
                #fall back to the element's value attribute (e.g. form inputs)
                if node.has_attr('value'):
                    val = node[u'value']
                #elif ...:
            result[key] = { 'text' : val }
            if node.has_attr('href'):
                if url:
                    result[key]['href'] = urlparse.urljoin(url, node['href'])
                else:
                    result[key]['href'] = node['href']
        elif isinstance(syntax[key], dict):
            # Nested maps recurse, reusing the soup that was already built.
            result[key] = parse_syntax(body, syntax[key], url, soup=soup, container=container)
    return result
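if __name__ == '__main__':
    # Minimal sketch of how a syntax map drives parse_syntax(). The markup, selectors
    # and URL below are invented for illustration only; it assumes the module-level
    # imports parse_syntax() itself relies on (BeautifulSoup, urlparse, logger).
    sample_html = ('<html><body>'
                   '<h1 class="title">Hello</h1>'
                   '<a class="more" href="/next">More</a>'
                   '</body></html>')
    sample_syntax = {
        'title' : 'h1.title',              # plain string -> CSS selector
        'links' : { 'more' : 'a.more' },   # nested dict  -> recursive parse
    }
    print parse_syntax(sample_html, sample_syntax, 'http://example.com/')
    # Expected shape (roughly):
    #   { 'title' : { 'text' : u'Hello' },
    #     'links' : { 'more' : { 'text' : u'More', 'href' : 'http://example.com/next' } } }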