def __init__(self, verbose=False, quiet=True):

    # set up the underlying web connector, then load the config and default to queue mode
    Connector_Web.__init__(self, verbose=verbose, quiet=quiet)

    self.quiet = quiet
    self.read_configfiles()
    self.queue = True
def etl_document(self, uri):

    result = True

    doc_mtime = self.exporter.get_lastmodified(docid=uri)

    if doc_mtime:
        if self.verbose:
            print("Annotated document in search index. No new indexing of {}".format(uri))

    else:
        # Download and index the new or updated uri
        if self.verbose:
            print("Annotated document not in search index. Start indexing of {}".format(uri))

        try:
            etl = Connector_Web()
            etl.index(uri=uri)
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write("Exception while getting {} : {}".format(uri, e))
            result = False

    return result
def etl_document(uri):

    result = True

    doc_mtime = exporter.get_lastmodified(docid=uri)

    if doc_mtime:
        if verbose:
            print("Article indexed before, so skip new indexing: {}".format(uri))

    else:
        # Download and index the new or updated uri
        if verbose:
            print("Annotated page not in index: {}".format(uri))

        try:
            etl = Connector_Web()
            etl.index(uri=uri)
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write("Exception while getting {} : {}".format(uri, e))
            result = False

    return result
def index(self, uri):

    result = True

    exporter = export_solr.export_solr()

    feed = feedparser.parse(uri)

    new_items = 0

    for item in feed.entries:

        articleuri = item.link

        #
        # Is it a new article or was it indexed in former runs?
        #
        doc_mtime = exporter.get_lastmodified(docid=articleuri)

        if doc_mtime:
            if self.verbose:
                print("Article indexed before, so skip new indexing: {}".format(articleuri))

        else:
            # Download and index the new or updated uri
            if self.verbose:
                print("Article not in index: {}".format(articleuri))

            try:
                partresult = Connector_Web.index(self, uri=articleuri)
                if partresult == False:
                    result = False
                new_items += 1
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while getting {} : {}".format(articleuri, e))

    if new_items:
        exporter.commit()

    return result
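#
# Usage sketch (an assumption, not part of the connector module above): index a
# single feed directly. Connector_RSS and its index(uri=...) method are the ones
# shown in this section; the feed URL is a placeholder.
#
if __name__ == "__main__":
    from etl_rss import Connector_RSS

    connector = Connector_RSS()
    connector.index(uri='https://www.example.com/feed.rss')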
def index(self, sitemap):

    if self.verbose or self.quiet == False:
        print("Downloading sitemap {}".format(sitemap))

    sitemap = urllib.request.urlopen(sitemap)

    et = ElementTree.parse(sitemap)
    root = et.getroot()

    # process subsitemaps if sitemapindex
    for sitemap in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap"):
        url = sitemap.findtext('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')

        if self.verbose or self.quiet == False:
            print("Processing subsitemap {}".format(url))

        self.index(url)

    #
    # get urls if urlset
    #
    urls = []

    # XML schema with namespace sitemaps.org
    for url in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
        url = url.findtext('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
        urls.append(url)

    # XML schema with namespace Google sitemaps
    for url in root.findall("{http://www.google.com/schemas/sitemap/0.84}url"):
        url = url.findtext('{http://www.google.com/schemas/sitemap/0.84}loc')
        urls.append(url)

    # Queue or download and index the urls
    for url in urls:

        if self.queue:
            # add webpage to queue as Celery task
            try:
                if self.verbose or self.quiet == False:
                    print("Adding URL to queue: {}".format(url))
                result = tasks.index_web.delay(uri=url)
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while adding to queue {} : {}\n".format(url, e))

        else:
            # batchmode, index page after page ourselves
            try:
                if self.verbose or self.quiet == False:
                    print("Indexing {}".format(url))
                result = Connector_Web.index(self, uri=url)
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while indexing {} : {}\n".format(url, e))
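#
# The queue branch above hands every page URL to Celery via
# tasks.index_web.delay(uri=url). A hedged sketch of such a task, modelled on the
# etl.delete task shown below, might look like this; the task name
# 'etl.index_web' and the module-level Connector_Web instance are assumptions,
# not the project's verified definition.
#
from celery import Celery
from etl_web import Connector_Web

app = Celery('etl.tasks')
etl_web = Connector_Web()


@app.task(name='etl.index_web')
def index_web(uri):
    # download the page and write it to the search index
    return etl_web.index(uri=uri)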
app = Celery('etl.tasks', broker=broker)

app.conf.CELERY_QUEUES = [
    Queue('tasks', Exchange('tasks'), routing_key='tasks',
          queue_arguments={'x-max-priority': 10})
]

app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1
app.conf.CELERYD_PREFETCH_MULTIPLIER = 1
app.conf.CELERY_ACKS_LATE = True

etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()


#
# Delete document with URI from index
#

@app.task(name='etl.delete')
def delete(uri):
    etl_delete.delete(uri=uri)


#
# Index a file
#
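#
# Usage sketch (assumptions: the broker is reachable and this module is
# importable as etl.tasks). A worker consuming the 'tasks' queue can be started
# from the shell with
#
#   celery -A etl.tasks worker --queues=tasks --loglevel=info
#
# and any process that imports the module can enqueue work asynchronously, e.g.
# a deletion via the etl.delete task defined above:
#
if __name__ == "__main__":
    delete.delay(uri='https://www.example.com/removed-page.html')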
# ETL connectors
from etl import ETL
from etl_delete import Delete
from etl_file import Connector_File
from etl_web import Connector_Web
from etl_rss import Connector_RSS

verbose = True
quiet = False

app = Celery('etl.tasks')
app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1

etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()


#
# Delete document with URI from index
#

@app.task(name='etl.delete')
def delete(uri):
    etl_delete.delete(uri=uri)


#
# Index a file
#
def index(self, uri):

    result = True  # todo: set result to False if getting/parsing the uri failed

    exporter = export_solr.export_solr()

    feed = feedparser.parse(uri)

    for item in feed.entries:

        articleuri = item.link

        # get modification time from the feed entry (todo: from download)
        mtime = None
        try:
            mtime = dateparser.parse(item.published)

            # maybe there was an update
            try:
                if item.updated:
                    mtime = dateparser.parse(item.updated)
            except BaseException as e:
                sys.stderr.write("Exception while parsing updated date. Status: {}\n".format(e))

        except BaseException as e:
            sys.stderr.write("Exception while parsing date. Status: {}\n".format(e))

        # convert mtime to Solr format (fall back to the current time if the feed has no date)
        if mtime:
            mtime_masked = mtime.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            mtime_masked = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())

        # get modification time stored in the index
        doc_mtime = exporter.get_lastmodified(docid=articleuri)

        #
        # Is it a new article (not indexed yet) or modified (doc_mtime <> mtime of the feed entry)?
        #
        if mtime_masked == doc_mtime:
            # Doc found in Solr and the moddate field of the Solr doc is the same as
            # the entry's mtime, so the newest version was indexed before
            doindex = False
            if self.verbose:
                print("Not indexing unchanged article {}".format(articleuri))

        else:
            # Index the article, because it is new or changed
            doindex = True

            if doc_mtime == None:
                if self.verbose or self.quiet == False:
                    print("Indexing new article {}".format(articleuri))
            else:
                if self.verbose or self.quiet == False:
                    print("Indexing modified article {}".format(articleuri))

            # Download and index the new or updated uri
            try:
                partresult = Connector_Web.index(self, uri=articleuri, last_modified=False)
                if partresult == False:
                    result = False
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while getting {} : {}".format(articleuri, e))

    return result
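#
# A small illustration of the date handling above, assuming dateparser is the
# dateutil parser: an RSS pubDate string is parsed and rendered in the Solr
# datetime format that is compared against get_lastmodified().
#
from dateutil import parser as dateparser

mtime = dateparser.parse("Tue, 04 Jun 2019 09:30:00 GMT")
print(mtime.strftime("%Y-%m-%dT%H:%M:%SZ"))  # 2019-06-04T09:30:00Z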