    def __init__(self, verbose=False, quiet=True):

        # Initialize the parent web connector, then read the config files
        # and default to queue mode
        Connector_Web.__init__(self, verbose=verbose, quiet=quiet)

        self.quiet = quiet
        self.read_configfiles()
        self.queue = True
Example #2
    def etl_document(self, uri):

        result = True
        doc_mtime = self.exporter.get_lastmodified(docid=uri)

        if doc_mtime:

            if self.verbose:
                print(
                    "Annotated document in search index. No new indexing of {}"
                    .format(uri))

        else:
            # Download and Index the new or updated uri

            if self.verbose:
                print(
                    "Annotated document not in search index. Start indexing of {}"
                    .format(uri))

            try:
                etl = Connector_Web()
                etl.index(uri=uri)
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while getting {} : {}\n".format(
                    uri, e))
                result = False
        return result
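
A minimal usage sketch for the method above. This is hypothetical: the owning class is not shown in the snippet, so the Connector_RSS name is assumed from the related examples, as are the placeholder URLs.

# Hypothetical driver; assumes etl_document() lives on Connector_RSS
if __name__ == "__main__":
    connector = Connector_RSS(verbose=True)
    for uri in ["https://example.org/a.html", "https://example.org/b.html"]:
        if not connector.etl_document(uri):
            sys.stderr.write("Indexing failed for {}\n".format(uri))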
Example #5
def etl_document(uri):

    result = True
    doc_mtime = exporter.get_lastmodified(docid=uri)

    if doc_mtime:

        if verbose:
            print("Article indexed before, so skip new indexing: {}".format(uri))

    else:
        # Download and Index the new or updated uri

        if verbose:
            print("Annotated page not in index: {}".format(uri))

        try:
            etl = Connector_Web()
            etl.index(uri=uri)
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write("Exception while getting {} : {}\n".format(uri, e))
            result = False
    return result
Example #6
    def index(self, uri):

        result = True

        exporter = export_solr.export_solr()

        feed = feedparser.parse(uri)

        new_items = 0

        for item in feed.entries:

            articleuri = item.link

            #
            # Is new article or indexed in former runs?
            #

            doc_mtime = exporter.get_lastmodified(docid=articleuri)

            if doc_mtime:

                if self.verbose:
                    print(
                        "Article indexed before, so skip new indexing: {}".format(articleuri))

            else:
                # Download and Index the new or updated uri

                if self.verbose:
                    print("Article not in index: {}".format(articleuri))

                try:
                    partresult = Connector_Web.index(self, uri=articleuri)
                    if partresult is False:
                        result = False
                    new_items += 1

                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except BaseException as e:
                    sys.stderr.write(
                        "Exception while getting {} : {}\n".format(articleuri, e))

        if new_items:
            exporter.commit()

        return result
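
For context, feedparser normalizes RSS and Atom feeds into a common feed.entries structure, which is why the loop above can read item.link regardless of feed flavor. A standalone sketch (the feed URL is a placeholder):

import feedparser

# feedparser.parse() accepts URLs, file paths, or raw XML strings
feed = feedparser.parse("https://example.org/feed.xml")

for entry in feed.entries:
    # Entries behave like dicts; .get() avoids KeyError for optional fields
    print(entry.link, entry.get("published", "no date"))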
Example #7
    def index(self, sitemap):

        if self.verbose or not self.quiet:
            print("Downloading sitemap {}".format(sitemap))

        sitemap = urllib.request.urlopen(sitemap)

        et = ElementTree.parse(sitemap)

        root = et.getroot()

        # process subsitemaps if sitemapindex
        for sitemap in root.findall(
                "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap"):
            url = sitemap.findtext(
                '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')

            if self.verbose or not self.quiet:
                print("Processing subsitemap {}".format(url))

            self.index(url)

        #
        # get urls if urlset
        #

        urls = []

        # XML schema with namespace sitemaps.org
        for url in root.findall(
                "{http://www.sitemaps.org/schemas/sitemap/0.9}url"):

            url = url.findtext(
                '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')

            urls.append(url)

        # XML schema with namespace Google sitemaps
        for url in root.findall(
                "{http://www.google.com/schemas/sitemap/0.84}url"):

            url = url.findtext(
                '{http://www.google.com/schemas/sitemap/0.84}loc')

            urls.append(url)

        # Queue or download and index the urls

        for url in urls:

            if self.queue:

                # add webpage to queue as Celery task
                try:

                    if self.verbose or not self.quiet:
                        print("Adding URL to queue: {}".format(url))

                    result = tasks.index_web.delay(uri=url)

                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except BaseException as e:
                    sys.stderr.write(
                        "Exception while adding to queue {} : {}\n".format(
                            url, e))

            else:

                # batchmode, index page after page ourselves

                try:
                    if self.verbose or not self.quiet:
                        print("Indexing {}".format(url))

                    result = Connector_Web.index(self, uri=url)

                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except BaseException as e:
                    sys.stderr.write(
                        "Exception while indexing {} : {}\n".format(url, e))
Example #8
app = Celery('etl.tasks', broker=broker)

app.conf.CELERY_QUEUES = [
    Queue('tasks',
          Exchange('tasks'),
          routing_key='tasks',
          queue_arguments={'x-max-priority': 10})
]

app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1
app.conf.CELERYD_PREFETCH_MULTIPLIER = 1
app.conf.CELERY_ACKS_LATE = True

etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()

#
# Delete document with URI from index
#


@app.task(name='etl.delete')
def delete(uri):
    etl_delete.delete(uri=uri)
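
Example #7 enqueues pages with tasks.index_web.delay(uri=url), but this snippet is cut off before that task's definition. Below is a hedged sketch of what a matching task and a priority-aware enqueue could look like; the task name string and the priority value are assumptions, not the project's confirmed code.

# Hypothetical sketch of the task Example #7 calls via tasks.index_web.delay()
@app.task(name='etl.index_web')
def index_web(uri):
    # Delegate to the module-level web connector created above
    etl_web.index(uri=uri)

# apply_async() lets callers set a per-task priority, which the broker can
# honor because the queue above was declared with x-max-priority:
# index_web.apply_async(kwargs={'uri': 'https://example.org/page'}, priority=5)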


#
# Index a file
#
Example #10
# ETL connectors
from etl import ETL
from etl_delete import Delete
from etl_file import Connector_File
from etl_web import Connector_Web
from etl_rss import Connector_RSS


verbose = True
quiet = False

app = Celery('etl.tasks')
app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1

etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()


#
# Delete document with URI from index
#

@app.task(name='etl.delete')
def delete(uri):
    etl_delete.delete(uri=uri)


#
# Index a file
#
Example #11
    def index(self, uri):

        result = True
        # todo: result to false if getting/parsing uri failed

        exporter = export_solr.export_solr()

        feed = feedparser.parse(uri)

        for item in feed.entries:

            articleuri = item.link
            mtime = None

            # get modification time from the feed item (todo: from download)
            try:

                mtime = dateparser.parse(item.published)

                # maybe there was a update
                try:
                    if item.updated:
                        mtime = dateparser.parse(item.updated)
                except BaseException as e:
                    # str(e) instead of e.message, which no longer exists in Python 3
                    sys.stderr.write(
                        "Exception while parsing updated date. Status: {}\n".format(e))

            except BaseException as e:
                sys.stderr.write(
                    "Exception while parsing date. Status: {}\n".format(e))

            if not mtime:
                # Fall back to the current time; use a datetime (requires the
                # datetime module) rather than time.localtime(), whose
                # struct_time result has no strftime() method
                mtime = datetime.datetime.now()

            # convert mtime to Solr format
            mtime_masked = mtime.strftime("%Y-%m-%dT%H:%M:%SZ")

            # get the modification time stored in the search index
            doc_mtime = exporter.get_lastmodified(docid=articleuri)

            #
            # New article (not indexed yet, so no doc_mtime) or modified
            # (doc_mtime != mtime of the item)?
            #

            if mtime_masked == doc_mtime:

                # Doc found in Solr and the moddate field of the Solr doc matches
                # the item's mtime, so the newest version was indexed before
                doindex = False

                if self.verbose:
                    print(
                        "Not indexing unchanged article {}".format(articleuri))

            else:

                # Index the article, because new or changed
                doindex = True

                if doc_mtime is None:
                    if self.verbose or not self.quiet:
                        print("Indexing new article {}".format(articleuri))
                else:
                    if self.verbose or not self.quiet:
                        print(
                            "Indexing modified article {}".format(articleuri))

                # Download and Index the new or updated uri
                try:
                    partresult = Connector_Web.index(self,
                                                     uri=articleuri,
                                                     last_modified=False)
                    if partresult is False:
                        result = False
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except BaseException as e:
                    sys.stderr.write("Exception while getting {} : {}\n".format(
                        articleuri, e))

        return result
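
The moddate comparison above only works because both sides use the identical string mask. A standalone sketch of the round trip (the input date string is a placeholder):

import dateparser

# dateparser handles the loose date formats commonly found in feeds
mtime = dateparser.parse("Tue, 10 Mar 2020 14:30:00 +0000")

# Same Solr-style mask as in index(); note the mask drops the timezone
# rather than converting it, so both sides of the comparison must agree
print(mtime.strftime("%Y-%m-%dT%H:%M:%SZ"))  # 2020-03-10T14:30:00Z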