示例#1
0
    def test_pdf(self):
        """ETL test/test.pdf through the configured plugin chain plus PDF OCR
        and verify the extracted metadata, page texts and OCR results."""

        connector = Connector_File()

        # run the full ETL pipeline on the test PDF, with the OCR plugin added
        parameters, data = connector.index_file(
            filename='test/test.pdf', additional_plugins=['enhance_pdf_ocr'])

        # content type detected during extraction
        self.assertEqual(data['content_type_ss'], 'application/pdf')

        # content type group mapped to this content type (plugin enhance_contenttype_group.py)
        self.assertEqual(data['content_type_group_ss'], ['Text document'])

        # extracted title (plugin enhance_extract_text_tika_server.py)
        self.assertEqual(data['title_txt'], 'TestPDFtitle')

        # extracted text of both PDF pages (plugin enhance_extract_text_tika_server.py)
        for expected_text in ('TestPDFContent1 on TestPDFPage1',
                              'TestPDFContent2 on TestPDFPage2'):
            self.assertIn(expected_text, data['content_txt'])

        # OCR of the images embedded in the PDF (plugin enhance_pdf_ocr.py)
        for expected_ocr in ('TestPDFOCRImage1Content1',
                             'TestPDFOCRImage1Content2',
                             'TestPDFOCRImage2Content1',
                             'TestPDFOCRImage2Content2'):
            self.assertIn(expected_ocr, data['ocr_t'])

        # no plugin may have raised an exception during the ETL run
        self.assertEqual(len(data['etl_error_plugins_ss']), 0)
示例#2
0
    def __init__(self, verbose=False, quiet=True):
        """Initialize the connector: base setup, then defaults and config files."""

        # shared setup from the file connector base class
        Connector_File.__init__(self, verbose=verbose)

        self.quiet = quiet

        # load built-in defaults first, then let config files override them
        self.set_configdefaults()
        self.read_configfiles()
示例#3
0
    def set_configdefaults(self):
        """Set the standard configuration defaults.

        Do not edit the defaults here! Overwrite options in
        /etc/opensemanticsearch/connector-web instead.
        """

        # start from the shared file connector defaults
        Connector_File.set_configdefaults(self)

        # no filename to uri mapping
        self.config['uri_prefix_strip'] = False
        self.config['uri_prefix'] = False

        # protocol/host prefixes stripped from the path facet
        self.config['facet_path_strip_prefix'] = [
            'http://www.',
            'http://',
            'https://www.',
            'https://',
            'ftp://',
        ]

        # default ETL plugin chain
        self.config['plugins'] = [
            'filter_blacklist',
            'enhance_extract_text_tika_server',
            'enhance_detect_language_tika_server',
            'enhance_contenttype_group',
            'enhance_pst',
            'enhance_csv',
            'enhance_path',
            'enhance_zip',
            'enhance_warc',
            'enhance_extract_hashtags',
            'clean_title',
            'enhance_multilingual',
        ]
示例#4
0
    def test_warc(self):
        """ETL test/example.warc and verify the contained HTML document was
        extracted and indexed (result of plugin enhance_warc.py)."""

        etl_file = Connector_File()
        exporter = export_solr()

        filename = os.path.dirname(
            os.path.realpath(__file__)) + '/test/example.warc'

        # run ETL of example.warc with configured plugins and warc extractor
        parameters, data = etl_file.index_file(filename=filename)

        # id of the document contained in the WARC archive
        # (target URI + '/' + WARC record id)
        contained_doc_id = 'http://example.com/<urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>'
        fields = ['id', 'title_txt', 'content_type_ss', 'content_txt']

        # read the contained document back from the search index;
        # this deliberately overwrites the ETL result in `data` — the
        # assertions below check what was actually indexed
        data = exporter.get_data(contained_doc_id, fields)

        # delete from search index
        etl_delete = Delete()
        etl_delete.delete(filename)
        etl_delete.delete(contained_doc_id)

        self.assertEqual(data['title_txt'], ['Example Domain'])

        self.assertEqual(data['content_type_ss'], ['text/html; charset=UTF-8'])

        # check extracted HTML body text
        self.assertTrue(
            'This domain is established to be used for illustrative examples in documents.'
            in data['content_txt'][0])
    def __init__(self, verbose=False, quiet=False):
        """Initialize the queueing connector.

        Sets up a plugin chain that filters, enriches and indexes the
        filename first, then adds the file to the task queue.
        """

        Connector_File.__init__(self, verbose=verbose)

        self.quiet = quiet

        # apply filters before adding to queue, so filtered or yet indexed files not added to queue
        # adding to queue by plugin export_queue_files

        # exporter to index filenames before text extraction and other later tasks
        # will run before adding tasks to queue by export_queue_files
        # so reseted plugin status will be in index before started ETL tasks apply not modified filter
        # NOTE(review): assumes self.config['export'] was set by
        # Connector_File.__init__ — confirm in the base class
        export_to_index = self.config['export']

        self.config['plugins'] = [
            'enhance_mapping_id',
            'filter_blacklist',
            'filter_file_not_modified',
            'enhance_file_mtime',
            'enhance_path',
            'enhance_entity_linking',
            'enhance_multilingual',
            export_to_index,
            'export_queue_files',
        ]
	def __init__(self, verbose=False, quiet=True):
		"""Init the base connector, then apply defaults and config files."""

		# shared setup from the file connector base class
		Connector_File.__init__(self, verbose=verbose)

		self.quiet = quiet

		# built-in defaults first, config files may override them
		self.set_configdefaults()
		self.read_configfiles()
def index_file(filename, wait=0, config=False):
	"""Index a single file with a fresh Connector_File.

	:param filename: path of the file to index
	:param wait: seconds to sleep before starting (queue throttling)
	:param config: optional config dict replacing the connector's config
	"""

	# optional delay before starting the ETL run
	if wait:
		time.sleep(wait)

	connector = Connector_File()

	# replace the whole connector config if an alternate one was given
	if config:
		connector.config = config

	connector.index(filename=filename)
示例#8
0
def index_file(filename, wait=0, config=False):
	"""Index a single file.

	:param filename: path of the file to index
	:param wait: seconds to sleep before starting
	:param config: optional config dict replacing the connector's config
	"""

	if wait:
		time.sleep(wait)

	etl_file = Connector_File()

	# replace the connector config completely if an alternate one was given
	if config:
		etl_file.config = config

	etl_file.index(filename=filename)
示例#9
0
def index_file(filename, additional_plugins=None, wait=0, config=None):
    """Index a single file with a fresh Connector_File.

    :param filename: path of the file to index
    :param additional_plugins: extra plugin names appended to the ETL chain
    :param wait: seconds to sleep before starting (queue throttling)
    :param config: optional dict of config options overwriting options
        read from the config file
    """

    if wait:
        time.sleep(wait)

    etl_file = Connector_File()

    # set alternate config options (will overwrite config options from config file)
    if config:
        for option in config:
            etl_file.config[option] = config[option]

    # default for additional_plugins is None instead of a mutable [] so the
    # list cannot be shared (and mutated) across calls
    etl_file.index_file(filename=filename,
                        additional_plugins=additional_plugins or [])
示例#10
0
    def __init__(self, verbose=False, quiet=False):
        """Init the queueing connector with a minimal filter-and-queue plugin chain."""

        Connector_File.__init__(self, verbose=verbose)

        self.quiet = quiet

        # run the filters before queueing, so blacklisted or unchanged files
        # never reach the queue; queueing itself is done by export_queue_files
        self.config['plugins'] = [
            'enhance_mapping_id',
            'filter_blacklist',
            'filter_file_not_modified',
            'export_queue_files',
        ]
	def __init__(self, verbose=False, quiet=False):
		"""Init the queueing connector.

		Only filters and the queue exporter run here; the full ETL
		happens later when the queued task is processed.
		"""

		Connector_File.__init__(self, verbose=verbose)

		self.quiet = quiet

		# apply filters before adding to queue, so filtered or yet indexed files not added to queue
		# adding to queue by plugin export_queue_files

		self.config['plugins'] = [
			'enhance_mapping_id',
			'filter_blacklist',
			'filter_file_not_modified',
			'export_queue_files',
		]
示例#12
0
    def test_pdf(self):
        """ETL test/test.pdf with the configured plugins plus PDF OCR and
        verify content type, title, page texts and OCR of embedded images."""

        etl_file = Connector_File()

        filename = os.path.dirname(
            os.path.realpath(__file__)) + '/test/test.pdf'

        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)
        parameters, data = etl_file.index_file(
            filename=filename, additional_plugins=['enhance_pdf_ocr'])

        # delete from search index
        etl_delete = Delete()
        etl_delete.delete(filename)

        # check extracted content type
        # (both variants accepted: the embedded images may be listed as well)
        self.assertTrue(data['content_type_ss'] == 'application/pdf'
                        or data['content_type_ss']
                        == ['application/pdf', 'image/jpeg', 'image/png'])

        # check content type group which is mapped to this content type (result of plugin enhance_contenttype_group.py)
        self.assertTrue(data['content_type_group_ss'] == ['Text document']
                        or data['content_type_group_ss']
                        == ['Text document', 'Image', 'Image'])

        # check extracted title (result of plugin enhance_extract_text_tika_server.py)
        self.assertEqual(data['title_txt'], 'TestPDFtitle')

        # check extracted content of PDF text (result of plugin enhance_extract_text_tika_server.py)
        self.assertTrue(
            'TestPDFContent1 on TestPDFPage1' in data['content_txt'])
        self.assertTrue(
            'TestPDFContent2 on TestPDFPage2' in data['content_txt'])

        # check OCR of embedded images in PDF (result of plugin enhance_pdf_ocr.py)
        self.assertTrue('TestPDFOCRImage1Content1' in data['ocr_t'])
        self.assertTrue('TestPDFOCRImage1Content2' in data['ocr_t'])
        self.assertTrue('TestPDFOCRImage2Content1' in data['ocr_t'])
        self.assertTrue('TestPDFOCRImage2Content2' in data['ocr_t'])

        # check if a plugin threw an exception
        self.assertEqual(len(data['etl_error_plugins_ss']), 0)
	def set_configdefaults(self):
		"""Set the standard configuration defaults.

		Do not edit the defaults here! Overwrite options in
		/etc/opensemanticsearch/connector-web instead.
		"""

		# start from the shared file connector defaults
		Connector_File.set_configdefaults(self)

		# no filename to uri mapping
		self.config['uri_prefix_strip'] = False
		self.config['uri_prefix'] = False

		# protocol/host prefixes stripped from the path facet
		self.config['facet_path_strip_prefix'] = [
			'http://www.',
			'http://',
			'https://www.',
			'https://',
			'ftp://',
		]

		# default ETL plugin chain
		self.config['plugins'] = [
			'filter_blacklist',
			'enhance_extract_text_tika_server',
			'enhance_detect_language_tika_server',
			'enhance_contenttype_group',
			'enhance_pst',
			'enhance_csv',
			'enhance_path',
			'enhance_zip',
			'enhance_warc',
			'enhance_extract_hashtags',
			'clean_title',
			'enhance_multilingual',
		]
示例#14
0
    def pst2email(self, pstfilename, parameters=None, verbose=False):
        """Extract e-mails and attachments from a PST file with readpst and
        run the file ETL on every extracted file.

        :param pstfilename: path of the PST file to extract
        :param parameters: ETL parameters; must contain 'id' (indexed id of
            the PST file), may contain 'tmp' (custom temp dir)
        :param verbose: print progress messages
        """

        # None default instead of a mutable {} shared across calls
        if parameters is None:
            parameters = {}

        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            if not os.path.exists(system_temp_dirname):
                os.mkdir(system_temp_dirname)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # hashlib.md5 requires bytes, so encode the id first (Python 3)
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_pst_" + h.hexdigest()

        if not os.path.exists(temp_dirname):
            os.mkdir(temp_dirname)

        # start external PST extractor / converter
        result = subprocess.call(
            ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

        if result != 0:
            sys.stderr.write(
                "Error: readpst failed for {}".format(pstfilename))

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a ZIP or PST before (if this PST is inside another ZIP or PST)
        if 'container' not in connector.config:
            connector.config['container'] = pstfilename

        for dirName, subdirList, fileList in os.walk(temp_dirname):

            if verbose:
                print('Scanning directory: %s' % dirName)

            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)

                try:
                    # replace temp dirname from indexed id
                    contained_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original PST file
                    if contained_dirname:
                        contained_dirname = contained_dirname + os.path.sep
                    else:
                        contained_dirname = os.path.sep

                    connector.config[
                        'id'] = parameters['id'] + contained_dirname + fileName

                    contained_filename = dirName + os.path.sep + fileName

                    # E-mails filenames are pure number
                    # Attachment file names are number-filename
                    # if temp_filename without - in filename, its a mail file
                    # rename to suffix .eml so Tika will extract more metadata like from and to
                    if '-' not in fileName:
                        os.rename(contained_filename,
                                  contained_filename + '.eml')
                        contained_filename += '.eml'
                        connector.config['id'] += '.eml'

                    try:
                        connector.index_file(filename=contained_filename)

                    except KeyboardInterrupt:
                        raise KeyboardInterrupt

                    except BaseException as e:
                        # BaseException has no .message attribute in Python 3:
                        # format the exception object itself
                        sys.stderr.write(
                            "Exception while indexing contained content {} from {} : {}\n"
                            .format(fileName, connector.config['container'],
                                    e))

                    os.remove(contained_filename)

                except BaseException as e:
                    sys.stderr.write(
                        "Exception while indexing file {} : {}\n".format(
                            fileName, e))

        shutil.rmtree(temp_dirname)
	def pst2email(self, pstfilename, parameters=None, verbose=False):
		"""Extract e-mails and attachments from a PST file with readpst and
		run the file ETL on every extracted file.

		:param pstfilename: path of the PST file to extract
		:param parameters: ETL parameters; must contain 'id' (indexed id of
			the PST file), may contain 'tmp' (custom temp dir)
		:param verbose: print progress messages
		"""

		# None default instead of a mutable {} shared across calls
		if parameters is None:
			parameters = {}

		# we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
		if 'tmp' in parameters:
			system_temp_dirname = parameters['tmp']
			if not os.path.exists(system_temp_dirname):
				os.mkdir(system_temp_dirname)
		else:
			system_temp_dirname = tempfile.gettempdir()

		# pid + id-hash make the temp dir unique per process
		h = hashlib.md5(parameters['id'].encode('UTF-8'))
		temp_dirname = system_temp_dirname + os.path.sep + "opensemanticetl_enhancer_pst_" + str(os.getpid()) + "_" + h.hexdigest()

		if not os.path.exists(temp_dirname):
			os.mkdir(temp_dirname)

		# start external PST extractor / converter
		result = subprocess.call(['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

		if result != 0:
			sys.stderr.write("Error: readpst failed for {}".format(pstfilename))

		# prepare document processing
		connector = Connector_File()
		connector.verbose = verbose
		connector.config = parameters.copy()

		# only set container if not yet set by a ZIP or PST before (if this PST is inside another ZIP or PST)
		if 'container' not in connector.config:
			connector.config['container'] = pstfilename

		for dirName, subdirList, fileList in os.walk(temp_dirname):

			if verbose:
				print('Scanning directory: %s' % dirName)

			for fileName in fileList:
				if verbose:
					print('Scanning file: %s' % fileName)

				try:
					# replace temp dirname from indexed id
					contained_dirname = dirName.replace(temp_dirname, '', 1)

					# build a virtual filename pointing to original PST file
					if contained_dirname:
						contained_dirname = contained_dirname + os.path.sep
					else:
						contained_dirname = os.path.sep

					connector.config['id'] = parameters['id'] + contained_dirname + fileName

					contained_filename = dirName + os.path.sep + fileName

					# E-mails filenames are pure number
					# Attachment file names are number-filename
					# if temp_filename without - in filename, its a mail file
					# rename to suffix .eml so Tika will extract more metadata like from and to
					if '-' not in fileName:
						os.rename(contained_filename, contained_filename + '.eml')
						contained_filename += '.eml'
						connector.config['id'] += '.eml'

					try:
						connector.index_file(filename=contained_filename)

					except KeyboardInterrupt:
						raise KeyboardInterrupt

					except BaseException as e:
						# BaseException has no .message in Python 3: format the exception itself
						sys.stderr.write("Exception while indexing contained content {} from {} : {}\n".format(fileName, connector.config['container'], e))

					os.remove(contained_filename)

				except BaseException as e:
					sys.stderr.write("Exception while indexing file {} : {}\n".format(fileName, e))

		shutil.rmtree(temp_dirname)
示例#16
0
# Queue handler
from celery import Celery

# ETL connectors
from etl import ETL
from etl_delete import Delete
from etl_file import Connector_File
from etl_web import Connector_Web
from etl_rss import Connector_RSS

verbose = True

# Celery app the ETL tasks are registered on
app = Celery('etl.tasks')

# connectors are instantiated once at module load and shared by all tasks
etl_delete = Delete()
etl_file = Connector_File()
etl_web = Connector_Web()
etl_rss = Connector_RSS()

#
# Delete document with URI from index
#


@app.task(name='etl.delete')
def delete(uri):
    # Celery task: remove the document with the given URI from the search index
    etl_delete.delete(uri=uri)


#
# Index a file
示例#17
0
    def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
        """Extract all response records from a WARC archive and run the file
        ETL on each record.

        :param warcfilename: path of the WARC archive
        :param parameters: ETL parameters; must contain 'id' (indexed id of
            the WARC file), may contain 'tmp' (custom temp dir)
        :param verbose: print progress messages
        """
        if parameters is None:
            parameters = {}

        # create temp dir where to unwarc the archive
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            if not os.path.exists(system_temp_dirname):
                os.mkdir(system_temp_dirname)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_warc_" + h.hexdigest()

        if not os.path.exists(temp_dirname):
            os.mkdir(temp_dirname)

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a zip before (if this zip is inside another zip)
        if 'container' not in connector.config:
            connector.config['container'] = warcfilename

        i = 0

        with open(warcfilename, 'rb') as stream:
            for record in ArchiveIterator(stream):
                i += 1

                if record.rec_type == 'response':

                    # NOTE(review): unconditional debug output — consider
                    # guarding with `if verbose:`
                    print(record.rec_headers)

                    # write WARC record content to tempfile; the context
                    # manager closes the handle even if the write fails
                    tempfilename = temp_dirname + \
                        os.path.sep + 'warcrecord' + str(i)
                    with open(tempfilename, 'wb') as tmpfile:
                        tmpfile.write(record.content_stream().read())

                    # set last modification time of the file to WARC-Date
                    try:
                        last_modified = time.mktime(time.strptime(
                            record.rec_headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'))
                        os.utime(tempfilename, (last_modified, last_modified))
                    except BaseException as e:
                        sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    # set id (URL and WARC Record ID)
                    connector.config['id'] = record.rec_headers.get_header(
                        'WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

                    # index the extracted file
                    try:

                        connector.index_file(filename=tempfilename)

                    except KeyboardInterrupt:
                        raise KeyboardInterrupt

                    except BaseException as e:
                        sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    os.remove(tempfilename)

        shutil.rmtree(temp_dirname)
	def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
		"""Extract all response records from a WARC archive and run the file
		ETL on each record.

		:param warcfilename: path of the WARC archive
		:param parameters: ETL parameters; must contain 'id' (indexed id of
			the WARC file), may contain 'tmp' (custom temp dir)
		:param verbose: print progress messages
		"""

		# None default instead of a mutable {} shared across calls
		if parameters is None:
			parameters = {}

		# create temp dir where to unwarc the archive
		if 'tmp' in parameters:
			system_temp_dirname = parameters['tmp']
			if not os.path.exists(system_temp_dirname):
				os.mkdir(system_temp_dirname)
		else:
			system_temp_dirname = tempfile.gettempdir()

		# we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
		h = hashlib.md5(parameters['id'].encode('UTF-8'))
		temp_dirname = system_temp_dirname + os.path.sep + "opensemanticetl_enhancer_warc_" + h.hexdigest()

		if not os.path.exists(temp_dirname):
			os.mkdir(temp_dirname)

		# prepare document processing
		connector = Connector_File()
		connector.verbose = verbose
		connector.config = parameters.copy()

		# only set container if not yet set by a zip before (if this zip is inside another zip)
		if 'container' not in connector.config:
			connector.config['container'] = warcfilename

		i = 0

		with open(warcfilename, 'rb') as stream:
			for record in ArchiveIterator(stream):
				i += 1

				if record.rec_type == 'response':

					# NOTE(review): unconditional debug output — consider guarding with `if verbose:`
					print(record.rec_headers)

					# write WARC record content to tempfile; the context
					# manager closes the handle even if the write fails
					tempfilename = temp_dirname + os.path.sep + 'warcrecord' + str(i)
					with open(tempfilename, 'wb') as tmpfile:
						tmpfile.write(record.content_stream().read())

					# set last modification time of the file to WARC-Date
					try:
						last_modified = time.mktime(time.strptime(record.rec_headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'))
						os.utime(tempfilename, (last_modified, last_modified))
					except BaseException as e:
						sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(tempfilename, connector.config['container'], e))

					# set id (URL and WARC Record ID)
					connector.config['id'] = record.rec_headers.get_header('WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

					# index the extracted file
					try:

						connector.index_file(filename=tempfilename)

					except KeyboardInterrupt:
						raise KeyboardInterrupt

					except BaseException as e:
						sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(tempfilename, connector.config['container'], e))

					os.remove(tempfilename)

		shutil.rmtree(temp_dirname)
示例#19
0
    def unzip_and_index_files(self, zipfilename, parameters=None, verbose=False):
        """Unzip an archive to a temp dir and run the file ETL on every
        extracted file.

        :param zipfilename: path of the ZIP archive
        :param parameters: ETL parameters; must contain 'id' (indexed id of
            the ZIP file), may contain 'tmp' (custom temp dir)
        :param verbose: print progress messages
        """

        # None default instead of a mutable {} shared across calls
        if parameters is None:
            parameters = {}

        # create temp dir where to unzip the archive
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            if not os.path.exists(system_temp_dirname):
                os.mkdir(system_temp_dirname)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_zip_" + h.hexdigest()

        if not os.path.exists(temp_dirname):
            os.mkdir(temp_dirname)

        # unzip the files; the context manager closes the archive even on error
        # NOTE(review): extractall on untrusted archives can write outside
        # temp_dirname via crafted member paths ("zip slip") on old Pythons —
        # confirm the supported Python version sanitizes member names
        with zipfile.ZipFile(zipfilename) as my_zip:
            my_zip.extractall(temp_dirname)

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a zip before (if this zip is inside another zip)
        if 'container' not in connector.config:
            connector.config['container'] = zipfilename

        # walk trough all unzipped directories / files and index all files
        for dirName, subdirList, fileList in os.walk(temp_dirname):

            if verbose:
                print('Scanning directory: %s' % dirName)

            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)

                try:
                    # replace temp dirname from indexed id
                    zipped_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original zip file
                    if zipped_dirname:
                        zipped_dirname = zipped_dirname + os.path.sep
                    else:
                        zipped_dirname = os.path.sep

                    connector.config[
                        'id'] = parameters['id'] + zipped_dirname + fileName

                    unziped_filename = dirName + os.path.sep + fileName

                    try:

                        connector.index_file(filename=unziped_filename)

                    except KeyboardInterrupt:
                        raise KeyboardInterrupt

                    except BaseException as e:
                        sys.stderr.write(
                            "Exception while indexing zipped content {} from {} : {}\n"
                            .format(fileName, connector.config['container'],
                                    e))

                    os.remove(unziped_filename)

                except BaseException as e:
                    sys.stderr.write(
                        "Exception while indexing file {} : {}\n".format(
                            fileName, e))

        shutil.rmtree(temp_dirname)
    def __init__(self, verbose=False, quiet=False):

        Connector_File.__init__(self, verbose=verbose)

        self.quiet = quiet