Exemplo n.º 1
0
    def unzip_and_index_files(self, zipfilename, parameters=None, verbose=False):
        """Unpack a ZIP archive into a temp directory and index every file in it.

        Each extracted file is handed to ``Connector_File.index_file`` with a
        virtual id built from ``parameters['id']``, the directory inside the
        archive and the file name. Errors on single files are written to
        stderr and do not stop the processing of the remaining files. The
        temp directory is removed afterwards, even if an exception propagates.

        :param zipfilename: path of the ZIP archive to unpack
        :param parameters: configuration dict; ``'id'`` is required, ``'tmp'``
            (base temp directory) and ``'container'`` are optional. The dict
            itself is not modified, the connector works on a copy.
        :param verbose: print scanned directories / files and pass the flag
            on to the connector
        :raises KeyError: if ``parameters`` has no ``'id'``
        :raises KeyboardInterrupt: passed through so the user can abort
        """
        if parameters is None:
            parameters = {}

        # create temp dir where to unzip the archive
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            # exist_ok avoids the race between "exists?" and "mkdir" when
            # several workers start at the same time
            os.makedirs(system_temp_dirname, exist_ok=True)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # we build the temp dirname ourselves instead of using
        # system_temp_dirname so we can use configurable / external tempdirs
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_zip_" + h.hexdigest()

        os.makedirs(temp_dirname, exist_ok=True)

        try:
            # unzip the files (context manager closes the archive on errors, too)
            with zipfile.ZipFile(zipfilename) as my_zip:
                my_zip.extractall(temp_dirname)

            # prepare document processing
            connector = Connector_File()
            connector.verbose = verbose
            connector.config = parameters.copy()

            # only set container if not yet set by a zip before
            # (if this zip is inside another zip)
            if 'container' not in connector.config:
                connector.config['container'] = zipfilename

            # walk through all unzipped directories / files and index all files
            for dirName, subdirList, fileList in os.walk(temp_dirname):

                if verbose:
                    print('Scanning directory: %s' % dirName)

                for fileName in fileList:
                    if verbose:
                        print('Scanning file: %s' % fileName)

                    try:
                        # remove temp dirname from indexed id
                        zipped_dirname = dirName.replace(temp_dirname, '', 1)

                        # build a virtual filename pointing to original zip file
                        if zipped_dirname:
                            zipped_dirname = zipped_dirname + os.path.sep
                        else:
                            zipped_dirname = os.path.sep

                        connector.config['id'] = \
                            parameters['id'] + zipped_dirname + fileName

                        unziped_filename = dirName + os.path.sep + fileName

                        try:
                            connector.index_file(filename=unziped_filename)

                        except KeyboardInterrupt:
                            raise

                        except BaseException as e:
                            sys.stderr.write(
                                "Exception while indexing zipped content {} from {} : {}\n"
                                .format(fileName, connector.config['container'],
                                        e))

                        os.remove(unziped_filename)

                    except KeyboardInterrupt:
                        # must not be swallowed by the catch-all handler below
                        raise

                    except BaseException as e:
                        sys.stderr.write(
                            "Exception while indexing file {} : {}\n".format(
                                fileName, e))
        finally:
            # do not leave extracted content behind if something failed
            shutil.rmtree(temp_dirname)
Exemplo n.º 2
0
    def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
        """Extract the response records of a WARC file and index each of them.

        The content of every record of type ``response`` is written to a temp
        file whose modification time is set to the record's ``WARC-Date``.
        The file is then handed to ``Connector_File.index_file`` with an id
        built from ``WARC-Target-URI`` and ``WARC-Record-ID``. Indexing errors
        of single records are written to stderr and do not stop processing.
        The temp directory is removed afterwards, even if an exception
        propagates.

        :param warcfilename: path of the WARC file to read
        :param parameters: configuration dict; ``'id'`` is required, ``'tmp'``
            (base temp directory) and ``'container'`` are optional. The dict
            itself is not modified, the connector works on a copy.
        :param verbose: print the headers of each processed record and pass
            the flag on to the connector
        :raises KeyError: if ``parameters`` has no ``'id'``
        :raises KeyboardInterrupt: passed through so the user can abort
        """
        # local import so this method does not depend on the file's import block
        import calendar

        if parameters is None:
            parameters = {}

        # create temp dir where to unwarc the archive
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            # exist_ok avoids the race between "exists?" and "mkdir"
            os.makedirs(system_temp_dirname, exist_ok=True)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # we build the temp dirname ourselves instead of using
        # system_temp_dirname so we can use configurable / external tempdirs
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_warc_" + h.hexdigest()

        os.makedirs(temp_dirname, exist_ok=True)

        try:
            # prepare document processing
            connector = Connector_File()
            connector.verbose = verbose
            connector.config = parameters.copy()

            # only set container if not yet set by an archive before
            # (if this WARC is inside another archive)
            if 'container' not in connector.config:
                connector.config['container'] = warcfilename

            with open(warcfilename, 'rb') as stream:
                # i counts all records (not only responses) so temp file
                # names match the record position inside the WARC
                for i, record in enumerate(ArchiveIterator(stream), start=1):

                    if record.rec_type != 'response':
                        continue

                    if verbose:
                        print(record.rec_headers)

                    # write WARC record content to tempfile
                    tempfilename = temp_dirname + \
                        os.path.sep + 'warcrecord' + str(i)
                    with open(tempfilename, 'wb') as tmpfile:
                        tmpfile.write(record.content_stream().read())

                    # set last modification time of the file to WARC-Date
                    try:
                        # the pattern expects a trailing "Z" (UTC), so convert
                        # with timegm; mktime would treat it as local time
                        last_modified = calendar.timegm(time.strptime(
                            record.rec_headers.get_header('WARC-Date'),
                            '%Y-%m-%dT%H:%M:%SZ'))
                        os.utime(tempfilename, (last_modified, last_modified))
                    except Exception as e:
                        # best effort: a missing / malformed date must not
                        # prevent indexing of the record
                        sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    # set id (URL and WARC Record ID)
                    connector.config['id'] = record.rec_headers.get_header(
                        'WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

                    # index the extracted file
                    try:
                        connector.index_file(filename=tempfilename)

                    except KeyboardInterrupt:
                        raise

                    except BaseException as e:
                        sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    os.remove(tempfilename)
        finally:
            # do not leave extracted content behind if something failed
            shutil.rmtree(temp_dirname)
Exemplo n.º 3
0
    def pst2email(self, pstfilename, parameters=None, verbose=False):
        """Convert a PST mailbox with ``readpst`` and index the extracted files.

        The external ``readpst`` tool writes the mailbox content to a temp
        directory. Every file found there is handed to
        ``Connector_File.index_file`` with a virtual id built from
        ``parameters['id']``, the directory inside the mailbox and the file
        name. Files without ``-`` in their name are renamed to ``*.eml``
        before indexing. Errors on single files are written to stderr and do
        not stop processing. The temp directory is removed afterwards, even
        if an exception propagates.

        :param pstfilename: path of the PST file to convert
        :param parameters: configuration dict; ``'id'`` is required, ``'tmp'``
            (base temp directory) and ``'container'`` are optional. The dict
            itself is not modified, the connector works on a copy.
        :param verbose: print scanned directories / files and pass the flag
            on to the connector
        :raises KeyError: if ``parameters`` has no ``'id'``
        :raises KeyboardInterrupt: passed through so the user can abort
        """
        if parameters is None:
            parameters = {}

        # we build the temp dirname ourselves instead of using
        # system_temp_dirname so we can use configurable / external tempdirs
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            # exist_ok avoids the race between "exists?" and "mkdir"
            os.makedirs(system_temp_dirname, exist_ok=True)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # md5 needs bytes; encode text ids but accept ids that are bytes already
        doc_id = parameters['id']
        if isinstance(doc_id, str):
            doc_id = doc_id.encode('UTF-8')
        h = hashlib.md5(doc_id)
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_pst_" + h.hexdigest()

        os.makedirs(temp_dirname, exist_ok=True)

        try:
            # start external PST extractor / converter
            result = subprocess.call(
                ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

            # a failed conversion is only reported; whatever was extracted
            # so far is still indexed below
            if result != 0:
                sys.stderr.write(
                    "Error: readpst failed for {}".format(pstfilename))

            # prepare document processing
            connector = Connector_File()
            connector.verbose = verbose
            connector.config = parameters.copy()

            # only set container if not yet set by a ZIP or PST before
            # (if this PST is inside another ZIP or PST)
            if 'container' not in connector.config:
                connector.config['container'] = pstfilename

            for dirName, subdirList, fileList in os.walk(temp_dirname):

                if verbose:
                    print('Scanning directory: %s' % dirName)

                for fileName in fileList:
                    if verbose:
                        print('Scanning file: %s' % fileName)

                    try:
                        # remove temp dirname from indexed id
                        contained_dirname = dirName.replace(temp_dirname, '', 1)

                        # build a virtual filename pointing to original PST file
                        if contained_dirname:
                            contained_dirname = contained_dirname + os.path.sep
                        else:
                            contained_dirname = os.path.sep

                        connector.config['id'] = \
                            parameters['id'] + contained_dirname + fileName

                        contained_filename = dirName + os.path.sep + fileName

                        # E-mail filenames are pure numbers
                        # Attachment file names are number-filename
                        # if there is no - in the filename, it is a mail file
                        # rename to suffix .eml so Tika will extract more
                        # metadata like from and to
                        if '-' not in fileName:
                            os.rename(contained_filename,
                                      contained_filename + '.eml')
                            contained_filename += '.eml'
                            connector.config['id'] += '.eml'

                        try:
                            connector.index_file(filename=contained_filename)

                        except KeyboardInterrupt:
                            raise

                        except BaseException as e:
                            # format the exception itself: Python 3
                            # exceptions have no .message attribute
                            sys.stderr.write(
                                "Exception while indexing contained content {} from {} : {}\n"
                                .format(fileName, connector.config['container'],
                                        e))

                        os.remove(contained_filename)

                    except KeyboardInterrupt:
                        # must not be swallowed by the catch-all handler below
                        raise

                    except BaseException as e:
                        sys.stderr.write(
                            "Exception while indexing file {} : {}\n".format(
                                fileName, e))
        finally:
            # do not leave extracted content behind if something failed
            shutil.rmtree(temp_dirname)
	def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
		"""Extract the response records of a WARC file and index each of them.

		The content of every record of type ``response`` is written to a temp
		file whose modification time is set to the record's ``WARC-Date``.
		The file is then handed to ``Connector_File.index_file`` with an id
		built from ``WARC-Target-URI`` and ``WARC-Record-ID``. Indexing errors
		of single records are written to stderr and do not stop processing.
		The temp directory is removed afterwards, even if an exception
		propagates.

		:param warcfilename: path of the WARC file to read
		:param parameters: configuration dict; ``'id'`` is required, ``'tmp'``
			(base temp directory) and ``'container'`` are optional. The dict
			itself is not modified, the connector works on a copy.
		:param verbose: print the headers of each processed record and pass
			the flag on to the connector
		:raises KeyError: if ``parameters`` has no ``'id'``
		:raises KeyboardInterrupt: passed through so the user can abort
		"""
		# local import so this method does not depend on the file's import block
		import calendar

		if parameters is None:
			parameters = {}

		# create temp dir where to unwarc the archive
		if 'tmp' in parameters:
			system_temp_dirname = parameters['tmp']
			# exist_ok avoids the race between "exists?" and "mkdir"
			os.makedirs(system_temp_dirname, exist_ok=True)
		else:
			system_temp_dirname = tempfile.gettempdir()

		# we build the temp dirname ourselves instead of using system_temp_dirname so we can use configurable / external tempdirs
		h = hashlib.md5(parameters['id'].encode('UTF-8'))
		temp_dirname = system_temp_dirname + os.path.sep + "opensemanticetl_enhancer_warc_" + h.hexdigest()

		os.makedirs(temp_dirname, exist_ok=True)

		try:
			# prepare document processing
			connector = Connector_File()
			connector.verbose = verbose
			connector.config = parameters.copy()

			# only set container if not yet set by an archive before (if this WARC is inside another archive)
			if 'container' not in connector.config:
				connector.config['container'] = warcfilename

			with open(warcfilename, 'rb') as stream:
				# i counts all records (not only responses) so temp file
				# names match the record position inside the WARC
				for i, record in enumerate(ArchiveIterator(stream), start=1):

					if record.rec_type != 'response':
						continue

					if verbose:
						print(record.rec_headers)

					# write WARC record content to tempfile
					tempfilename = temp_dirname + os.path.sep + 'warcrecord' + str(i)
					with open(tempfilename, 'wb') as tmpfile:
						tmpfile.write(record.content_stream().read())

					# set last modification time of the file to WARC-Date
					try:
						# the pattern expects a trailing "Z" (UTC), so convert
						# with timegm; mktime would treat it as local time
						last_modified = calendar.timegm(time.strptime(record.rec_headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'))
						os.utime(tempfilename, (last_modified, last_modified))
					except Exception as e:
						# best effort: a missing / malformed date must not
						# prevent indexing of the record
						sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(tempfilename, connector.config['container'], e))

					# set id (URL and WARC Record ID)
					connector.config['id'] = record.rec_headers.get_header('WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

					# index the extracted file
					try:
						connector.index_file(filename=tempfilename)

					except KeyboardInterrupt:
						raise

					except BaseException as e:
						sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(tempfilename, connector.config['container'], e))

					os.remove(tempfilename)
		finally:
			# do not leave extracted content behind if something failed
			shutil.rmtree(temp_dirname)
Exemplo n.º 5
0
if __name__ == "__main__":

    from optparse import OptionParser

    # command line interface of the worker: only two boolean switches
    parser = OptionParser("etl-tasks [options]")
    parser.add_option(
        "-q", "--quiet",
        action="store_true", dest="quiet", default=False,
        help="Don\'t print status (filenames) while indexing")
    parser.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose", default=False,
        help="Print debug messages")

    options, args = parser.parse_args()

    # hand the verbosity switch to this module and to the ETL objects
    if options.verbose in (False, True):
        verbose = options.verbose
        etl_delete.verbose = verbose
        etl_file.verbose = verbose
        etl_web.verbose = verbose
        etl_rss.verbose = verbose

    # only the file ETL has a quiet switch
    if options.quiet in (False, True):
        etl_file.quiet = options.quiet

    # start the worker of `app` (defined elsewhere in this file)
    app.worker_main()
	def pst2email(self, pstfilename, parameters=None, verbose=False):
		"""Convert a PST mailbox with ``readpst`` and index the extracted files.

		The external ``readpst`` tool writes the mailbox content to a temp
		directory (its name contains the process id and a hash of
		``parameters['id']``). Every file found there is handed to
		``Connector_File.index_file`` with a virtual id built from
		``parameters['id']``, the directory inside the mailbox and the file
		name. Files without ``-`` in their name are renamed to ``*.eml``
		before indexing. Errors on single files are written to stderr and do
		not stop processing. The temp directory is removed afterwards, even
		if an exception propagates.

		:param pstfilename: path of the PST file to convert
		:param parameters: configuration dict; ``'id'`` is required, ``'tmp'``
			(base temp directory) and ``'container'`` are optional. The dict
			itself is not modified, the connector works on a copy.
		:param verbose: print scanned directories / files and pass the flag
			on to the connector
		:raises KeyError: if ``parameters`` has no ``'id'``
		:raises KeyboardInterrupt: passed through so the user can abort
		"""
		if parameters is None:
			parameters = {}

		# we build the temp dirname ourselves instead of using system_temp_dirname so we can use configurable / external tempdirs
		if 'tmp' in parameters:
			system_temp_dirname = parameters['tmp']
			# exist_ok avoids the race between "exists?" and "mkdir"
			os.makedirs(system_temp_dirname, exist_ok=True)
		else:
			system_temp_dirname = tempfile.gettempdir()

		h = hashlib.md5(parameters['id'].encode('UTF-8'))
		temp_dirname = system_temp_dirname + os.path.sep + "opensemanticetl_enhancer_pst_" + str(os.getpid()) + "_" + h.hexdigest()

		os.makedirs(temp_dirname, exist_ok=True)

		try:
			# start external PST extractor / converter
			result = subprocess.call(['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

			# a failed conversion is only reported; whatever was extracted
			# so far is still indexed below
			if result != 0:
				sys.stderr.write("Error: readpst failed for {}".format(pstfilename))

			# prepare document processing
			connector = Connector_File()
			connector.verbose = verbose
			connector.config = parameters.copy()

			# only set container if not yet set by a ZIP or PST before (if this PST is inside another ZIP or PST)
			if 'container' not in connector.config:
				connector.config['container'] = pstfilename

			for dirName, subdirList, fileList in os.walk(temp_dirname):

				if verbose:
					print('Scanning directory: %s' % dirName)

				for fileName in fileList:
					if verbose:
						print('Scanning file: %s' % fileName)

					try:
						# remove temp dirname from indexed id
						contained_dirname = dirName.replace(temp_dirname, '', 1)

						# build a virtual filename pointing to original PST file
						if contained_dirname:
							contained_dirname = contained_dirname + os.path.sep
						else:
							contained_dirname = os.path.sep

						connector.config['id'] = parameters['id'] + contained_dirname + fileName

						contained_filename = dirName + os.path.sep + fileName

						# E-mail filenames are pure numbers
						# Attachment file names are number-filename
						# if there is no - in the filename, it is a mail file
						# rename to suffix .eml so Tika will extract more metadata like from and to
						if '-' not in fileName:
							os.rename(contained_filename, contained_filename + '.eml')
							contained_filename += '.eml'
							connector.config['id'] += '.eml'

						try:
							connector.index_file(filename=contained_filename)

						except KeyboardInterrupt:
							raise

						except BaseException as e:
							# format the exception itself: Python 3
							# exceptions have no .message attribute
							sys.stderr.write("Exception while indexing contained content {} from {} : {}\n".format(fileName, connector.config['container'], e))

						os.remove(contained_filename)

					except KeyboardInterrupt:
						# must not be swallowed by the catch-all handler below
						raise

					except BaseException as e:
						sys.stderr.write("Exception while indexing file {} : {}\n".format(fileName, e))
		finally:
			# do not leave extracted content behind if something failed
			shutil.rmtree(temp_dirname)