def __init__(self, source_file, test_parse=False, source_directory=None,
             working_directory=None, testing=False):
    """
    Prepare the extractor for source_file and convert it to text.

    source_directory and working_directory default to the values held by
    the loaded Config; passing them explicitly is generally for test
    purposes. Raises ValueError when the detected file type is not a PDF
    document.
    """
    if not testing:
        # A real (non-test) run needs both the logger and the config loaded
        assert Config.logger
        assert Config.config

    dtpo_log("debug", "TextExtractor -> %s", source_file)

    # Fall back to the configured directories when none were passed in
    if source_directory is None:
        source_directory = Config.config.get_source_directory()
    if working_directory is None:
        working_directory = Config.config.get_working_directory()

    source_path = source_directory + "/" + source_file
    text_path = working_directory + "/" + source_file + ".txt"

    self.source_file = source_path
    self.text_file = text_path
    self.file_array = []
    self.status = False
    self.file_type, self.mime_type = get_file_type(self.source_file)

    # Only PDF documents are handled - reject anything else up front
    if str(self.file_type) != "k.PDF_Document":
        error_message = "TextExtractor - Invalid File Type for {0}".format(self.source_file)
        dtpo_log("error", error_message)
        raise ValueError(error_message)

    self.parse_pdf(test_parse)
def get_file_type(source_file):
    """
    Report the document type and MIME type for source_file.

    TODO Implement other types - the file itself is not inspected yet and
    every file is reported as a PDF document.
    """
    dtpo_log("info", "get_file_type for %s - needs fully implementing", source_file)
    file_type = k.PDF_Document
    mime_type = "application/pdf"
    return file_type, mime_type
def parse_pdf(self, test_parse=False):
    """
    Convert the source PDF to text and load its lines into self.file_array.

    The text is first written to self.text_file. That intermediate file is
    removed again afterwards unless test_parse is set.

    Raises DTPOFileError if either file cannot be opened or if the PDF
    cannot be parsed.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

    # input options
    pagenos = set()
    maxpages = 0
    # output option
    codec = "utf-8"
    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0

    rsrcmgr = PDFResourceManager(caching=caching)

    try:
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))

    # From here on every exit path must release the handles opened so far,
    # including the failure paths that raise DTPOFileError.
    try:
        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            raise DTPOFileError(self.source_file, 0, str(io_error))

        device = None
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        caching=caching, check_extractable=True)
        except PDFException as pdf_error:
            message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
            raise DTPOFileError(self.source_file, 0, message)
        except Exception as exception:
            message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
            raise DTPOFileError(self.source_file, 0, message)
        finally:
            fp.close()
            # device stays None if the converter itself failed to construct
            if device is not None:
                device.close()
    finally:
        outfp.close()

    # Got the PDF converted = now get it into an array
    with open(self.text_file) as text_fp:
        self.file_array = text_fp.readlines()

    # Remove the last entry - it's always '\x0c'
    if self.file_array:
        del self.file_array[-1]

    # Remove the outfile
    if not test_parse:
        os.remove(self.text_file)
def parse_pattern_file(self, config_file):
    """
    Parse the pattern file, extracting the relevant details.

    Each line is split into a key and value by parse_line and handed to
    self.process_pattern_value. Once the whole file has been read, the
    mandatory defaults are checked and the final pattern is validated and
    appended to self.file_pattern_list.

    Raises DTPOFileError if the file cannot be read, a line fails to
    parse, or a non-optional default is missing.
    """
    current_file_pattern = None
    dtpo_log('debug', "Parsing pattern file -> '%s'", config_file)

    # Stays 0 when the file cannot be opened or is empty
    line_number = 0
    try:
        # The with block closes the file even when a line fails to parse
        with open(config_file) as pattern_file:
            for line_number, line in enumerate(pattern_file, 1):
                key, value = parse_line(line, line_number)
                # if we found something then process it
                if key or value:
                    current_file_pattern = self.process_pattern_value(
                        current_file_pattern, key, value)

        # check that the defaults are there - do this first in case the
        # file is corrupt - that way we fail gracefully
        for key in self.pattern_keys:
            spec = self.pattern_keys[key]
            # getattr replaces the original eval() of a built expression;
            # it reads the same attribute without executing arbitrary text
            if spec['type'] == 'default' and \
                    not spec['optional'] and \
                    getattr(self, spec['variable']) is None:
                raise DTPOFileError(config_file, line_number,
                                    "Missing default -> '{0}'".format(key))

        # Validate that the last record is good & then add it to the list
        self.check_file_pattern_complete(current_file_pattern)
        self.file_pattern_list.append(current_file_pattern)
    except ParseError as parse_exception:
        raise DTPOFileError(config_file, line_number, parse_exception.message)
    except IOError as io_exception:
        # Failed to access the config file
        raise DTPOFileError(config_file, 0,
                            "Error accessing config file -> '{0}'".format(str(io_exception)))
def __init__(self, config_file):
    """
    Build the parse specification by reading the pattern file config_file.
    """
    dtpo_log('debug', 'DTPOParseSpec. Source File -> %s', config_file)

    # Defaults start out unset
    self.default_database = self.default_group = self.default_tag = None

    # Containers filled in by the two calls below
    self.file_pattern_list = []
    self.string1_search_dict = {}
    self.string2_search_dict = {}
    self.date_search_dict = {}

    self.parse_pattern_file(config_file)
    self.create_reference_lists()
def get_import_parameters(source_file, pattern_spec, test_parse):
    """
    Work out the DTPO import parameters for source_file using pattern_spec.

    The file is converted to text by TextExtractor, matched against the
    spec by parse_source_file, and the resulting parameters are tagged
    with the extractor's file type and MIME type before being returned.
    """
    dtpo_log('debug', "get_import_parameters source_file -> %s", source_file)

    # Turn the file into a list of text lines
    extractor = TextExtractor(source_file, test_parse=test_parse)

    # Match the text against the spec, then record what kind of file it was
    parameters = parse_source_file(extractor, pattern_spec)
    parameters.file_type = extractor.file_type
    parameters.mime_type = extractor.mime_type
    return parameters
def parse_line(line, line_number):
    """
    Split a 'key::value' line into a (key, value) tuple.

    Returns (False, "") when the line holds no key/value pair, for example
    when it starts with '#'. Anything from a '#' onwards is left out of
    the value.
    """
    dtpo_log('debug', "%04d -> '%s'", line_number, line)

    # Want lines that dont start with #
    match = re.match('(^[^#]*)::(.[^#]*)', line.lstrip())
    if not match:
        return (False, "")

    key = match.group(1).lstrip()
    value = match.group(2).rstrip()
    dtpo_log('debug', "key -> '%s', value -> '%s'", key, value)
    return (key, value)
def main():
    """
    Command line entry point.

    Reads the options, loads the config and the pattern spec, then
    processes each file named on the command line. With --test_parse the
    import details are only printed; otherwise the file is imported and
    then trashed.

    Raises SystemExit if the config or pattern file cannot be loaded.
    """
    p = optparse.OptionParser()
    # -d and --debug are two spellings of the same flag
    p.add_option("-d", "--debug", action="store_true", dest="debug")
    p.add_option("--config_file", action="store", dest="config_file")
    p.add_option("--test_parse", action="store_true", dest="test_parse")
    p.set_defaults(debug=False)
    opts, source_file_args = p.parse_args()

    try:
        # Config File is mandatory
        if not opts.config_file:
            raise ParseError("No Config file")
        #
        # Upload the configs
        #
        Config(opts.config_file)
        pattern_spec = DTPOParseSpec(Config.config.get_pattern_file())
    except DTPOFileError as file_error:
        dtpo_alert(log_type='fatal', reason=file_error.message)
        raise SystemExit("FATAL ERROR - Failed to parse config file")
    except ParseError as parse_error:
        dtpo_alert('fatal', reason=parse_error.message)
        raise SystemExit("FATAL ERROR - Failed to parse pattern file")

    #
    # Now iterate through the files
    #
    for source_file in source_file_args:
        dtpo_log('info', "Started processing -> %s", source_file)

        try:
            # TODO - we're assuming PDF files here
            # Check that the file name actually ends in .pdf; if not rename
            # it as it will save trouble with DTPO later. The dot is part of
            # the test so that a name such as 'scanpdf' still gets renamed.
            if not source_file.lower().endswith('.pdf'):
                dtpo_log('debug', "Adding pdf suffix on to '%s'", source_file)
                source_dir = Config.config.get_source_directory() + '/'
                os.rename(source_dir + source_file,
                          source_dir + source_file + '.pdf')
                source_file += '.pdf'

            #
            # Convert the file to text if we can and then parse it
            #
            import_details = get_import_parameters(source_file, pattern_spec,
                                                   opts.test_parse)
            if opts.test_parse:
                import_details.print_import_details(source_file)
            else:
                execute_import(import_details)
                trash_file(source_file, import_details.get_document_name())
                dtpo_alert('info',
                           file_name=import_details.get_document_name(),
                           group_name=import_details.group)
        except DTPOFileError as file_error:
            # We failed ... Leave the file be as there is a problem with it
            dtpo_log('error', "Import failed for '%s' - file not touched\n%s",
                     basename(source_file), file_error.message)
            dtpo_alert('fatal', reason=file_error.message,
                       file_name=source_file)
        except ParseError as parse_error:
            # We failed ... Move the file to the Orphan directory
            dtpo_log('error', "Import failed for '%s' - orphaning file\n%s",
                     basename(source_file), parse_error.message)
            dtpo_alert('error', reason=parse_error.message,
                       file_name=source_file)
            orphan_file(source_file)
        except Exception as exception:
            # Something horrible has happened - report it and carry on with
            # the next file
            dtpo_log('fatal', "System error for '%s'\n%s",
                     basename(source_file), str(exception))
            dtpo_alert('fatal', reason=str(exception), file_name=source_file)

    dtpo_log('debug', 'Completed Successfully')
def execute_import(import_parameters):
    """
    Run the actual import of one document into DEVONthink Pro.

    The target database is looked up among the open databases (and opened
    if it is not among them), the group is created or located, the
    document is imported, and finally its unread flag, tags and URL are
    set. A 'warn' alert is sent when DEVONthink reports duplicates.

    Returns True on success.

    Raises ParseError when one of the steps fails with an AttributeError.
    Any other exception is re-raised as a plain Exception whose message
    carries the original type and text.
    """
    assert import_parameters.source_file
    assert import_parameters.file_type
    assert import_parameters.mime_type
    assert import_parameters.group
    assert import_parameters.tags

    source_file = import_parameters.source_file
    database = Config.config.get_database_directory() + '/' + \
        import_parameters.database
    document_name = import_parameters.get_document_name()

    dtpo_log('info', "execute_import source file -> %s", source_file)
    dtpo_log('info', "execute_import database -> %s", database)
    dtpo_log('info', "execute_import group -> %s", import_parameters.group)
    dtpo_log('info', "execute_import tags -> %s", import_parameters.tags)
    dtpo_log('info', "execute_import document name -> %s", document_name)

    try:
        try:
            # First see if the relevant database is open already
            dtpo_db_id = None
            dt = app(u'DEVONthink Pro')
            for dtpo_db in dt.databases.get():
                if dtpo_db.path() == database:
                    dtpo_db_id = dtpo_db.id()
                    break
            if dtpo_db_id is None:
                dtpo_db = dt.open_database(database)
                dtpo_db_id = dtpo_db.id()
        except AttributeError as attribute_error:
            message = "Failed to open database {0} -> {1}".format(
                import_parameters.database, str(attribute_error))
            raise ParseError(message)

        try:
            dtpo_group = dt.create_location(
                import_parameters.group,
                in_=app.databases.ID(dtpo_db_id))
            # get the group to check that it's there
            dtpo_group_id = dtpo_group.id()  #pylint: disable-msg=W0612
        except AttributeError as attribute_error:
            message = "Failed access group {0} -> {1}".format(
                import_parameters.group, str(attribute_error))
            raise ParseError(message)

        try:
            doc = dt.import_(
                import_parameters.source_file,
                name=document_name,
                to=dtpo_group)
            docid = doc.id()
        except AttributeError as attribute_error:
            message = "Failed import document {0} -> {1}".format(
                document_name, str(attribute_error))
            raise ParseError(message)

        try:
            # One reference to the imported record, reused for every property
            record = dt.databases.ID(dtpo_db_id).contents.ID(docid)
            record.unread.set(True)
            record.tags.set(import_parameters.tags)
            record.URL.set('')
            duplicate = record.number_of_duplicates.get()
            if int(duplicate) > 0:
                dtpo_alert('warn',
                           reason='{0} duplicates of '.format(duplicate),
                           file_name=document_name)
        except AttributeError as attribute_error:
            message = "Failed set attributes {0} -> {1}".format(
                import_parameters.get_document_name(), str(attribute_error))
            raise ParseError(message)
    except ParseError:
        # Pass our own errors through untouched; a bare raise keeps the
        # original traceback
        raise
    except Exception as exception:
        ex_type = type(exception)
        message = "Unexpected exception {0} -> {1}".format(
            ex_type, str(exception))
        raise Exception(message)

    return True