def _update(self, provider):
    """Fetch new files from the provider's FTP server and parse them.

    :param provider: ingest provider dict; ``config`` holds the FTP
        connection details, ``last_updated`` the datetime of the last run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP or parsing failure
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        # no local download folder configured - use a fresh temp dir
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    # single initialization; the original re-assigned items = [] again
    # inside the with-block, which was redundant
    items = []
    try:
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))
            ftp.set_pasv(config.get('passive', False))
            for filename, facts in ftp.mlsd():
                # only regular files with the expected suffix are ingested
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue  # already seen on a previous run
                dest = os.path.join(config['dest_path'], filename)
                try:
                    # 'xb' raises FileExistsError if the file was already
                    # downloaded, so each file is fetched at most once
                    with open(dest, 'xb') as f:
                        ftp.retrbinary('RETR %s' % filename, f.write)
                except FileExistsError:
                    continue
                xml = etree.parse(dest).getroot()
                parser = get_xml_parser(xml)
                if not parser:
                    raise IngestFtpError.ftpUnknownParserError(
                        Exception('Parser not found'), provider, filename)
                parsed = parser.parse_message(xml, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Fetch new files from the provider's FTP server and parse them.

    :param provider: ingest provider dict; ``config`` holds the FTP
        connection details, ``last_updated`` the datetime of the last run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP or parsing failure
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        # no local download folder configured - use a fresh temp dir
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    items = []
    try:
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))
            ftp.set_pasv(config.get('passive', False))
            # NOTE(review): this re-assignment is redundant - items was
            # already initialized to [] above
            items = []
            for filename, facts in ftp.mlsd():
                # only regular files with the expected suffix are ingested
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue  # already seen on a previous run
                dest = os.path.join(config['dest_path'], filename)
                try:
                    # 'xb' raises FileExistsError if the file was already
                    # downloaded, so each file is fetched at most once
                    with open(dest, 'xb') as f:
                        ftp.retrbinary('RETR %s' % filename, f.write)
                except FileExistsError:
                    continue
                xml = etree.parse(dest).getroot()
                parser = get_xml_parser(xml)
                if not parser:
                    raise IngestFtpError.ftpUnknownParserError(
                        Exception('Parser not found'), provider, filename)
                parsed = parser.parse_message(xml, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _list_files(self, ftp, provider):
    """Return (filename, modify-timestamp) pairs for files on the server.

    Uses MLSD when available; if the server rejects it with a 500 reply,
    falls back to parsing a plain ``LIST`` (``ftp.dir``) listing.

    :param ftp: connected ftplib.FTP instance
    :param provider: ingest provider dict (used for error reporting)
    :raises IngestFtpError: for non-500 listing failures
    """
    self._timer.start("ftp_list")
    try:
        return [(filename, facts["modify"])
                for filename, facts in ftp.mlsd()
                if facts.get("type") == "file"]
    except Exception as ex:
        # servers without MLSD support answer with a "500" reply code
        if "500" in str(ex):
            file_list = []
            file_name_list = []
            date_list = []
            ftp.dir(file_list.append)
            # NOTE(review): rebinds DATE_FORMAT on the instance so later
            # strptime calls can parse the LIST-style dates built below
            self.DATE_FORMAT = "%Y %b %d %H:%M:%S"
            for line in file_list:
                col = line.split()
                # LIST output carries no year; assume the current one -
                # TODO confirm behavior for listings spanning New Year
                date_string = "{} ".format(datetime.now().year) + " ".join(
                    col[5:8]) + ":00"
                date_list.append(date_string)
                # NOTE(review): col[8] truncates filenames containing
                # spaces - verify against the servers actually used
                file_name_list.append(col[8])
            return zip(file_name_list, date_list)
        else:
            raise IngestFtpError.ftpError(ex, provider)
    finally:
        self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
            self._timer.stop("ftp_list")))
def _list_files(self, ftp, provider): self._timer.start('ftp_list') try: return [(filename, facts['modify']) for filename, facts in ftp.mlsd() if facts.get('type') == 'file'] except Exception as ex: if '500' in str(ex): file_list = [] file_name_list = [] date_list = [] ftp.dir(file_list.append) self.DATE_FORMAT = '%Y %b %d %H:%M:%S' for line in file_list: col = line.split() date_string = '{} '.format(datetime.now().year) + ' '.join( col[5:8]) + ':00' date_list.append(date_string) file_name_list.append(col[8]) return zip(file_name_list, date_list) else: raise IngestFtpError.ftpError(ex, provider) finally: self._log_msg("FTP list files. Exec time: {:.4f} secs.".format( self._timer.stop('ftp_list')))
def _update(self, provider, update):
    """Ingest new files and record the newest modification time seen.

    :param provider: ingest provider dict
    :param update: dict updated in place; ``LAST_UPDATED`` is set to the
        newest file modification time observed this run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP or parsing failure
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    crt_last_updated = None
    if 'dest_path' not in config:
        # no local download folder configured - use a fresh temp dir
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            items = []
            for filename, facts in ftp.mlsd():
                # only regular files with the expected suffix are ingested
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue  # already seen on a previous run
                    elif not crt_last_updated or item_last_updated > crt_last_updated:
                        # NOTE(review): the high-water mark is only tracked
                        # when a previous last_updated exists - confirm
                        # first-run behavior is intended
                        crt_last_updated = item_last_updated
                local_file_path = os.path.join(config['dest_path'], filename)
                try:
                    # 'xb' raises FileExistsError if the file was already
                    # downloaded, so each file is fetched at most once
                    with open(local_file_path, 'xb') as f:
                        try:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                        except ftplib.all_errors as ex:
                            # drop the partial download and move on
                            os.remove(local_file_path)
                            logger.exception('Exception retrieving from FTP server')
                            continue
                except FileExistsError:
                    logger.exception('Exception retrieving from FTP server, file already exists')
                    continue
                registered_parser = self.get_feed_parser(provider)
                if isinstance(registered_parser, XMLFeedParser):
                    # XML parsers get the parsed root element
                    xml = etree.parse(local_file_path).getroot()
                    parser = self.get_feed_parser(provider, xml)
                    parsed = parser.parse(xml, provider)
                else:
                    # other parsers work from the local file path
                    parser = self.get_feed_parser(provider, local_file_path)
                    parsed = parser.parse(local_file_path, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _test(self, provider):
    """Probe the provider's FTP server: connect and list the directory.

    :param provider: ingest provider dict whose ``config`` holds the
        FTP connection settings
    :raises IngestFtpError: when connecting or listing fails
    """
    ftp_config = provider.get('config', {})
    try:
        with ftp_connect(ftp_config) as connection:
            connection.mlsd()
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Fetch new files from the provider's FTP server and parse them.

    :param provider: ingest provider dict; ``config`` holds the FTP
        connection details, ``last_updated`` the datetime of the last run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP or parsing failure
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        # no local download folder configured - use a fresh temp dir
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            items = []
            for filename, facts in ftp.mlsd():
                # only regular files with the expected suffix are ingested
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue  # already seen on a previous run
                local_file_path = os.path.join(config['dest_path'], filename)
                try:
                    # 'xb' raises FileExistsError if the file was already
                    # downloaded, so each file is fetched at most once
                    with open(local_file_path, 'xb') as f:
                        try:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                        except ftplib.all_errors as ex:
                            # drop the partial download and move on
                            os.remove(local_file_path)
                            logger.exception(
                                'Exception retrieving from FTP server')
                            continue
                except FileExistsError:
                    continue
                registered_parser = self.get_feed_parser(provider)
                if isinstance(registered_parser, XMLFeedParser):
                    # XML parsers get the parsed root element
                    xml = etree.parse(local_file_path).getroot()
                    parser = self.get_feed_parser(provider, xml)
                    parsed = parser.parse(xml, provider)
                else:
                    # other parsers work from the local file path
                    parser = self.get_feed_parser(provider, local_file_path)
                    parsed = parser.parse(local_file_path, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def ftp_connect(config):
    """Get ftp connection for given config. use with `with`

    The connection is closed when the ``with`` block exits, even when the
    block raises.

    :param config: dict with `host`, `username`, `password`, `path`, `passive` and `use_ftps`
    :raises IngestFtpError: on DNS failure (ftpHostError) or
        authentication failure (ftpAuthError)
    """
    if config.get("use_ftps", False):
        try:
            ftp = ftplib.FTP_TLS(config.get("host"), timeout=app.config.get("FTP_TIMEOUT", 300))
        except socket.gaierror as e:
            raise IngestFtpError.ftpHostError(exception=e)
        try:
            # explicit TLS handshake on the control channel
            ftp.auth()
        except ftplib.error_perm as ae:
            ftp.close()
            raise IngestFtpError.ftpAuthError(exception=ae)
    else:
        try:
            ftp = ftplib.FTP(config.get("host"), timeout=app.config.get("FTP_TIMEOUT", 300))
        except socket.gaierror as e:
            raise IngestFtpError.ftpHostError(exception=e)
    try:
        if config.get("username"):
            try:
                ftp.login(config.get("username"), config.get("password"))
            except ftplib.error_perm as e:
                raise IngestFtpError.ftpAuthError(exception=e)
        # set encryption on data channel if able
        if hasattr(ftp, "prot_p"):
            ftp.prot_p()
        if config.get("path"):
            ftp.cwd(config.get("path", "").lstrip("/"))
        if config.get("passive") is False:
            # only set this when not active, it's passive by default
            ftp.set_pasv(False)
        yield ftp
    finally:
        # close even when setup after connect or the caller's with-block
        # raises; the original leaked the connection in those cases
        ftp.close()
def _update(self, provider):
    """Fetch new files from the provider's FTP server and parse them.

    :param provider: ingest provider dict; ``config`` holds the FTP
        connection details, ``last_updated`` the datetime of the last run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP or parsing failure
    """
    config = provider.get("config", {})
    last_updated = provider.get("last_updated")
    if "dest_path" not in config:
        # no local download folder configured - use a fresh temp dir
        config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")
    try:
        with ftplib.FTP(config.get("host")) as ftp:
            ftp.login(config.get("username"), config.get("password"))
            ftp.cwd(config.get("path", ""))
            ftp.set_pasv(config.get("passive", False))
            items = []
            for filename, facts in ftp.mlsd():
                # only regular files with the expected suffix are ingested
                if facts.get("type", "") != "file":
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts["modify"], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue  # already seen on a previous run
                local_file_path = os.path.join(config["dest_path"], filename)
                try:
                    # "xb" raises FileExistsError if the file was already
                    # downloaded, so each file is fetched at most once
                    with open(local_file_path, "xb") as f:
                        try:
                            ftp.retrbinary("RETR %s" % filename, f.write)
                        except ftplib.all_errors as ex:
                            # drop the partial download and move on
                            os.remove(local_file_path)
                            logger.exception("Exception retrieving from FTP server")
                            continue
                except FileExistsError:
                    continue
                registered_parser = self.get_feed_parser(provider)
                if isinstance(registered_parser, XMLFeedParser):
                    # XML parsers get the parsed root element
                    xml = etree.parse(local_file_path).getroot()
                    parser = self.get_feed_parser(provider, xml)
                    parsed = parser.parse(xml, provider)
                else:
                    # other parsers work from the local file path
                    parser = self.get_feed_parser(provider, local_file_path)
                    parsed = parser.parse(local_file_path, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _test(self, provider):
    """Probe the provider's FTP server: connect and list the directory.

    Servers that do not support MLSD answer with a 500 reply; in that
    case NLST is tried as a fallback.

    :param provider: ingest provider dict whose ``config`` holds the
        FTP connection settings
    :raises IngestFtpError: when connecting or listing fails
    """
    config = provider.get("config", {})
    try:
        with ftp_connect(config) as ftp:
            try:
                ftp.mlsd()
            except Exception as ex:
                # fix: run the NLST fallback while the connection is
                # still open; the original called ftp.nlst() in the outer
                # except, after the context manager had closed the
                # connection (and ftp was unbound if connecting failed)
                if "500" in str(ex):
                    ftp.nlst()
                else:
                    raise
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def ftp_connect(config):
    """Get ftp connection for given config. use with `with`

    The connection is closed when the ``with`` block exits, even when the
    block raises.

    :param config: dict with `host`, `username`, `password`, `path` and `passive`
    :raises IngestFtpError: on DNS failure (ftpHostError) or
        authentication failure (ftpAuthError)
    """
    try:
        ftp = ftplib.FTP(config.get('host'), timeout=app.config.get('FTP_TIMEOUT', 300))
    except socket.gaierror as e:
        raise IngestFtpError.ftpHostError(exception=e)
    try:
        if config.get('username'):
            try:
                ftp.login(config.get('username'), config.get('password'))
            except ftplib.error_perm as e:
                raise IngestFtpError.ftpAuthError(exception=e)
        if config.get('path'):
            ftp.cwd(config.get('path', '').lstrip('/'))
        if config.get('passive') is False:  # only set this when not active, it's passive by default
            ftp.set_pasv(False)
        yield ftp
    finally:
        # close even when login/cwd or the caller's with-block raises;
        # the original leaked the connection in those cases
        ftp.close()
def test_raise_ftpUnknownParserError(self):
    """ftpUnknownParserError carries code 5001 and logs the filename."""
    with assert_raises(IngestFtpError) as error_context:
        try:
            raise Exception("Testing ftpUnknownParserError")
        except Exception as ex:
            raise IngestFtpError.ftpUnknownParserError(ex, self.provider, 'test.xml')
    exception = error_context.exception
    # assertEqual instead of assertTrue(a == b): same check, but a
    # failure reports both values
    self.assertEqual(exception.code, 5001)
    self.assertEqual(exception.message, "FTP parser could not be found")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing ftpUnknownParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "IngestFtpError Error 5001 - FTP parser could not be found: "
                     "Testing ftpUnknownParserError on channel TestProvider file=test.xml")
def _list_files(self, ftp, provider): self._timer.start("ftp_list") try: return [(filename, facts["modify"]) for filename, facts in ftp.mlsd() if facts.get("type") == "file"] except Exception as ex: if "500" in str(ex): now = utcnow() return [(file_name, now) for file_name in ftp.nlst()] else: raise IngestFtpError.ftpError(ex, provider) finally: self._log_msg("FTP list files. Exec time: {:.4f} secs.".format( self._timer.stop("ftp_list")))
def test_raise_ftpError(self):
    """ftpError wraps the original exception with code 5000 and logs it."""
    with assert_raises(IngestFtpError) as error_context:
        try:
            raise Exception("Testing ftpError")
        except Exception as ex:
            # fix: bind the exception in the except clause instead of
            # relying on a local assigned before the raise
            raise IngestFtpError.ftpError(ex, self.provider)
    exception = error_context.exception
    # assertEqual instead of assertTrue(a == b) for better failure output
    self.assertEqual(exception.code, 5000)
    self.assertEqual(exception.message, "FTP ingest error")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing ftpError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "IngestFtpError Error 5000 - FTP ingest error: "
                     "Testing ftpError on channel TestProvider")
def _list_items(self, ftp, provider): try: return [(filename, facts['modify']) for filename, facts in ftp.mlsd() if facts.get('type') == 'file'] except Exception as ex: if '500' in str(ex): file_list = [] file_name_list = [] date_list = [] ftp.dir(file_list.append) self.DATE_FORMAT = '%Y %b %d %H:%M:%S' for line in file_list: col = line.split() date_string = '{} '.format(datetime.now().year) + ' '.join(col[5:8]) + ':00' date_list.append(date_string) file_name_list.append(col[8]) return zip(file_name_list, date_list) else: raise IngestFtpError.ftpError(ex, provider)
def test_raise_ftpUnknownParserError(self):
    """ftpUnknownParserError has code 5001 and logs two error messages."""
    with assert_raises(IngestFtpError) as error_context:
        try:
            raise Exception("Testing ftpUnknownParserError")
        except Exception as ex:
            # fix: bind the exception in the except clause instead of
            # relying on a local assigned before the raise
            raise IngestFtpError.ftpUnknownParserError(ex, self.provider, 'test.xml')
    exception = error_context.exception
    # assertEqual replaces assertTrue(a == b) and the deprecated
    # assertEquals alias
    self.assertEqual(exception.code, 5001)
    self.assertEqual(exception.message, "FTP parser could not be found")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing ftpUnknownParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
    self.assertEqual(self.mock_logger_handler.messages['error'][1],
                     "IngestFtpError Error 5001 - FTP parser could not be found: "
                     "Testing ftpUnknownParserError on channel TestProvider")
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "Provider: TestProvider - File: test.xml unknown file format. "
                     "Parser couldn't be found.")
def test_raise_ftpUnknownParserError(self):
    """ftpUnknownParserError has code 5001 and logs two error messages."""
    with assert_raises(IngestFtpError) as error_context:
        try:
            raise Exception("Testing ftpUnknownParserError")
        except Exception as ex:
            # fix: bind the exception in the except clause instead of
            # relying on a local assigned before the raise
            raise IngestFtpError.ftpUnknownParserError(
                ex, self.provider, 'test.xml')
    exception = error_context.exception
    # assertEqual replaces assertTrue(a == b) and the deprecated
    # assertEquals alias
    self.assertEqual(exception.code, 5001)
    self.assertEqual(exception.message, "FTP parser could not be found")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0],
                     "Testing ftpUnknownParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
    self.assertEqual(
        self.mock_logger_handler.messages['error'][1],
        "IngestFtpError Error 5001 - FTP parser could not be found: "
        "Testing ftpUnknownParserError on channel TestProvider")
    self.assertEqual(
        self.mock_logger_handler.messages['error'][0],
        "Provider: TestProvider - File: test.xml unknown file format. "
        "Parser couldn't be found.")
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.
    """

    # registration name of this feeding service
    NAME = "ftp"

    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description(),
    ]

    label = "FTP feed"

    # UI field definitions for the provider configuration form
    fields = [
        {
            "id": "host",
            "type": "text",
            "label": "Host",
            "placeholder": "FTP Server URL",
            "required": True,
            "errors": {
                5003: "Server not found."
            },
        },
        {
            "id": "username",
            "type": "text",
            "label": "Username",
            "placeholder": "Username",
            "required": False,
            "errors": {
                5002: "Credentials error."
            },
        },
        {
            "id": "password",
            "type": "password",
            "label": "Password",
            "placeholder": "Password",
            "required": False
        },
        {
            "id": "path",
            "type": "text",
            "label": "Path",
            "placeholder": "FTP Server Path",
            "required": False
        },
        {
            "id": "dest_path",
            "type": "text",
            "label": "Local Path",
            "placeholder": "Local Path",
            "required": True
        },
        {
            "id": "passive",
            "type": "boolean",
            "label": "Passive",
            "placeholder": "Passive",
            "required": False,
            "default": True,
        },
        {
            "id": "move",
            "type": "boolean",
            "label": "Move items after ingestion",
            "placeholder": "Move items after ingestion",
            "required": False,
        },
        {
            "id": "ftp_move_path",
            "type": "text",
            "label": "Move ingested items to",
            "placeholder": "FTP Server Path, keep empty to use default path",
            "required": False,
            "show_expression": "provider.config.move === true",
        },
        {
            "id": "move_path_error",
            "type": "text",
            "label": "Move *NOT* ingested items (i.e. on error) to",
            "placeholder": "FTP Server Path, keep empty to use default path",
            "required": False,
            "show_expression": "provider.config.move === true",
        },
    ]

    # format of MLSD "modify" facts
    DATE_FORMAT = "%Y%m%d%H%M%S"

    # extensions ingested when the parser declares no ALLOWED_EXT
    ALLOWED_EXT_DEFAULT = {".json", ".xml"}

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        """
        url_parts = urlparse(url)
        return {
            "username": url_parts.username,
            "password": url_parts.password,
            "host": url_parts.hostname,
            "path": url_parts.path.lstrip("/"),
        }

    def _test(self, provider):
        # Connectivity check: connect and list the configured directory.
        config = provider.get("config", {})
        try:
            with ftp_connect(config) as ftp:
                ftp.mlsd()
        except IngestFtpError:
            raise
        except Exception as ex:
            if "500" in str(ex):
                # NOTE(review): fallback for servers without MLSD, but
                # `ftp` is used here after the with-block has already
                # closed the connection (and is unbound when connecting
                # itself failed) - verify
                ftp.nlst()
            else:
                raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest, file_modify, failed):
        """Move distant file

        file won't be moved if it is failed and last modification was made recently enough
        (i.e. before config's INGEST_OLD_CONTENT_MINUTES is expired).
        In other words, if a file fails, it will be tried again until
        INGEST_OLD_CONTENT_MINUTES delay expires.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        :param file_modify: date of last file modification
        :type file_modify: datetime
        :param failed: True if something when wrong during ingestion
        :type failed: bool
        """
        if failed and not self.is_old_content(file_modify):
            logger.warning(
                "{src!r} ingestion failed, but we are in the backstop delay, it will be "
                "tried again next time".format(src=src))
            return
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            # moving is best-effort; a failure is only logged
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: dir path to check
        :type src: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            if path.startswith("./"):
                ftp.cwd("/")
                ftp.mkd(path)
            elif not path.startswith("/"):
                ftp.mkd("/" + path)
            else:
                ftp.mkd(path)
        finally:
            # always return to the directory we started from
            ftp.cwd(base_path)

    def _create_move_folders(self, config, ftp):
        # Compute (and create if needed) the destination folders used to
        # move successfully ingested and failed files.
        if not config.get("ftp_move_path"):
            logger.debug("missing move_path, default will be used")
        move_path = os.path.join(
            config.get("path", ""),
            config.get("ftp_move_path") or DEFAULT_SUCCESS_PATH)
        if not config.get("move_path_error"):
            logger.debug("missing move_path_error, default will be used")
        move_path_error = os.path.join(
            config.get("path", ""),
            config.get("move_path_error") or DEFAULT_FAILURE_PATH)
        try:
            self._create_if_missing(ftp, move_path)
            self._create_if_missing(ftp, move_path_error)
        except ftplib.all_errors as e:
            logger.error(
                "Can't create move directory: {reason}".format(reason=e))
            raise e
        return move_path, move_path_error

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested."""
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _is_empty(self, file_path):
        """Test if given file path is empty, return True if a file is empty"""
        return not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0)

    def _list_files(self, ftp, provider):
        # List (filename, modify) pairs; fall back to parsing raw LIST
        # output when the server rejects MLSD with a 500 reply.
        self._timer.start("ftp_list")
        try:
            return [(filename, facts["modify"])
                    for filename, facts in ftp.mlsd()
                    if facts.get("type") == "file"]
        except Exception as ex:
            if "500" in str(ex):
                file_list = []
                file_name_list = []
                date_list = []
                ftp.dir(file_list.append)
                # NOTE(review): rebinds DATE_FORMAT on the instance so
                # later strptime calls parse the LIST-style dates below
                self.DATE_FORMAT = "%Y %b %d %H:%M:%S"
                for line in file_list:
                    col = line.split()
                    # LIST has no year column; assume the current year
                    date_string = "{} ".format(datetime.now().year) + " ".join(
                        col[5:8]) + ":00"
                    date_list.append(date_string)
                    # NOTE(review): col[8] truncates filenames containing
                    # spaces - verify against the servers actually used
                    file_name_list.append(col[8])
                return zip(file_name_list, date_list)
            else:
                raise IngestFtpError.ftpError(ex, provider)
        finally:
            self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
                self._timer.stop("ftp_list")))

    def _sort_files(self, files):
        # Sort (filename, modify) pairs by modification time, oldest first.
        self._timer.start("sort_files")
        files = sorted(files, key=lambda x: x[1])
        self._log_msg("Sort {} files. Exec time: {:.4f} secs.".format(
            len(files), self._timer.stop("sort_files")))
        return files

    def _retrieve_and_parse(self, ftp, config, filename, provider,
                            registered_parser):
        # Download one remote file into dest_path and parse it; raises
        # EmptyFile for zero-byte downloads and Exception on FTP errors.
        self._timer.start("retrieve_parse")
        if "dest_path" not in config:
            config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")
        local_file_path = os.path.join(config["dest_path"], filename)
        with open(local_file_path, "wb") as f:
            try:
                ftp.retrbinary("RETR %s" % filename, f.write)
                self._log_msg(
                    "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}."
                    .format(self._timer.split("retrieve_parse"),
                            os.path.getsize(local_file_path), filename))
            except ftplib.all_errors:
                self._log_msg(
                    "Download failed. Exec time: {:.4f} secs. File: {}.".
                    format(self._timer.stop("retrieve_parse"), filename))
                # drop the partial download before reporting the failure
                os.remove(local_file_path)
                # NOTE(review): the format string has no {filename}
                # placeholder, so the message omits the filename - verify
                raise Exception(
                    "Exception retrieving file from FTP server ((unknown))".
                    format(filename=filename))
        if self._is_empty(local_file_path):
            # NOTE(review): no {filename} placeholder here either
            logger.info(
                "ignoring empty file (unknown)".format(filename=filename))
            raise EmptyFile(local_file_path)
        if isinstance(registered_parser, XMLFeedParser):
            # XML parsers get the parsed root element
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            # other parsers work from the local file path
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)
        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop("retrieve_parse"), filename))
        return [parsed] if isinstance(parsed, dict) else parsed

    def _update(self, provider, update):
        # Generator: yields parsed items per file; the caller sends back a
        # "failed" flag used to decide where the remote file is moved.
        config = provider.get("config", {})
        do_move = config.get("move", False)
        last_processed_file_modify = provider.get(
            "private", {}).get("last_processed_file_modify")
        limit = app.config.get("FTP_INGEST_FILES_LIST_LIMIT", 100)
        registered_parser = self.get_feed_parser(provider)
        allowed_ext = getattr(registered_parser, "ALLOWED_EXT",
                              self.ALLOWED_EXT_DEFAULT)
        try:
            self._timer.start("ftp_connect")
            with ftp_connect(config) as ftp:
                self._log_msg(
                    "Connected to FTP server. Exec time: {:.4f} secs.".format(
                        self._timer.stop("ftp_connect")))
                files_to_process = []
                files = self._sort_files(self._list_files(ftp, provider))
                if do_move:
                    move_path, move_path_error = self._create_move_folders(
                        config, ftp)
                self._timer.start("files_to_process")
                for filename, modify in files:
                    # filter by extension
                    if not self._is_allowed(filename, allowed_ext):
                        # NOTE(review): no {filename} placeholder in this
                        # log message - the filename is not reported
                        logger.info(
                            "ignoring file (unknown) because of file extension"
                            .format(filename=filename))
                        continue
                    # filter by modify datetime
                    file_modify = datetime.strptime(
                        modify, self.DATE_FORMAT).replace(tzinfo=utc)
                    if last_processed_file_modify:
                        # ignore limit and add files for processing
                        if last_processed_file_modify == file_modify:
                            files_to_process.append((filename, file_modify))
                        elif last_processed_file_modify < file_modify:
                            # even if we have reached a limit, we must add at least one file to increment
                            # a `last_processed_file_modify` in provider
                            files_to_process.append((filename, file_modify))
                            # limit amount of files to process per ingest update
                            if len(files_to_process) >= limit:
                                break
                    else:
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                        # add files for processing
                        files_to_process.append((filename, file_modify))
                self._log_msg(
                    "Got {} file for processing. Exec time: {:.4f} secs.".
                    format(len(files_to_process),
                           self._timer.stop("files_to_process")))
                # process files
                self._timer.start("start_processing")
                for filename, file_modify in files_to_process:
                    try:
                        # persist progress so a crash resumes from here
                        update["private"] = {
                            "last_processed_file_modify": file_modify
                        }
                        failed = yield self._retrieve_and_parse(
                            ftp, config, filename, provider, registered_parser)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_path if not failed else move_path_error,
                                filename)
                            self._move(ftp, filename, move_dest_file_path,
                                       file_modify, failed=failed)
                    except EmptyFile:
                        # empty downloads are silently skipped
                        continue
                    except Exception as e:
                        # NOTE(review): no {filename} placeholder - the log
                        # line does not name the failing file
                        logger.error(
                            "Error while parsing (unknown): {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_path_error, filename)
                            self._move(ftp, filename,
                                       move_dest_file_path_error,
                                       file_modify, failed=True)
                self._log_msg(
                    "Processing finished. Exec time: {:.4f} secs.".format(
                        self._timer.stop("start_processing")))
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Ingest files from the FTP server, one yield per file.

    Generator: each file's parsed items are yielded and the caller sends
    back a "failed" flag used to decide where the remote file is moved.

    :param provider: ingest provider dict
    :param update: dict updated in place; ``private.last_processed_file_modify``
        tracks progress so a later run can resume
    :raises IngestFtpError: on any FTP failure
    """
    config = provider.get("config", {})
    do_move = config.get("move", False)
    last_processed_file_modify = provider.get(
        "private", {}).get("last_processed_file_modify")
    limit = app.config.get("FTP_INGEST_FILES_LIST_LIMIT", 100)
    registered_parser = self.get_feed_parser(provider)
    allowed_ext = getattr(registered_parser, "ALLOWED_EXT",
                          self.ALLOWED_EXT_DEFAULT)
    try:
        self._timer.start("ftp_connect")
        with ftp_connect(config) as ftp:
            self._log_msg(
                "Connected to FTP server. Exec time: {:.4f} secs.".format(
                    self._timer.stop("ftp_connect")))
            files_to_process = []
            files = self._sort_files(self._list_files(ftp, provider))
            if do_move:
                move_path, move_path_error = self._create_move_folders(
                    config, ftp)
            self._timer.start("files_to_process")
            for filename, modify in files:
                # filter by extension
                if not self._is_allowed(filename, allowed_ext):
                    # NOTE(review): no {filename} placeholder in this log
                    # message - the filename is not reported
                    logger.info(
                        "ignoring file (unknown) because of file extension"
                        .format(filename=filename))
                    continue
                # filter by modify datetime
                file_modify = datetime.strptime(
                    modify, self.DATE_FORMAT).replace(tzinfo=utc)
                if last_processed_file_modify:
                    # ignore limit and add files for processing
                    if last_processed_file_modify == file_modify:
                        files_to_process.append((filename, file_modify))
                    elif last_processed_file_modify < file_modify:
                        # even if we have reached a limit, we must add at least one file to increment
                        # a `last_processed_file_modify` in provider
                        files_to_process.append((filename, file_modify))
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                else:
                    # limit amount of files to process per ingest update
                    if len(files_to_process) >= limit:
                        break
                    # add files for processing
                    files_to_process.append((filename, file_modify))
            self._log_msg(
                "Got {} file for processing. Exec time: {:.4f} secs.".
                format(len(files_to_process),
                       self._timer.stop("files_to_process")))
            # process files
            self._timer.start("start_processing")
            for filename, file_modify in files_to_process:
                try:
                    # persist progress so a crash resumes from here
                    update["private"] = {
                        "last_processed_file_modify": file_modify
                    }
                    failed = yield self._retrieve_and_parse(
                        ftp, config, filename, provider, registered_parser)
                    if do_move:
                        move_dest_file_path = os.path.join(
                            move_path if not failed else move_path_error,
                            filename)
                        self._move(ftp, filename, move_dest_file_path,
                                   file_modify, failed=failed)
                except EmptyFile:
                    # empty downloads are silently skipped
                    continue
                except Exception as e:
                    # NOTE(review): no {filename} placeholder - the log
                    # line does not name the failing file
                    logger.error(
                        "Error while parsing (unknown): {msg}".format(
                            filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(
                            move_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error,
                                   file_modify, failed=True)
            self._log_msg(
                "Processing finished. Exec time: {:.4f} secs.".format(
                    self._timer.stop("start_processing")))
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Ingest new files, optionally moving them on the server afterwards.

    :param provider: ingest provider dict
    :param update: dict updated in place; ``LAST_UPDATED`` is set to the
        newest file modification time observed this run
    :return: list of lists of parsed items
    :raises IngestFtpError: on any FTP failure
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    crt_last_updated = None
    if config.get('move', False):
        do_move = True
        if not config.get('move_path'):
            logger.debug('missing move_path, default will be used')
        move_dest_path = os.path.join(
            config.get('path', ''),
            config.get('move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_dest_path_error = os.path.join(
            config.get('path', ''),
            config.get('move_path_error') or DEFAULT_FAILURE_PATH)
    else:
        do_move = False
    if 'dest_path' not in config:
        # no local download folder configured - use a fresh temp dir
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            if do_move:
                try:
                    self._create_if_missing(ftp, move_dest_path)
                    self._create_if_missing(ftp, move_dest_path_error)
                except ftplib.all_errors as e:
                    # degrade gracefully: keep ingesting, skip moving
                    logger.warning(
                        "Can't create move directory, files will not be moved: {reason}".format(
                            reason=e))
                    do_move = False
            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue
                try:
                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        # NOTE(review): bare raise with no active exception
                        # produces a RuntimeError, which the handler below
                        # logs and (if enabled) moves the file to the error
                        # folder - confirm this is intended rather than a
                        # plain `continue`
                        raise
                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue  # already seen on a previous run
                        elif not crt_last_updated or item_last_updated > crt_last_updated:
                            crt_last_updated = item_last_updated
                    local_file_path = os.path.join(config['dest_path'], filename)
                    try:
                        # 'xb' raises FileExistsError if the file was
                        # already downloaded
                        with open(local_file_path, 'xb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors as ex:
                                os.remove(local_file_path)
                                raise Exception(
                                    'Exception retrieving file from FTP server ((unknown))'.format(
                                        filename=filename))
                    except FileExistsError as e:
                        # fix: the original format string '({filename])' had
                        # a mismatched brace and raised ValueError instead of
                        # producing this message
                        raise Exception(
                            'Exception retrieving from FTP server, file already exists ({filename})'
                            .format(filename=local_file_path))
                    registered_parser = self.get_feed_parser(provider)
                    if isinstance(registered_parser, XMLFeedParser):
                        # XML parsers get the parsed root element
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        # other parsers work from the local file path
                        parser = self.get_feed_parser(provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]
                    items.append(parsed)
                    if do_move:
                        move_dest_file_path = os.path.join(move_dest_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    # NOTE(review): no {filename} placeholder - the log
                    # line does not name the failing file
                    logger.error("Error while parsing (unknown): {msg}".format(
                        filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(
                            move_dest_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
class FTPFeedingService(FeedingService):
    """Feeding service that ingests articles from a remote FTP directory.

    Files are listed (MLSD with an NLST/`dir` fallback for old servers),
    sorted by modification time, downloaded to a local temp dir and parsed;
    optionally processed files are moved to success/failure folders on the
    server.

    Fixes applied (review):
    - `_test` used `ftp.nlst()` *after* the `with` block had closed the
      connection (and `ftp` is unbound when connecting itself fails); the
      fallback now runs while the connection is open.
    - log/error messages contained literal `(unknown)` placeholders while
      passing unused `.format(filename=...)` kwargs; real `{filename}`
      placeholders restored.
    - `raise e` replaced with bare `raise` to preserve the traceback.
    - unused `items = []` removed from the generator-style `_update`.
    """

    NAME = 'ftp'
    ERRORS = [IngestFtpError.ftpUnknownParserError().get_error_description(),
              IngestFtpError.ftpError().get_error_description()]
    label = 'FTP feed'

    # UI form definition for provider configuration
    fields = [
        {
            'id': 'host', 'type': 'text', 'label': 'Host',
            'placeholder': 'FTP Server URL', 'required': True,
            'errors': {5003: 'Server not found.'}
        },
        {
            'id': 'username', 'type': 'text', 'label': 'Username',
            'placeholder': 'Username', 'required': False,
            'errors': {5002: 'Credentials error.'}
        },
        {
            'id': 'password', 'type': 'password', 'label': 'Password',
            'placeholder': 'Password', 'required': False
        },
        {
            'id': 'path', 'type': 'text', 'label': 'Path',
            'placeholder': 'FTP Server Path', 'required': False
        },
        {
            'id': 'dest_path', 'type': 'text', 'label': 'Local Path',
            'placeholder': 'Local Path', 'required': True
        },
        {
            'id': 'passive', 'type': 'boolean', 'label': 'Passive',
            'placeholder': 'Passive', 'required': False, 'default': True
        },
        {
            'id': 'move', 'type': 'boolean', 'label': 'Move items after ingestion',
            'placeholder': 'Move items after ingestion', 'required': False
        },
        {
            'id': 'ftp_move_path', 'type': 'text', 'label': 'Move ingested items to',
            'placeholder': 'FTP Server Path, keep empty to use default path',
            'required': False, 'show_expression': '{move}'
        },
        {
            'id': 'move_path_error', 'type': 'text',
            'label': 'Move *NOT* ingested items (i.e. on error) to',
            'placeholder': 'FTP Server Path, keep empty to use default path',
            'required': False, 'show_expression': '{move}'
        }
    ]

    # format of MLSD `modify` facts
    DATE_FORMAT = '%Y%m%d%H%M%S'

    # extensions ingested when the parser doesn't declare ALLOWED_EXT
    ALLOWED_EXT_DEFAULT = {'.json', '.xml'}

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that the configured server is reachable and listable."""
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                try:
                    ftp.mlsd()
                except ftplib.all_errors as ex:
                    # some servers answer MLSD with a 500; fall back to NLST
                    # while the connection is still open
                    if '500' in str(ex):
                        ftp.nlst()
                    else:
                        raise
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            # best effort: a failed move must not break ingestion
            logger.warning("Can't move file from {src} to {dest}: {reason}".format(
                src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _create_move_folders(self, config, ftp):
        """Resolve and create the success/failure move folders on the server.

        :return: tuple (move_path, move_path_error)
        :raises ftplib.all_errors: when a folder cannot be created
        """
        if not config.get('ftp_move_path'):
            logger.debug('missing move_path, default will be used')
        move_path = os.path.join(config.get('path', ''),
                                 config.get('ftp_move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_path_error = os.path.join(config.get('path', ''),
                                       config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        try:
            self._create_if_missing(ftp, move_path)
            self._create_if_missing(ftp, move_path_error)
        except ftplib.all_errors as e:
            logger.error("Can't create move directory: {reason}".format(reason=e))
            raise  # bare raise keeps the original traceback
        return move_path, move_path_error

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested."""
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _list_files(self, ftp, provider):
        """List remote files as (filename, modify-string) pairs.

        Falls back to parsing a raw `dir` listing when the server doesn't
        support MLSD (responds 500); in that case `self.DATE_FORMAT` is
        switched on the instance so later `strptime` calls in `_update`
        match the fallback date strings.
        """
        self._timer.start('ftp_list')
        try:
            return [(filename, facts['modify']) for filename, facts in ftp.mlsd()
                    if facts.get('type') == 'file']
        except Exception as ex:
            if '500' in str(ex):
                file_list = []
                file_name_list = []
                date_list = []
                ftp.dir(file_list.append)
                # NOTE: deliberately mutates the instance attribute so that
                # _update parses the fallback-format dates below
                self.DATE_FORMAT = '%Y %b %d %H:%M:%S'
                for line in file_list:
                    col = line.split()
                    # `dir` output has no year/seconds: assume current year
                    date_string = '{} '.format(datetime.now().year) + ' '.join(col[5:8]) + ':00'
                    date_list.append(date_string)
                    file_name_list.append(col[8])
                return zip(file_name_list, date_list)
            else:
                raise IngestFtpError.ftpError(ex, provider)
        finally:
            self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
                self._timer.stop('ftp_list')))

    def _sort_files(self, files):
        """Sort (filename, modify) pairs by modification time, oldest first."""
        self._timer.start('sort_files')
        files = sorted(files, key=lambda x: x[1])
        self._log_msg("Sort {} files. Exec time: {:.4f} secs.".format(
            len(files), self._timer.stop('sort_files')))
        return files

    def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
        """Download one remote file and parse it into a list of items.

        :raises Exception: when the download fails (partial file is removed)
        """
        self._timer.start('retrieve_parse')
        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        local_file_path = os.path.join(config['dest_path'], filename)
        with open(local_file_path, 'wb') as f:
            try:
                ftp.retrbinary('RETR %s' % filename, f.write)
                self._log_msg(
                    "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}.".format(
                        self._timer.split('retrieve_parse'),
                        os.path.getsize(local_file_path),
                        filename
                    )
                )
            except ftplib.all_errors:
                self._log_msg(
                    "Download failed. Exec time: {:.4f} secs. File: {}.".format(
                        self._timer.stop('retrieve_parse'),
                        filename
                    )
                )
                # don't leave a truncated download behind
                os.remove(local_file_path)
                raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                    filename=filename))
        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)
        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop('retrieve_parse'), filename
            )
        )
        return [parsed] if isinstance(parsed, dict) else parsed

    def _update(self, provider, update):
        """Generator-style update: yields parsed items batch by batch.

        Progress is tracked via `provider['private']['last_processed_file_modify']`
        so an interrupted run resumes where it stopped; at most
        FTP_INGEST_FILES_LIST_LIMIT files are processed per run.
        """
        config = provider.get('config', {})
        do_move = config.get('move', False)
        last_processed_file_modify = provider.get('private', {}).get('last_processed_file_modify')
        limit = app.config.get('FTP_INGEST_FILES_LIST_LIMIT', 100)
        registered_parser = self.get_feed_parser(provider)
        allowed_ext = getattr(registered_parser, 'ALLOWED_EXT', self.ALLOWED_EXT_DEFAULT)
        try:
            self._timer.start('ftp_connect')
            with ftp_connect(config) as ftp:
                self._log_msg("Connected to FTP server. Exec time: {:.4f} secs.".format(
                    self._timer.stop('ftp_connect')
                ))
                files_to_process = []
                files = self._sort_files(self._list_files(ftp, provider))
                if do_move:
                    move_path, move_path_error = self._create_move_folders(config, ftp)
                self._timer.start('files_to_process')
                for filename, modify in files:
                    # filter by extension
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info('ignoring file {filename} because of file extension'.format(
                            filename=filename))
                        continue
                    # filter by modify datetime
                    file_modify = datetime.strptime(modify, self.DATE_FORMAT).replace(tzinfo=utc)
                    if last_processed_file_modify:
                        # ignore limit and add files for processing
                        if last_processed_file_modify == file_modify:
                            files_to_process.append((filename, file_modify))
                        elif last_processed_file_modify < file_modify:
                            # even if we have reached a limit, we must add at least one file
                            # to increment a `last_processed_file_modify` in provider
                            files_to_process.append((filename, file_modify))
                            # limit amount of files to process per ingest update
                            if len(files_to_process) >= limit:
                                break
                    else:
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                        # add files for processing
                        files_to_process.append((filename, file_modify))
                self._log_msg(
                    "Got {} file for processing. Exec time: {:.4f} secs.".format(
                        len(files_to_process),
                        self._timer.stop('files_to_process')
                    )
                )
                # process files
                self._timer.start('start_processing')
                for filename, file_modify in files_to_process:
                    try:
                        update['private'] = {'last_processed_file_modify': file_modify}
                        # the consumer of this generator sends back a truthy value
                        # when ingestion of the yielded batch failed
                        failed = yield self._retrieve_and_parse(ftp, config, filename,
                                                                provider, registered_parser)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_path if not failed else move_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        logger.error("Error while parsing {filename}: {msg}".format(
                            filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(move_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)
                self._log_msg(
                    "Processing finished. Exec time: {:.4f} secs.".format(
                        self._timer.stop('start_processing'))
                )
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
class FTPFeedingService(FeedingService):
    """Feeding service that reads articles from an FTP-accessible directory.

    Files newer than the provider's `last_updated` are downloaded, parsed with
    the registered feed parser, and optionally moved to success/failure
    folders on the server.

    Fixes applied (review): log/error messages contained literal
    `(unknown)`/`((unknown))` text while passing unused `.format(...)` kwargs;
    real `{filename}` placeholders restored.
    """

    NAME = 'ftp'
    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]
    label = 'FTP feed API'

    # UI form definition for provider configuration.
    # NOTE(review): the move-destination field is registered as
    # 'ftp_move_path' but _update reads config.get('move_path') —
    # looks inconsistent; confirm which key the UI actually stores.
    fields = [{
        'id': 'host', 'type': 'text', 'label': 'Host',
        'placeholder': 'FTP Server URL', 'required': True,
        'errors': {
            5003: 'Server not found.'
        }
    }, {
        'id': 'username', 'type': 'text', 'label': 'Username',
        'placeholder': 'Username', 'required': False,
        'errors': {
            5002: 'Credentials error.'
        }
    }, {
        'id': 'password', 'type': 'password', 'label': 'Password',
        'placeholder': 'Password', 'required': False
    }, {
        'id': 'path', 'type': 'text', 'label': 'Path',
        'placeholder': 'FTP Server Path', 'required': False
    }, {
        'id': 'dest_path', 'type': 'text', 'label': 'Local Path',
        'placeholder': 'Local Path', 'required': True
    }, {
        'id': 'passive', 'type': 'boolean', 'label': 'Passive',
        'placeholder': 'Passive', 'required': False, 'default': True
    }, {
        'id': 'move', 'type': 'boolean', 'label': 'Move items after ingestion',
        'placeholder': 'Move items after ingestion', 'required': False
    }, {
        'id': 'ftp_move_path', 'type': 'text', 'label': 'Move ingested items to',
        'placeholder': 'FTP Server Path, keep empty to use default path',
        'required_expression': '{move}', 'show_expression': '{move}'
    }, {
        'id': 'move_path_error', 'type': 'text',
        'label': 'Move *NOT* ingested items (i.e. on error) to',
        'placeholder': 'FTP Server Path, keep empty to use default path',
        'required_expression': '{move}', 'show_expression': '{move}'
    }]

    # format of MLSD `modify` facts
    DATE_FORMAT = '%Y%m%d%H%M%S'

    # extensions ingested when the parser doesn't declare ALLOWED_EXT
    ALLOWED_EXT_DEFAULT = {'.json', '.xml'}

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that the configured server is reachable and listable."""
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                ftp.mlsd()
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            # best effort: a failed move must not break ingestion
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested."""
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _update(self, provider, update):
        """Fetch, parse and return new items from the FTP server.

        :param provider: ingest provider document (reads `config`, `last_updated`)
        :param update: dict of provider updates; `LAST_UPDATED` is set to the
            newest file modification time seen
        :return: list of lists of parsed items (one inner list per file)
        :raises IngestFtpError: on connection-level failures
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        registered_parser = self.get_feed_parser(provider)
        try:
            allowed_ext = registered_parser.ALLOWED_EXT
        except AttributeError:
            allowed_ext = self.ALLOWED_EXT_DEFAULT
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            if not config.get('move_path'):
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                config.get('path', ''),
                config.get('move_path') or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                config.get('path', ''),
                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False
        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        # moving is best-effort: keep ingesting without it
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not self._is_allowed(filename, allowed_ext):
                            logger.info(
                                'ignoring file {filename} because of file extension'
                                .format(filename=filename))
                            continue
                        if last_updated:
                            item_last_updated = datetime.strptime(
                                facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated <= last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated
                        local_file_path = os.path.join(config['dest_path'], filename)
                        with open(local_file_path, 'wb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors:
                                # don't leave a truncated download behind
                                os.remove(local_file_path)
                                raise Exception(
                                    'Exception retrieving file from FTP server ({filename})'
                                    .format(filename=filename))
                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(
                                provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)
                        if isinstance(parsed, dict):
                            parsed = [parsed]
                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # one bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_dest_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)
                if crt_last_updated:
                    update[LAST_UPDATED] = crt_last_updated
                return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Fetch new files from the FTP server, parse them and return items.

    Resumes from `provider['private']['last_processed_file_modify']` and
    processes at most FTP_INGEST_FILES_LIST_LIMIT files per run.

    :param provider: ingest provider document
    :param update: dict of provider updates; `private.last_processed_file_modify`
        is advanced after each successfully parsed file
    :return: flat list of parsed items
    :raises IngestFtpError: on connection-level failures

    Fix applied (review): log messages contained literal `(unknown)` text
    while passing unused `.format(filename=...)` kwargs; real `{filename}`
    placeholders restored.
    """
    config = provider.get('config', {})
    do_move = config.get('move', False)
    last_processed_file_modify = provider.get(
        'private', {}).get('last_processed_file_modify')
    limit = app.config.get('FTP_INGEST_FILES_LIST_LIMIT', 100)
    registered_parser = self.get_feed_parser(provider)
    allowed_ext = getattr(registered_parser,
                          'ALLOWED_EXT',
                          self.ALLOWED_EXT_DEFAULT)
    try:
        with ftp_connect(config) as ftp:
            items = []
            files_to_process = []
            files = self._sort_files(self._list_files(ftp, provider))
            if do_move:
                move_path, move_path_error = self._create_move_folders(
                    config, ftp)
            for filename, modify in files:
                # filter by extension
                if not self._is_allowed(filename, allowed_ext):
                    logger.info(
                        'ignoring file {filename} because of file extension'
                        .format(filename=filename))
                    continue
                # filter by modify datetime
                file_modify = datetime.strptime(
                    modify, self.DATE_FORMAT).replace(tzinfo=utc)
                if last_processed_file_modify:
                    # ignore limit and add files for processing
                    if last_processed_file_modify == file_modify:
                        files_to_process.append((filename, file_modify))
                    elif last_processed_file_modify < file_modify:
                        # even if we have reached a limit, we must add at least one file
                        # to increment a `last_processed_file_modify` in provider
                        files_to_process.append((filename, file_modify))
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                else:
                    # limit amount of files to process per ingest update
                    if len(files_to_process) >= limit:
                        break
                    # add files for processing
                    files_to_process.append((filename, file_modify))
            # process files
            for filename, file_modify in files_to_process:
                try:
                    items += self._retrieve_and_parse(
                        ftp, config, filename, provider, registered_parser)
                    update['private'] = {
                        'last_processed_file_modify': file_modify
                    }
                    if do_move:
                        move_dest_file_path = os.path.join(
                            move_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    # one bad file must not abort the whole run
                    logger.error(
                        "Error while parsing {filename}: {msg}".format(
                            filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(
                            move_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error)
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Fetch new files from the FTP server, parse them and return items.

    :param provider: ingest provider document (reads `config`, `last_updated`)
    :param update: dict of provider updates; `LAST_UPDATED` is set to the
        newest file modification time seen
    :return: flat list of parsed items
    :raises IngestFtpError: on connection-level failures

    Fix applied (review): log messages contained literal `(unknown)` text
    while passing unused `.format(filename=...)` kwargs; real `{filename}`
    placeholders restored.
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    registered_parser = self.get_feed_parser(provider)
    try:
        allowed_ext = registered_parser.ALLOWED_EXT
    except AttributeError:
        allowed_ext = self.ALLOWED_EXT_DEFAULT
    crt_last_updated = None
    if config.get('move', False):
        do_move = True
        if not config.get('ftp_move_path'):
            logger.debug('missing move_path, default will be used')
        move_dest_path = os.path.join(config.get('path', ''),
                                      config.get('ftp_move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_dest_path_error = os.path.join(config.get('path', ''),
                                            config.get('move_path_error') or DEFAULT_FAILURE_PATH)
    else:
        do_move = False
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            if do_move:
                try:
                    self._create_if_missing(ftp, move_dest_path)
                    self._create_if_missing(ftp, move_dest_path_error)
                except ftplib.all_errors as e:
                    # moving is best-effort: keep ingesting without it
                    logger.warning("Can't create move directory, files will not be moved: {reason}".format(
                        reason=e))
                    do_move = False
            items = []
            # NOTE(review): `facts` here is the raw modify-date string yielded
            # by _list_items, not an MLSD facts dict — confirm against
            # _list_items' return shape.
            for filename, facts in self._list_items(ftp, provider):
                try:
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info('ignoring file {filename} because of file extension'.format(
                            filename=filename))
                        continue
                    if last_updated:
                        item_last_updated = datetime.strptime(facts, self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated <= last_updated:
                            continue
                        elif not crt_last_updated or item_last_updated > crt_last_updated:
                            crt_last_updated = item_last_updated
                    items += self._retrieve_and_parse(ftp, config, filename, provider, registered_parser)
                    if do_move:
                        move_dest_file_path = os.path.join(move_dest_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    # one bad file must not abort the whole run
                    logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(move_dest_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
class FTPService(IngestService): """FTP Ingest Service.""" DATE_FORMAT = '%Y%m%d%H%M%S' FILE_SUFFIX = '.xml' PROVIDER = 'ftp' ERRORS = [ IngestFtpError.ftpUnknownParserError().get_error_description(), IngestFtpError.ftpError().get_error_description() ] def config_from_url(self, url): """Parse given url into ftp config. :param url: url in form `ftp://username:password@host:port/dir` """ url_parts = urlparse(url) return { 'username': url_parts.username, 'password': url_parts.password, 'host': url_parts.hostname, 'path': url_parts.path.lstrip('/'), } def _update(self, provider): config = provider.get('config', {}) last_updated = provider.get('last_updated') if 'dest_path' not in config: config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_') items = [] try: with ftplib.FTP(config.get('host')) as ftp: ftp.login(config.get('username'), config.get('password')) ftp.cwd(config.get('path', '')) ftp.set_pasv(config.get('passive', False)) items = [] for filename, facts in ftp.mlsd(): if facts.get('type', '') != 'file': continue if not filename.lower().endswith(self.FILE_SUFFIX): continue if last_updated: item_last_updated = datetime.strptime( facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc) if item_last_updated < last_updated: continue dest = os.path.join(config['dest_path'], filename) try: with open(dest, 'xb') as f: ftp.retrbinary('RETR %s' % filename, f.write) except FileExistsError: continue xml = etree.parse(dest).getroot() parser = get_xml_parser(xml) if not parser: raise IngestFtpError.ftpUnknownParserError( Exception('Parser not found'), provider, filename) parsed = parser.parse_message(xml, provider) if isinstance(parsed, dict): parsed = [parsed] items.append(parsed) return items except IngestFtpError: raise except Exception as ex: raise IngestFtpError.ftpError(ex, provider)
class FTPFeedingService(FeedingService):
    """Feeding service that reads `.xml` articles from an FTP directory.

    Files newer than the provider's `last_updated` are downloaded and parsed;
    optionally processed files are moved to success/failure folders on the
    server.

    Fixes applied (review):
    - bare `raise` with no active exception (RuntimeError) replaced with an
      explicit Exception carrying a useful message.
    - `'((unknown))'` and mismatched-brace `'({filename])'` format strings
      (the latter raises ValueError at format time) corrected to `{filename}`.
    - `(unknown)` log placeholder with unused `.format` kwargs corrected.
    """

    NAME = 'ftp'
    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]
    # only XML files are ingested by this service
    FILE_SUFFIX = '.xml'
    # format of MLSD `modify` facts
    DATE_FORMAT = '%Y%m%d%H%M%S'

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that the configured server is reachable and listable."""
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                ftp.mlsd()
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            # best effort: a failed move must not break ingestion
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _update(self, provider, update):
        """Fetch new files from the FTP server, parse them and return items.

        :param provider: ingest provider document (reads `config`, `last_updated`)
        :param update: dict of provider updates; `LAST_UPDATED` is set to the
            newest file modification time seen
        :return: list of lists of parsed items (one inner list per file)
        :raises IngestFtpError: on connection-level failures
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            if not config.get('move_path'):
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                config.get('path', ''),
                config.get('move_path') or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                config.get('path', ''),
                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False
        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        # moving is best-effort: keep ingesting without it
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not filename.lower().endswith(self.FILE_SUFFIX):
                            # was a bare `raise` (RuntimeError); raise explicitly so
                            # the file is routed to the error folder with a clear message
                            raise Exception(
                                'file extension is not allowed: {filename}'
                                .format(filename=filename))
                        if last_updated:
                            item_last_updated = datetime.strptime(
                                facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated < last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated
                        local_file_path = os.path.join(config['dest_path'], filename)
                        try:
                            # 'xb' fails if the file was already downloaded before
                            with open(local_file_path, 'xb') as f:
                                try:
                                    ftp.retrbinary('RETR %s' % filename, f.write)
                                except ftplib.all_errors:
                                    # don't leave a truncated download behind
                                    os.remove(local_file_path)
                                    raise Exception(
                                        'Exception retrieving file from FTP server ({filename})'
                                        .format(filename=filename))
                        except FileExistsError:
                            raise Exception(
                                'Exception retrieving from FTP server, file already exists ({filename})'
                                .format(filename=local_file_path))
                        registered_parser = self.get_feed_parser(provider)
                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(
                                provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)
                        if isinstance(parsed, dict):
                            parsed = [parsed]
                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # one bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_dest_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)
                if crt_last_updated:
                    update[LAST_UPDATED] = crt_last_updated
                return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file
    system and accessible using FTP.

    Downloads `.xml` files newer than the provider's `last_updated` into a
    local temp directory and parses them with the provider's registered
    feed parser.
    """

    NAME = 'ftp'
    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]
    # only XML files are ingested by this service
    FILE_SUFFIX = '.xml'
    # format of MLSD `modify` facts (e.g. 20240101123000)
    DATE_FORMAT = '%Y%m%d%H%M%S'

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _update(self, provider):
        """Fetch new XML files from the FTP server and return parsed items.

        :param provider: ingest provider document (reads `config` and
            `last_updated`)
        :return: list of lists of parsed items (one inner list per file)
        :raises IngestFtpError: wrapping any connection-level failure
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        if 'dest_path' not in config:
            # no local destination configured: use a fresh temp directory
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    # skip directories and other non-file entries
                    if facts.get('type', '') != 'file':
                        continue
                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue
                    if last_updated:
                        # skip files not modified since the last run
                        item_last_updated = datetime.strptime(
                            facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue
                    local_file_path = os.path.join(config['dest_path'], filename)
                    try:
                        # 'xb' mode fails with FileExistsError if the file was
                        # already downloaded by a previous run
                        with open(local_file_path, 'xb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors as ex:
                                # best-effort: drop the partial download and
                                # move on to the next file
                                os.remove(local_file_path)
                                logger.exception(
                                    'Exception retrieving from FTP server')
                                continue
                    except FileExistsError:
                        continue
                    registered_parser = self.get_feed_parser(provider)
                    if isinstance(registered_parser, XMLFeedParser):
                        # XML parsers are re-resolved against the parsed root
                        # element (the registered parser may be generic)
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        parser = self.get_feed_parser(provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]
                    items.append(parsed)
                return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Fetch new files from the FTP server, parse them and return items.

    :param provider: ingest provider document (reads `config`, `last_updated`)
    :param update: dict of provider updates; `LAST_UPDATED` is set to the
        newest file modification time seen
    :return: list of lists of parsed items (one inner list per file)
    :raises IngestFtpError: on connection-level failures

    Fix applied (review): log/error messages contained literal
    `(unknown)`/`((unknown))` text while passing unused `.format(...)` kwargs;
    real `{filename}` placeholders restored.
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    registered_parser = self.get_feed_parser(provider)
    try:
        allowed_ext = registered_parser.ALLOWED_EXT
    except AttributeError:
        allowed_ext = self.ALLOWED_EXT_DEFAULT
    crt_last_updated = None
    if config.get('move', False):
        do_move = True
        if not config.get('move_path'):
            logger.debug('missing move_path, default will be used')
        move_dest_path = os.path.join(
            config.get('path', ''),
            config.get('move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_dest_path_error = os.path.join(
            config.get('path', ''),
            config.get('move_path_error') or DEFAULT_FAILURE_PATH)
    else:
        do_move = False
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            if do_move:
                try:
                    self._create_if_missing(ftp, move_dest_path)
                    self._create_if_missing(ftp, move_dest_path_error)
                except ftplib.all_errors as e:
                    # moving is best-effort: keep ingesting without it
                    logger.warning(
                        "Can't create move directory, files will not be moved: {reason}"
                        .format(reason=e))
                    do_move = False
            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue
                try:
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info(
                            'ignoring file {filename} because of file extension'
                            .format(filename=filename))
                        continue
                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated <= last_updated:
                            continue
                        elif not crt_last_updated or item_last_updated > crt_last_updated:
                            crt_last_updated = item_last_updated
                    local_file_path = os.path.join(config['dest_path'], filename)
                    with open(local_file_path, 'wb') as f:
                        try:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                        except ftplib.all_errors:
                            # don't leave a truncated download behind
                            os.remove(local_file_path)
                            raise Exception(
                                'Exception retrieving file from FTP server ({filename})'
                                .format(filename=filename))
                    if isinstance(registered_parser, XMLFeedParser):
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        parser = self.get_feed_parser(
                            provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]
                    items.append(parsed)
                    if do_move:
                        move_dest_file_path = os.path.join(
                            move_dest_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    # one bad file must not abort the whole run
                    logger.error(
                        "Error while parsing {filename}: {msg}".format(
                            filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(
                            move_dest_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
# # For the full copyright and license information, please see the # AUTHORS and LICENSE files distributed with this source code, or # at https://www.sourcefabric.org/superdesk/license import os import ftplib import tempfile from datetime import datetime from superdesk.utc import utc from superdesk.etree import etree from superdesk.io import get_xml_parser, register_provider from .ingest_service import IngestService from superdesk.errors import IngestFtpError errors = [ IngestFtpError.ftpUnknownParserError().get_error_description(), IngestFtpError.ftpError().get_error_description() ] try: from urllib.parse import urlparse except ImportError: from urlparse import urlparse class FTPService(IngestService): """FTP Ingest Service.""" DATE_FORMAT = '%Y%m%d%H%M%S' FILE_SUFFIX = '.xml'