Exemplo n.º 1
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Regular files whose name ends with ``self.FILE_SUFFIX`` and which are
        not older than the provider's ``last_updated`` are downloaded into
        ``config['dest_path']`` (a new temporary directory when the key is
        missing) and parsed as XML.

        :param provider: ingest provider dict; ``config`` and
            ``last_updated`` are read from it
        :return: list with one entry per ingested file; each entry is what
            the parser returned, a single dict being wrapped in a list
        :raises IngestFtpError: ``ftpUnknownParserError`` when no parser is
            found for a file, ``ftpError`` for any other failure
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))
                ftp.set_pasv(config.get('passive', False))

                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'],
                            self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    try:
                        # 'x' mode fails when the file already exists
                        # locally; such a file is skipped.
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue
                    except Exception:
                        # Remove the partial download: if it stayed on disk,
                        # the 'x' open above would skip this file on every
                        # later run and it would never be ingested.
                        if os.path.exists(dest):
                            os.remove(dest)
                        raise

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(
                            Exception('Parser not found'), provider, filename)
                    parsed = parser.parse_message(xml, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider) from ex
Exemplo n.º 2
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Regular files whose name ends with ``self.FILE_SUFFIX`` and which are
        not older than the provider's ``last_updated`` are downloaded into
        ``config['dest_path']`` (a new temporary directory when the key is
        missing) and parsed as XML.

        :param provider: ingest provider dict; ``config`` and
            ``last_updated`` are read from it
        :return: list with one entry per ingested file; each entry is what
            the parser returned, a single dict being wrapped in a list
        :raises IngestFtpError: ``ftpUnknownParserError`` when no parser is
            found for a file, ``ftpError`` for any other failure
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))
                ftp.set_pasv(config.get('passive', False))

                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    try:
                        # 'x' mode fails when the file already exists
                        # locally; such a file is skipped.
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue
                    except Exception:
                        # A partial download left on disk would make the 'x'
                        # open above skip this file on every later run.
                        if os.path.exists(dest):
                            os.remove(dest)
                        raise

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(Exception('Parser not found'),
                                                                   provider, filename)
                    parsed = parser.parse_message(xml, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider) from ex
Exemplo n.º 3
0
 def _list_files(self, ftp, provider):
     self._timer.start("ftp_list")
     try:
         return [(filename, facts["modify"])
                 for filename, facts in ftp.mlsd()
                 if facts.get("type") == "file"]
     except Exception as ex:
         if "500" in str(ex):
             file_list = []
             file_name_list = []
             date_list = []
             ftp.dir(file_list.append)
             self.DATE_FORMAT = "%Y %b %d %H:%M:%S"
             for line in file_list:
                 col = line.split()
                 date_string = "{} ".format(datetime.now().year) + " ".join(
                     col[5:8]) + ":00"
                 date_list.append(date_string)
                 file_name_list.append(col[8])
             return zip(file_name_list, date_list)
         else:
             raise IngestFtpError.ftpError(ex, provider)
     finally:
         self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
             self._timer.stop("ftp_list")))
Exemplo n.º 4
0
 def _list_files(self, ftp, provider):
     self._timer.start('ftp_list')
     try:
         return [(filename, facts['modify'])
                 for filename, facts in ftp.mlsd()
                 if facts.get('type') == 'file']
     except Exception as ex:
         if '500' in str(ex):
             file_list = []
             file_name_list = []
             date_list = []
             ftp.dir(file_list.append)
             self.DATE_FORMAT = '%Y %b %d %H:%M:%S'
             for line in file_list:
                 col = line.split()
                 date_string = '{} '.format(datetime.now().year) + ' '.join(
                     col[5:8]) + ':00'
                 date_list.append(date_string)
                 file_name_list.append(col[8])
             return zip(file_name_list, date_list)
         else:
             raise IngestFtpError.ftpError(ex, provider)
     finally:
         self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
             self._timer.stop('ftp_list')))
Exemplo n.º 5
0
    def _update(self, provider, update):
        """Download and parse new files from the provider's FTP directory.

        Only regular files ending with ``self.FILE_SUFFIX`` are considered.
        When the provider has a ``last_updated`` value, older files are
        skipped and the newest modification time seen is stored in
        ``update[LAST_UPDATED]``.

        :param provider: ingest provider dict (``config``, ``last_updated``)
        :param update: dict receiving the new ``LAST_UPDATED`` value
        :return: list with one entry per ingested file
        :raises IngestFtpError: when connecting, listing or parsing fails
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        newest_seen = None

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    is_file = facts.get('type', '') == 'file'
                    if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if last_updated:
                        modified = datetime.strptime(facts['modify'], self.DATE_FORMAT)
                        modified = modified.replace(tzinfo=utc)
                        if modified < last_updated:
                            continue
                        if not newest_seen or modified > newest_seen:
                            newest_seen = modified

                    local_file_path = os.path.join(config['dest_path'], filename)
                    try:
                        # exclusive create: an already present local file
                        # raises FileExistsError and is skipped
                        with open(local_file_path, 'xb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors:
                                os.remove(local_file_path)
                                logger.exception('Exception retrieving from FTP server')
                                continue
                    except FileExistsError:
                        logger.exception('Exception retrieving from FTP server, file already exists')
                        continue

                    # XML parsers receive the parsed root element, any other
                    # parser receives the path of the downloaded file
                    if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                        source = etree.parse(local_file_path).getroot()
                    else:
                        source = local_file_path
                    parsed = self.get_feed_parser(provider, source).parse(source, provider)

                    items.append([parsed] if isinstance(parsed, dict) else parsed)
            if newest_seen:
                update[LAST_UPDATED] = newest_seen
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 6
0
 def _test(self, provider):
     """Check that the provider's FTP server can be reached and listed.

     :param provider: ingest provider dict; its ``config`` is used to connect
     :raises IngestFtpError: when connecting or listing fails
     """
     config = provider.get('config', {})
     try:
         with ftp_connect(config) as ftp:
             # FTP.mlsd() is a generator: nothing is sent to the server
             # until it is iterated, so fetch the first entry to really
             # exercise the listing.
             next(iter(ftp.mlsd()), None)
     except IngestFtpError:
         raise
     except Exception as ex:
         raise IngestFtpError.ftpError(ex, provider) from ex
Exemplo n.º 7
0
 def _test(self, provider):
     """Check that the provider's FTP server can be reached and listed.

     :param provider: ingest provider dict; its ``config`` is used to connect
     :raises IngestFtpError: when connecting or listing fails
     """
     config = provider.get('config', {})
     try:
         with ftp_connect(config) as ftp:
             # FTP.mlsd() returns a lazy generator; without iterating it no
             # MLSD command is issued and the check would verify nothing.
             next(iter(ftp.mlsd()), None)
     except IngestFtpError:
         raise
     except Exception as ex:
         raise IngestFtpError.ftpError(ex, provider) from ex
Exemplo n.º 8
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Only regular files ending with ``self.FILE_SUFFIX`` and not older
        than the provider's ``last_updated`` are fetched into
        ``config['dest_path']`` and handed to the provider's feed parser.

        :param provider: ingest provider dict (``config``, ``last_updated``)
        :return: list with one entry per ingested file
        :raises IngestFtpError: when connecting, listing or parsing fails
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    is_file = facts.get('type', '') == 'file'
                    if not (is_file
                            and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if last_updated:
                        modified = datetime.strptime(facts['modify'],
                                                     self.DATE_FORMAT)
                        if modified.replace(tzinfo=utc) < last_updated:
                            continue

                    local_file_path = os.path.join(config['dest_path'],
                                                   filename)
                    try:
                        # exclusive create: an already present local file
                        # raises FileExistsError and is skipped
                        with open(local_file_path, 'xb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors:
                                os.remove(local_file_path)
                                logger.exception(
                                    'Exception retrieving from FTP server')
                                continue
                    except FileExistsError:
                        continue

                    # XML parsers receive the parsed root element, any other
                    # parser receives the path of the downloaded file
                    if isinstance(self.get_feed_parser(provider),
                                  XMLFeedParser):
                        source = etree.parse(local_file_path).getroot()
                    else:
                        source = local_file_path
                    parser = self.get_feed_parser(provider, source)
                    parsed = parser.parse(source, provider)

                    items.append(
                        [parsed] if isinstance(parsed, dict) else parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 9
0
def ftp_connect(config):
    """Get ftp connection for given config.

    use with `with` (generator yielding the connected client once; the
    context-manager decorator is presumably applied outside this view)

    The connection is closed when the generator is finalised, also when
    login, directory change or the ``with`` body raises.

    :param config: dict with `host`, `username`, `password`, `path`, `passive` and `use_ftps`
    :raises IngestFtpError: ``ftpHostError`` when the host cannot be
        resolved, ``ftpAuthError`` when TLS auth or login is refused
    """
    use_ftps = config.get("use_ftps", False)
    ftp_class = ftplib.FTP_TLS if use_ftps else ftplib.FTP
    try:
        ftp = ftp_class(config.get("host"),
                        timeout=app.config.get("FTP_TIMEOUT", 300))
    except socket.gaierror as e:
        raise IngestFtpError.ftpHostError(exception=e) from e

    try:
        if use_ftps:
            try:
                ftp.auth()
            except ftplib.error_perm as ae:
                raise IngestFtpError.ftpAuthError(exception=ae) from ae

        if config.get("username"):
            try:
                ftp.login(config.get("username"), config.get("password"))
            except ftplib.error_perm as e:
                raise IngestFtpError.ftpAuthError(exception=e) from e

        # set encryption on data channel if able
        if hasattr(ftp, "prot_p"):
            ftp.prot_p()

        if config.get("path"):
            ftp.cwd(config.get("path", "").lstrip("/"))
        # only set this when not active, it's passive by default
        if config.get("passive") is False:
            ftp.set_pasv(False)
        yield ftp
    finally:
        # without the finally the socket leaked whenever anything above or
        # the caller's ``with`` body raised
        ftp.close()
Exemplo n.º 10
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Connects with ``ftplib.FTP`` using the provider config, then fetches
        every regular file ending with ``self.FILE_SUFFIX`` that is not
        older than ``last_updated`` and hands it to the feed parser.

        :param provider: ingest provider dict (``config``, ``last_updated``)
        :return: list with one entry per ingested file
        :raises IngestFtpError: when connecting, listing or parsing fails
        """
        config = provider.get("config", {})
        last_updated = provider.get("last_updated")

        if "dest_path" not in config:
            config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")

        try:
            with ftplib.FTP(config.get("host")) as ftp:
                ftp.login(config.get("username"), config.get("password"))
                ftp.cwd(config.get("path", ""))
                ftp.set_pasv(config.get("passive", False))

                items = []
                for filename, facts in ftp.mlsd():
                    is_file = facts.get("type", "") == "file"
                    if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if last_updated:
                        modified = datetime.strptime(facts["modify"], self.DATE_FORMAT)
                        if modified.replace(tzinfo=utc) < last_updated:
                            continue

                    local_file_path = os.path.join(config["dest_path"], filename)
                    try:
                        # exclusive create: an already present local file
                        # raises FileExistsError and is skipped
                        with open(local_file_path, "xb") as f:
                            try:
                                ftp.retrbinary("RETR %s" % filename, f.write)
                            except ftplib.all_errors:
                                os.remove(local_file_path)
                                logger.exception("Exception retrieving from FTP server")
                                continue
                    except FileExistsError:
                        continue

                    # XML parsers receive the parsed root element, any other
                    # parser receives the path of the downloaded file
                    if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                        source = etree.parse(local_file_path).getroot()
                    else:
                        source = local_file_path
                    parsed = self.get_feed_parser(provider, source).parse(source, provider)

                    items.append([parsed] if isinstance(parsed, dict) else parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 11
0
 def _test(self, provider):
     """Check that the provider's FTP server can be reached and listed.

     ``MLSD`` is tried first; when the server answers with an error whose
     text contains "500", ``NLST`` is used on the same connection instead.

     :param provider: ingest provider dict; its ``config`` is used to connect
     :raises IngestFtpError: when connecting or listing fails
     """
     config = provider.get("config", {})
     try:
         with ftp_connect(config) as ftp:
             try:
                 # FTP.mlsd() is a generator: nothing is sent until it is
                 # iterated, so fetch the first entry.
                 next(iter(ftp.mlsd()), None)
             except Exception as ex:
                 if "500" not in str(ex):
                     raise
                 # The fallback must run inside the ``with``: outside of it
                 # ``ftp`` may be unbound or no longer usable.
                 ftp.nlst()
     except IngestFtpError:
         raise
     except Exception as ex:
         raise IngestFtpError.ftpError(ex, provider) from ex
Exemplo n.º 12
0
def ftp_connect(config):
    """Get ftp connection for given config.

    use with `with` (generator yielding the connected client once; the
    context-manager decorator is presumably applied outside this view)

    The connection is closed when the generator is finalised, also when
    login, directory change or the ``with`` body raises.

    :param config: dict with `host`, `username`, `password`, `path` and `passive`
    :raises IngestFtpError: ``ftpHostError`` when the host cannot be
        resolved, ``ftpAuthError`` when the login is refused
    """
    try:
        ftp = ftplib.FTP(config.get('host'), timeout=app.config.get('FTP_TIMEOUT', 300))
    except socket.gaierror as e:
        raise IngestFtpError.ftpHostError(exception=e) from e

    try:
        if config.get('username'):
            try:
                ftp.login(config.get('username'), config.get('password'))
            except ftplib.error_perm as e:
                raise IngestFtpError.ftpAuthError(exception=e) from e
        if config.get('path'):
            ftp.cwd(config.get('path', '').lstrip('/'))
        if config.get('passive') is False:  # only set this when not active, it's passive by default
            ftp.set_pasv(False)
        yield ftp
    finally:
        # without the finally the socket leaked whenever anything above or
        # the caller's ``with`` body raised
        ftp.close()
Exemplo n.º 13
0
 def test_raise_ftpUnknownParserError(self):
     """``ftpUnknownParserError`` has code 5001 and logs a single error."""
     with assert_raises(IngestFtpError) as error_context:
         try:
             raise Exception("Testing ftpUnknownParserError")
         except Exception as ex:
             raise IngestFtpError.ftpUnknownParserError(ex, self.provider, 'test.xml')
     exception = error_context.exception
     # assertEqual reports both values on failure, assertTrue(a == b) only
     # reports "False is not true"
     self.assertEqual(exception.code, 5001)
     self.assertEqual(exception.message, "FTP parser could not be found")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing ftpUnknownParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "IngestFtpError Error 5001 - FTP parser could not be found: "
                      "Testing ftpUnknownParserError on channel TestProvider file=test.xml")
Exemplo n.º 14
0
 def _list_files(self, ftp, provider):
     self._timer.start("ftp_list")
     try:
         return [(filename, facts["modify"])
                 for filename, facts in ftp.mlsd()
                 if facts.get("type") == "file"]
     except Exception as ex:
         if "500" in str(ex):
             now = utcnow()
             return [(file_name, now) for file_name in ftp.nlst()]
         else:
             raise IngestFtpError.ftpError(ex, provider)
     finally:
         self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
             self._timer.stop("ftp_list")))
Exemplo n.º 15
0
 def test_raise_ftpError(self):
     """``ftpError`` has code 5000 and logs a single error message."""
     with assert_raises(IngestFtpError) as error_context:
         try:
             ex = Exception("Testing ftpError")
             raise ex
         except Exception:
             raise IngestFtpError.ftpError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both values on failure, assertTrue(a == b) only
     # reports "False is not true"
     self.assertEqual(exception.code, 5000)
     self.assertEqual(exception.message, "FTP ingest error")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing ftpError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "IngestFtpError Error 5000 - FTP ingest error: "
                      "Testing ftpError on channel TestProvider")
Exemplo n.º 16
0
 def test_raise_ftpError(self):
     """``ftpError`` has code 5000 and logs a single error message."""
     with assert_raises(IngestFtpError) as error_context:
         try:
             ex = Exception("Testing ftpError")
             raise ex
         except Exception:
             raise IngestFtpError.ftpError(ex, self.provider)
     exception = error_context.exception
     # assertEqual shows the differing values when the check fails
     self.assertEqual(exception.code, 5000)
     self.assertEqual(exception.message, "FTP ingest error")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing ftpError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "IngestFtpError Error 5000 - FTP ingest error: "
                      "Testing ftpError on channel TestProvider")
Exemplo n.º 17
0
 def _list_items(self, ftp, provider):
     try:
         return [(filename, facts['modify']) for filename, facts in ftp.mlsd() if facts.get('type') == 'file']
     except Exception as ex:
         if '500' in str(ex):
             file_list = []
             file_name_list = []
             date_list = []
             ftp.dir(file_list.append)
             self.DATE_FORMAT = '%Y %b %d %H:%M:%S'
             for line in file_list:
                 col = line.split()
                 date_string = '{} '.format(datetime.now().year) + ' '.join(col[5:8]) + ':00'
                 date_list.append(date_string)
                 file_name_list.append(col[8])
             return zip(file_name_list, date_list)
         else:
             raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 18
0
 def test_raise_ftpUnknownParserError(self):
     """``ftpUnknownParserError`` has code 5001 and logs two errors."""
     with assert_raises(IngestFtpError) as error_context:
         try:
             ex = Exception("Testing ftpUnknownParserError")
             raise ex
         except Exception:
             raise IngestFtpError.ftpUnknownParserError(ex, self.provider, 'test.xml')
     exception = error_context.exception
     self.assertEqual(exception.code, 5001)
     self.assertEqual(exception.message, "FTP parser could not be found")
     self.assertIsNotNone(exception.system_exception)
     # assertEquals is a deprecated alias that was removed in Python 3.12
     self.assertEqual(exception.system_exception.args[0], "Testing ftpUnknownParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
     self.assertEqual(self.mock_logger_handler.messages['error'][1],
                      "IngestFtpError Error 5001 - FTP parser could not be found: "
                      "Testing ftpUnknownParserError on channel TestProvider")
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "Provider: TestProvider - File: test.xml unknown file format. "
                      "Parser couldn't be found.")
Exemplo n.º 19
0
 def test_raise_ftpUnknownParserError(self):
     """``ftpUnknownParserError`` has code 5001 and logs two errors."""
     with assert_raises(IngestFtpError) as error_context:
         try:
             ex = Exception("Testing ftpUnknownParserError")
             raise ex
         except Exception:
             raise IngestFtpError.ftpUnknownParserError(
                 ex, self.provider, 'test.xml')
     exception = error_context.exception
     self.assertEqual(exception.code, 5001)
     self.assertEqual(exception.message, "FTP parser could not be found")
     self.assertIsNotNone(exception.system_exception)
     # assertEquals is a deprecated alias that was removed in Python 3.12
     self.assertEqual(exception.system_exception.args[0],
                      "Testing ftpUnknownParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
     self.assertEqual(
         self.mock_logger_handler.messages['error'][1],
         "IngestFtpError Error 5001 - FTP parser could not be found: "
         "Testing ftpUnknownParserError on channel TestProvider")
     self.assertEqual(
         self.mock_logger_handler.messages['error'][0],
         "Provider: TestProvider - File: test.xml unknown file format. "
         "Parser couldn't be found.")
Exemplo n.º 20
0
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.
    """

    NAME = "ftp"

    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description(),
    ]

    label = "FTP feed"

    fields = [
        {
            "id": "host",
            "type": "text",
            "label": "Host",
            "placeholder": "FTP Server URL",
            "required": True,
            "errors": {
                5003: "Server not found."
            },
        },
        {
            "id": "username",
            "type": "text",
            "label": "Username",
            "placeholder": "Username",
            "required": False,
            "errors": {
                5002: "Credentials error."
            },
        },
        {
            "id": "password",
            "type": "password",
            "label": "Password",
            "placeholder": "Password",
            "required": False
        },
        {
            "id": "path",
            "type": "text",
            "label": "Path",
            "placeholder": "FTP Server Path",
            "required": False
        },
        {
            "id": "dest_path",
            "type": "text",
            "label": "Local Path",
            "placeholder": "Local Path",
            "required": True
        },
        {
            "id": "passive",
            "type": "boolean",
            "label": "Passive",
            "placeholder": "Passive",
            "required": False,
            "default": True,
        },
        {
            "id": "move",
            "type": "boolean",
            "label": "Move items after ingestion",
            "placeholder": "Move items after ingestion",
            "required": False,
        },
        {
            "id": "ftp_move_path",
            "type": "text",
            "label": "Move ingested items to",
            "placeholder": "FTP Server Path, keep empty to use default path",
            "required": False,
            "show_expression": "provider.config.move === true",
        },
        {
            "id": "move_path_error",
            "type": "text",
            "label": "Move *NOT* ingested items (i.e. on error) to",
            "placeholder": "FTP Server Path, keep empty to use default path",
            "required": False,
            "show_expression": "provider.config.move === true",
        },
    ]

    DATE_FORMAT = "%Y%m%d%H%M%S"

    ALLOWED_EXT_DEFAULT = {".json", ".xml"}

    def config_from_url(self, url):
        """
        Build an ftp config dict from the given url.

        The port, if present in the url, is not part of the result and the
        leading slashes of the path are stripped.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with `username`, `password`, `host` and `path`
        """
        parsed = urlparse(url)
        config = {}
        config["username"] = parsed.username
        config["password"] = parsed.password
        config["host"] = parsed.hostname
        config["path"] = parsed.path.lstrip("/")
        return config

    def _test(self, provider):
        """Check that the provider's FTP server can be reached and listed.

        ``MLSD`` is tried first; when the server answers with an error whose
        text contains "500", ``NLST`` is used on the same connection instead.

        :param provider: ingest provider dict; its ``config`` is used to connect
        :raises IngestFtpError: when connecting or listing fails
        """
        config = provider.get("config", {})
        try:
            with ftp_connect(config) as ftp:
                try:
                    # FTP.mlsd() is a generator: nothing is sent until it is
                    # iterated, so fetch the first entry.
                    next(iter(ftp.mlsd()), None)
                except Exception as ex:
                    if "500" not in str(ex):
                        raise
                    # The fallback must run inside the ``with``: outside of
                    # it ``ftp`` may be unbound or no longer usable.
                    ftp.nlst()
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider) from ex

    def _move(self, ftp, src, dest, file_modify, failed):
        """Move distant file

        file won't be moved if it is failed and last modification was made
        recently enough (i.e. before config's INGEST_OLD_CONTENT_MINUTES is
        expired). In other words, if a file fails, it will be tried again until
        INGEST_OLD_CONTENT_MINUTES delay expires.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        :param file_modify: date of last file modification
        :type file_modify: datetime
        :param failed: True if something when wrong during ingestion
        :type failed: bool
        """
        if failed and not self.is_old_content(file_modify):
            logger.warning(
                "{src!r} ingestion failed, but we are in the backstop delay, it will be "
                "tried again next time".format(src=src))
            return
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: dir path to check
        :type src: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            if path.startswith("./"):
                ftp.cwd("/")
                ftp.mkd(path)
            elif not path.startswith("/"):
                ftp.mkd("/" + path)
            else:
                ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _create_move_folders(self, config, ftp):
        if not config.get("ftp_move_path"):
            logger.debug("missing move_path, default will be used")
        move_path = os.path.join(
            config.get("path", ""),
            config.get("ftp_move_path") or DEFAULT_SUCCESS_PATH)

        if not config.get("move_path_error"):
            logger.debug("missing move_path_error, default will be used")
        move_path_error = os.path.join(
            config.get("path", ""),
            config.get("move_path_error") or DEFAULT_FAILURE_PATH)

        try:
            self._create_if_missing(ftp, move_path)
            self._create_if_missing(ftp, move_path_error)
        except ftplib.all_errors as e:
            logger.error(
                "Can't create move directory: {reason}".format(reason=e))
            raise e

        return move_path, move_path_error

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested."""
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _is_empty(self, file_path):
        """Test if given file path is empty, return True if a file is empty"""
        return not (os.path.isfile(file_path)
                    and os.path.getsize(file_path) > 0)

    def _list_files(self, ftp, provider):
        self._timer.start("ftp_list")
        try:
            return [(filename, facts["modify"])
                    for filename, facts in ftp.mlsd()
                    if facts.get("type") == "file"]
        except Exception as ex:
            if "500" in str(ex):
                file_list = []
                file_name_list = []
                date_list = []
                ftp.dir(file_list.append)
                self.DATE_FORMAT = "%Y %b %d %H:%M:%S"
                for line in file_list:
                    col = line.split()
                    date_string = "{} ".format(datetime.now().year) + " ".join(
                        col[5:8]) + ":00"
                    date_list.append(date_string)
                    file_name_list.append(col[8])
                return zip(file_name_list, date_list)
            else:
                raise IngestFtpError.ftpError(ex, provider)
        finally:
            self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(
                self._timer.stop("ftp_list")))

    def _sort_files(self, files):
        self._timer.start("sort_files")
        files = sorted(files, key=lambda x: x[1])
        self._log_msg("Sort {} files. Exec time: {:.4f} secs.".format(
            len(files), self._timer.stop("sort_files")))
        return files

    def _retrieve_and_parse(self, ftp, config, filename, provider,
                            registered_parser):
        self._timer.start("retrieve_parse")

        if "dest_path" not in config:
            config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")
        local_file_path = os.path.join(config["dest_path"], filename)

        with open(local_file_path, "wb") as f:
            try:
                ftp.retrbinary("RETR %s" % filename, f.write)
                self._log_msg(
                    "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}."
                    .format(self._timer.split("retrieve_parse"),
                            os.path.getsize(local_file_path), filename))
            except ftplib.all_errors:
                self._log_msg(
                    "Download failed. Exec time: {:.4f} secs. File: {}.".
                    format(self._timer.stop("retrieve_parse"), filename))
                os.remove(local_file_path)
                raise Exception(
                    "Exception retrieving file from FTP server ({filename})".
                    format(filename=filename))

        if self._is_empty(local_file_path):
            logger.info(
                "ignoring empty file {filename}".format(filename=filename))
            raise EmptyFile(local_file_path)

        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)

        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop("retrieve_parse"), filename))

        return [parsed] if isinstance(parsed, dict) else parsed

    def _update(self, provider, update):
        """Generator which ingests files from the provider's FTP server.

        The remote files are listed and sorted, filtered by extension and by
        the ``last_processed_file_modify`` timestamp stored in the provider's
        ``private`` data, and capped by ``FTP_INGEST_FILES_LIST_LIMIT``. For
        every selected file the result of ``_retrieve_and_parse`` is yielded
        and ``update["private"]`` is set to that file's modification time.

        The value sent back into the generator is read as a failure flag: when
        moving is enabled in the config, a truthy value sends the remote file
        to the error folder instead of the success folder.

        :param provider: ingest provider
        :param update: dict receiving the changes to apply to the provider
        :raises IngestFtpError: on any error outside the per-file handling
        """
        settings = provider.get("config", {})
        move_enabled = settings.get("move", False)
        private_data = provider.get("private", {})
        previous_modify = private_data.get("last_processed_file_modify")
        max_files = app.config.get("FTP_INGEST_FILES_LIST_LIMIT", 100)
        registered_parser = self.get_feed_parser(provider)
        extensions = getattr(registered_parser, "ALLOWED_EXT",
                             self.ALLOWED_EXT_DEFAULT)

        try:
            self._timer.start("ftp_connect")
            with ftp_connect(settings) as ftp:
                connect_time = self._timer.stop("ftp_connect")
                self._log_msg(
                    "Connected to FTP server. Exec time: {:.4f} secs.".format(
                        connect_time))
                listing = self._sort_files(self._list_files(ftp, provider))
                selected = []

                if move_enabled:
                    success_dir, failure_dir = self._create_move_folders(
                        settings, ftp)

                self._timer.start("files_to_process")

                for filename, modify in listing:
                    # only files with an allowed extension are considered
                    if not self._is_allowed(filename, extensions):
                        logger.info(
                            "ignoring file {filename} because of file extension"
                            .format(filename=filename))
                        continue

                    file_modify = datetime.strptime(
                        modify, self.DATE_FORMAT).replace(tzinfo=utc)

                    if not previous_modify:
                        # nothing processed so far: take files up to the limit
                        if len(selected) >= max_files:
                            break
                        selected.append((filename, file_modify))
                        continue

                    # files sharing the last processed timestamp are always
                    # taken and never count as reaching the limit; newer files
                    # are taken too, so at least one of them is added and
                    # `last_processed_file_modify` can advance
                    same_timestamp = previous_modify == file_modify
                    if same_timestamp or previous_modify < file_modify:
                        selected.append((filename, file_modify))
                        if not same_timestamp and len(selected) >= max_files:
                            break

                self._log_msg(
                    "Got {} file for processing. Exec time: {:.4f} secs.".
                    format(len(selected),
                           self._timer.stop("files_to_process")))

                self._timer.start("start_processing")
                for filename, file_modify in selected:
                    try:
                        update["private"] = {
                            "last_processed_file_modify": file_modify
                        }
                        failed = yield self._retrieve_and_parse(
                            ftp, settings, filename, provider,
                            registered_parser)

                        if move_enabled:
                            target_dir = failure_dir if failed else success_dir
                            self._move(ftp,
                                       filename,
                                       os.path.join(target_dir, filename),
                                       file_modify,
                                       failed=failed)
                    except EmptyFile:
                        continue
                    except Exception as e:
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))

                        if move_enabled:
                            self._move(ftp,
                                       filename,
                                       os.path.join(failure_dir, filename),
                                       file_modify,
                                       failed=True)

                processing_time = self._timer.stop("start_processing")
                self._log_msg(
                    "Processing finished. Exec time: {:.4f} secs.".format(
                        processing_time))

        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 21
0
    def _update(self, provider, update):
        """Generator which ingests files from the provider's FTP server.

        The remote files are listed and sorted, filtered by extension and by
        the ``last_processed_file_modify`` timestamp stored in the provider's
        ``private`` data, and capped by ``FTP_INGEST_FILES_LIST_LIMIT``. For
        every selected file the result of ``_retrieve_and_parse`` is yielded
        and ``update["private"]`` is set to that file's modification time.

        The value sent back into the generator is read as a failure flag: when
        moving is enabled in the config, a truthy value sends the remote file
        to the error folder instead of the success folder.

        :param provider: ingest provider
        :param update: dict receiving the changes to apply to the provider
        :raises IngestFtpError: on any error outside the per-file handling
        """
        settings = provider.get("config", {})
        move_enabled = settings.get("move", False)
        private_data = provider.get("private", {})
        previous_modify = private_data.get("last_processed_file_modify")
        max_files = app.config.get("FTP_INGEST_FILES_LIST_LIMIT", 100)
        registered_parser = self.get_feed_parser(provider)
        extensions = getattr(registered_parser, "ALLOWED_EXT",
                             self.ALLOWED_EXT_DEFAULT)

        try:
            self._timer.start("ftp_connect")
            with ftp_connect(settings) as ftp:
                connect_time = self._timer.stop("ftp_connect")
                self._log_msg(
                    "Connected to FTP server. Exec time: {:.4f} secs.".format(
                        connect_time))
                listing = self._sort_files(self._list_files(ftp, provider))
                selected = []

                if move_enabled:
                    success_dir, failure_dir = self._create_move_folders(
                        settings, ftp)

                self._timer.start("files_to_process")

                for filename, modify in listing:
                    # only files with an allowed extension are considered
                    if not self._is_allowed(filename, extensions):
                        logger.info(
                            "ignoring file {filename} because of file extension"
                            .format(filename=filename))
                        continue

                    file_modify = datetime.strptime(
                        modify, self.DATE_FORMAT).replace(tzinfo=utc)

                    if not previous_modify:
                        # nothing processed so far: take files up to the limit
                        if len(selected) >= max_files:
                            break
                        selected.append((filename, file_modify))
                        continue

                    # files sharing the last processed timestamp are always
                    # taken and never count as reaching the limit; newer files
                    # are taken too, so at least one of them is added and
                    # `last_processed_file_modify` can advance
                    same_timestamp = previous_modify == file_modify
                    if same_timestamp or previous_modify < file_modify:
                        selected.append((filename, file_modify))
                        if not same_timestamp and len(selected) >= max_files:
                            break

                self._log_msg(
                    "Got {} file for processing. Exec time: {:.4f} secs.".
                    format(len(selected),
                           self._timer.stop("files_to_process")))

                self._timer.start("start_processing")
                for filename, file_modify in selected:
                    try:
                        update["private"] = {
                            "last_processed_file_modify": file_modify
                        }
                        failed = yield self._retrieve_and_parse(
                            ftp, settings, filename, provider,
                            registered_parser)

                        if move_enabled:
                            target_dir = failure_dir if failed else success_dir
                            self._move(ftp,
                                       filename,
                                       os.path.join(target_dir, filename),
                                       file_modify,
                                       failed=failed)
                    except EmptyFile:
                        continue
                    except Exception as e:
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))

                        if move_enabled:
                            self._move(ftp,
                                       filename,
                                       os.path.join(failure_dir, filename),
                                       file_modify,
                                       failed=True)

                processing_time = self._timer.stop("start_processing")
                self._log_msg(
                    "Processing finished. Exec time: {:.4f} secs.".format(
                        processing_time))

        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 22
0
    def _update(self, provider, update):
        """Download and parse the files found in the provider's FTP directory.

        Files older than the provider's ``last_updated`` are skipped. Each
        remaining file is downloaded to ``config['dest_path']`` (a temporary
        directory is created when the key is missing) and parsed. When moving
        is enabled in the config, successfully parsed files are moved to the
        success folder and failing ones to the error folder. A file that fails
        is logged and does not stop the processing of the others.

        :param provider: ingest provider
        :param update: dict receiving the changes to apply to the provider;
            ``LAST_UPDATED`` is set to the newest modification time seen
        :return: list with one list of parsed items per ingested file
        :raises IngestFtpError: on any error outside the per-file handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            if not config.get('move_path'):
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(config.get('path', ''), config.get('move_path') or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(config.get('path', ''),
                                                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning("Can't create move directory, files will not be moved: {reason}".format(
                            reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not filename.lower().endswith(self.FILE_SUFFIX):
                            # Handled like any other per-file failure below (logged, and
                            # moved to the error folder when moving is enabled). An explicit
                            # exception replaces a bare ``raise``, which has no active
                            # exception here and only produced an unrelated RuntimeError.
                            raise ValueError('unsupported file suffix, expected {suffix}'.format(
                                suffix=self.FILE_SUFFIX))

                        if last_updated:
                            item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated < last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated

                        local_file_path = os.path.join(config['dest_path'], filename)
                        download_error = None
                        try:
                            # 'x' mode: refuse to overwrite a file left by a previous run
                            with open(local_file_path, 'xb') as f:
                                try:
                                    ftp.retrbinary('RETR %s' % filename, f.write)
                                except ftplib.all_errors as ex:
                                    download_error = ex
                        except FileExistsError as ex:
                            raise Exception('Exception retrieving from FTP server, file already exists ({filename})'
                                            .format(filename=local_file_path)) from ex

                        if download_error is not None:
                            # the partial download is removed once the file is closed
                            os.remove(local_file_path)
                            raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                                            filename=filename)) from download_error

                        registered_parser = self.get_feed_parser(provider)
                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)

                        if isinstance(parsed, dict):
                            parsed = [parsed]

                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(move_dest_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 23
0
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.
    """

    NAME = 'ftp'

    ERRORS = [IngestFtpError.ftpUnknownParserError().get_error_description(),
              IngestFtpError.ftpError().get_error_description()]

    label = 'FTP feed'

    # Definition of the provider configuration fields; the ids are the keys
    # read from ``provider['config']`` by the methods below.
    fields = [
        {
            'id': 'host', 'type': 'text', 'label': 'Host',
            'placeholder': 'FTP Server URL', 'required': True,
            'errors': {5003: 'Server not found.'}
        },
        {
            'id': 'username', 'type': 'text', 'label': 'Username',
            'placeholder': 'Username', 'required': False,
            'errors': {5002: 'Credentials error.'}
        },
        {
            'id': 'password', 'type': 'password', 'label': 'Password',
            'placeholder': 'Password', 'required': False
        },
        {
            'id': 'path', 'type': 'text', 'label': 'Path',
            'placeholder': 'FTP Server Path', 'required': False
        },
        {
            'id': 'dest_path', 'type': 'text', 'label': 'Local Path',
            'placeholder': 'Local Path', 'required': True
        },
        {
            'id': 'passive', 'type': 'boolean', 'label': 'Passive',
            'placeholder': 'Passive', 'required': False, 'default': True
        },
        {
            'id': 'move', 'type': 'boolean', 'label': 'Move items after ingestion',
            'placeholder': 'Move items after ingestion', 'required': False
        },
        {
            'id': 'ftp_move_path', 'type': 'text', 'label': 'Move ingested items to',
            'placeholder': 'FTP Server Path, keep empty to use default path',
            'required': False, 'show_expression': '{move}'
        },
        {
            'id': 'move_path_error', 'type': 'text', 'label': 'Move *NOT* ingested items (i.e. on error) to',
            'placeholder': 'FTP Server Path, keep empty to use default path',
            'required': False, 'show_expression': '{move}'
        }
    ]

    # Format of the file timestamps returned by ``_list_files``.
    DATE_FORMAT = '%Y%m%d%H%M%S'

    # Extensions ingested when the registered parser defines no ``ALLOWED_EXT``.
    ALLOWED_EXT_DEFAULT = {'.json', '.xml'}

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        Only username, password, host and path are extracted; a port given in
        the url is not part of the returned config.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with the keys ``username``, ``password``, ``host`` and ``path``
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that the FTP server can be reached and its directory listed.

        ``MLSD`` is tried first, falling back to ``NLST`` when the server
        answers with a ``500`` reply.

        :param provider: ingest provider holding the FTP ``config``
        :raises IngestFtpError: if connecting or listing fails
        """
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                try:
                    # ``mlsd()`` is a generator: the command is only sent to
                    # the server once the first entry is requested
                    next(iter(ftp.mlsd()), None)
                except Exception as ex:
                    if '500' not in str(ex):
                        raise
                    # the fallback has to run while the connection is open
                    ftp.nlst()
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        A failure is only logged as a warning.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            logger.warning("Can't move file from {src} to {dest}: {reason}".format(
                src=src,
                dest=dest,
                reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        The current working directory of the connection is restored afterwards.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _create_move_folders(self, config, ftp):
        """Make sure the folders used to move processed files exist.

        Both folders are resolved below ``config['path']``; the defaults are
        used for the ones missing in the config.

        :param config: provider configuration
        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :return: tuple ``(move_path, move_path_error)``
        :raises ftplib.all_errors: if a folder can't be created
        """
        if not config.get('ftp_move_path'):
            logger.debug('missing ftp_move_path, default will be used')
        move_path = os.path.join(config.get('path', ''), config.get('ftp_move_path') or DEFAULT_SUCCESS_PATH)

        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_path_error = os.path.join(config.get('path', ''),
                                       config.get('move_path_error') or DEFAULT_FAILURE_PATH)

        try:
            self._create_if_missing(ftp, move_path)
            self._create_if_missing(ftp, move_path_error)
        except ftplib.all_errors as e:
            logger.error("Can't create move directory: {reason}".format(reason=e))
            raise

        return move_path, move_path_error

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested."""
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _list_files(self, ftp, provider):
        """List the files of the current FTP directory with their timestamps.

        ``MLSD`` is tried first. When the server answers with a ``500`` reply
        the output of ``LIST`` (``ftp.dir``) is parsed instead, assuming the
        Unix ``ls -l`` column layout. Timestamps read from ``LIST`` are
        converted to ``self.DATE_FORMAT`` so both sources can be parsed and
        sorted the same way; ``self.DATE_FORMAT`` itself is never modified.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param provider: ingest provider, passed on to the raised error
        :return: list of ``(filename, modify)`` tuples, ``modify`` being a timestamp string
        :raises IngestFtpError: if ``MLSD`` fails with anything other than a ``500`` reply
        """
        self._timer.start('ftp_list')
        try:
            return [(filename, facts['modify']) for filename, facts in ftp.mlsd() if facts.get('type') == 'file']
        except Exception as ex:
            if '500' not in str(ex):
                raise IngestFtpError.ftpError(ex, provider)

            lines = []
            ftp.dir(lines.append)
            # recent ``ls -l`` entries carry "HH:MM" instead of a year; the
            # current year is assumed for them
            current_year = datetime.now().year
            files = []
            for line in lines:
                # columns: permissions, links, owner, group, size, month, day,
                # time-or-year, name. Splitting at most 8 times keeps spaces
                # that belong to the file name.
                col = line.split(None, 8)
                if len(col) < 9 or col[0].startswith('d'):
                    # summary line ("total 12"), unknown layout or directory
                    continue
                month, day, time_or_year = col[5:8]
                if ':' in time_or_year:
                    date_string = '{} {} {} {}:00'.format(current_year, month, day, time_or_year)
                else:
                    date_string = '{} {} {} 00:00:00'.format(time_or_year, month, day)
                try:
                    modified = datetime.strptime(date_string, '%Y %b %d %H:%M:%S')
                except ValueError:
                    self._log_msg("FTP list: ignoring entry with unsupported date: {}".format(line))
                    continue
                files.append((col[8], modified.strftime(self.DATE_FORMAT)))
            return files
        finally:
            self._log_msg("FTP list files. Exec time: {:.4f} secs.".format(self._timer.stop('ftp_list')))

    def _sort_files(self, files):
        """Return ``files`` as a list sorted by the second item (the timestamp) of each entry."""
        self._timer.start('sort_files')
        files = sorted(files, key=lambda x: x[1])
        self._log_msg("Sort {} files. Exec time: {:.4f} secs.".format(len(files), self._timer.stop('sort_files')))
        return files

    def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
        """Download ``filename`` from the FTP server and parse it.

        The file is stored under ``config['dest_path']``; a temporary
        directory is created (and remembered in ``config``) when that key is
        missing.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param config: provider configuration
        :param filename: name of the remote file, relative to the current FTP directory
        :param provider: ingest provider, handed to the feed parser
        :param registered_parser: parser registered for the provider; an ``XMLFeedParser``
            receives the parsed XML root, any other parser receives the local file path
        :return: list of parsed items (a single ``dict`` result is wrapped in a list)
        :raises Exception: if the download fails
        """
        self._timer.start('retrieve_parse')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        local_file_path = os.path.join(config['dest_path'], filename)

        download_error = None
        with open(local_file_path, 'wb') as f:
            try:
                ftp.retrbinary('RETR %s' % filename, f.write)
            except ftplib.all_errors as ex:
                download_error = ex

        # From here on the file is closed: buffered data has been flushed, so
        # the size on disk is accurate and the file can be removed safely.
        if download_error is not None:
            self._log_msg(
                "Download failed. Exec time: {:.4f} secs. File: {}.".format(
                    self._timer.stop('retrieve_parse'),
                    filename
                )
            )
            os.remove(local_file_path)
            raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                            filename=filename)) from download_error

        self._log_msg(
            "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}.".format(
                self._timer.split('retrieve_parse'),
                os.path.getsize(local_file_path),
                filename
            )
        )

        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)

        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop('retrieve_parse'),
                filename
            )
        )

        return [parsed] if isinstance(parsed, dict) else parsed

    def _update(self, provider, update):
        """Generator which ingests files from the provider's FTP server.

        The remote files are listed and sorted, filtered by extension and by
        the ``last_processed_file_modify`` timestamp stored in the provider's
        ``private`` data, and capped by ``FTP_INGEST_FILES_LIST_LIMIT``. For
        every selected file the result of ``_retrieve_and_parse`` is yielded
        and ``update['private']`` is set to that file's modification time.

        The value sent back into the generator is read as a failure flag: when
        moving is enabled in the config, a truthy value sends the remote file
        to the error folder instead of the success folder.

        :param provider: ingest provider
        :param update: dict receiving the changes to apply to the provider
        :raises IngestFtpError: on any error outside the per-file handling
        """
        config = provider.get('config', {})
        do_move = config.get('move', False)
        last_processed_file_modify = provider.get('private', {}).get('last_processed_file_modify')
        limit = app.config.get('FTP_INGEST_FILES_LIST_LIMIT', 100)
        registered_parser = self.get_feed_parser(provider)
        allowed_ext = getattr(registered_parser, 'ALLOWED_EXT', self.ALLOWED_EXT_DEFAULT)

        try:
            self._timer.start('ftp_connect')
            with ftp_connect(config) as ftp:
                self._log_msg("Connected to FTP server. Exec time: {:.4f} secs.".format(
                    self._timer.stop('ftp_connect')
                ))
                files_to_process = []
                files = self._sort_files(self._list_files(ftp, provider))

                if do_move:
                    move_path, move_path_error = self._create_move_folders(config, ftp)

                self._timer.start('files_to_process')

                for filename, modify in files:
                    # filter by extension
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info('ignoring file {filename} because of file extension'.format(filename=filename))
                        continue

                    # filter by modify datetime
                    file_modify = datetime.strptime(modify, self.DATE_FORMAT).replace(tzinfo=utc)
                    if last_processed_file_modify:
                        # ignore limit and add files for processing
                        if last_processed_file_modify == file_modify:
                            files_to_process.append((filename, file_modify))
                        elif last_processed_file_modify < file_modify:
                            # even if we have reached a limit, we must add at least one file to increment
                            # a `last_processed_file_modify` in provider
                            files_to_process.append((filename, file_modify))
                            # limit amount of files to process per ingest update
                            if len(files_to_process) >= limit:
                                break
                    else:
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                        # add files for processing
                        files_to_process.append((filename, file_modify))

                self._log_msg(
                    "Got {} file for processing. Exec time: {:.4f} secs.".format(
                        len(files_to_process), self._timer.stop('files_to_process')
                    )
                )

                # process files
                self._timer.start('start_processing')
                for filename, file_modify in files_to_process:
                    try:
                        update['private'] = {'last_processed_file_modify': file_modify}
                        failed = yield self._retrieve_and_parse(ftp, config, filename, provider, registered_parser)

                        if do_move:
                            move_dest_file_path = os.path.join(move_path if not failed else move_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=e))

                        if do_move:
                            move_dest_file_path_error = os.path.join(move_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)

                self._log_msg(
                    "Processing finished. Exec time: {:.4f} secs.".format(self._timer.stop('start_processing'))
                )

        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 24
0
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.

    Files in the remote working directory are listed with ``MLSD``, filtered by extension and
    modification time, downloaded to a local directory, parsed with the registered feed parser
    and, when configured, moved on the server to a success or failure directory.
    """

    NAME = 'ftp'

    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]

    label = 'FTP feed API'

    # Provider configuration form. The ``id`` values are the config keys read by this class.
    fields = [{
        'id': 'host',
        'type': 'text',
        'label': 'Host',
        'placeholder': 'FTP Server URL',
        'required': True,
        'errors': {
            5003: 'Server not found.'
        }
    }, {
        'id': 'username',
        'type': 'text',
        'label': 'Username',
        'placeholder': 'Username',
        'required': False,
        'errors': {
            5002: 'Credentials error.'
        }
    }, {
        'id': 'password',
        'type': 'password',
        'label': 'Password',
        'placeholder': 'Password',
        'required': False
    }, {
        'id': 'path',
        'type': 'text',
        'label': 'Path',
        'placeholder': 'FTP Server Path',
        'required': False
    }, {
        'id': 'dest_path',
        'type': 'text',
        'label': 'Local Path',
        'placeholder': 'Local Path',
        'required': True
    }, {
        'id': 'passive',
        'type': 'boolean',
        'label': 'Passive',
        'placeholder': 'Passive',
        'required': False,
        'default': True
    }, {
        'id': 'move',
        'type': 'boolean',
        'label': 'Move items after ingestion',
        'placeholder': 'Move items after ingestion',
        'required': False
    }, {
        'id': 'ftp_move_path',
        'type': 'text',
        'label': 'Move ingested items to',
        'placeholder': 'FTP Server Path, keep empty to use default path',
        'required_expression': '{move}',
        'show_expression': '{move}'
    }, {
        'id': 'move_path_error',
        'type': 'text',
        'label': 'Move *NOT* ingested items (i.e. on error) to',
        'placeholder': 'FTP Server Path, keep empty to use default path',
        'required_expression': '{move}',
        'show_expression': '{move}'
    }]

    # Format of the ``modify`` fact returned by MLSD, as parsed in ``_update``.
    DATE_FORMAT = '%Y%m%d%H%M%S'

    # Extensions accepted when the registered parser does not define ``ALLOWED_EXT``.
    ALLOWED_EXT_DEFAULT = {'.json', '.xml'}

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with ``username``, ``password``, ``host`` and ``path`` keys;
                 the leading ``/`` is stripped from the path
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that a connection can be opened and the working directory listed.

        :param provider: ingest provider whose ``config`` holds the connection settings
        :raises IngestFtpError: if connecting or listing fails
        """
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                # ftplib's ``mlsd()`` is a generator: nothing is sent to the server
                # until it is iterated, so pull one entry to really issue the listing.
                next(iter(ftp.mlsd()), None)
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        Failures are logged as warnings and not propagated.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        The current working directory is restored afterwards.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _is_allowed(self, filename, allowed_ext):
        """Test if given file is allowed to be ingested.

        :param filename: name of the remote file
        :param allowed_ext: collection of lower-case extensions including the dot
        :return: True if the file extension (case-insensitive) is in ``allowed_ext``
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in allowed_ext

    def _update(self, provider, update):
        """Download new files from the FTP server and parse them into items.

        Files are skipped when their extension is not allowed or, if the provider has a
        ``last_updated`` value, when their ``modify`` fact is not newer than it. Each remaining
        file is downloaded to ``config['dest_path']``, parsed, and - when ``config['move']`` is
        set - renamed on the server into the success or failure directory.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :type provider: dict
        :param update: receives ``LAST_UPDATED`` set to the newest ``modify`` time seen
        :type update: dict
        :return: list with one list of parsed items per successfully parsed file
        :rtype: list
        :raises IngestFtpError: for any failure outside the per-file error handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        registered_parser = self.get_feed_parser(provider)
        try:
            allowed_ext = registered_parser.ALLOWED_EXT
        except AttributeError:
            allowed_ext = self.ALLOWED_EXT_DEFAULT
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            # The form field is declared as ``ftp_move_path``; ``move_path`` is
            # still honoured so configurations written with the old key keep working.
            move_path = config.get('ftp_move_path') or config.get('move_path')
            if not move_path:
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                config.get('path', ''),
                move_path or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                config.get('path', ''),
                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not self._is_allowed(filename, allowed_ext):
                            logger.info(
                                'ignoring file {filename} because of file extension'
                                .format(filename=filename))
                            continue

                        if last_updated:
                            item_last_updated = datetime.strptime(
                                facts['modify'],
                                self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated <= last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated

                        local_file_path = os.path.join(config['dest_path'],
                                                       filename)
                        with open(local_file_path, 'wb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors:
                                # don't leave a truncated download behind
                                os.remove(local_file_path)
                                raise Exception(
                                    'Exception retrieving file from FTP server ({filename})'
                                    .format(filename=filename))

                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(
                                provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)

                        if isinstance(parsed, dict):
                            parsed = [parsed]

                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # a single bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_dest_path_error, filename)
                            self._move(ftp, filename,
                                       move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 25
0
    def _update(self, provider, update):
        """Ingest a limited batch of files from the FTP server.

        Files with a disallowed extension are ignored. When the provider has a stored
        ``last_processed_file_modify`` value, files with exactly that modification time are
        always selected and newer files are selected until the batch reaches
        ``FTP_INGEST_FILES_LIST_LIMIT`` entries; without a stored value the first ``limit``
        files are selected.

        :param provider: ingest provider; ``config`` and ``private`` are read from it
        :type provider: dict
        :param update: receives ``private.last_processed_file_modify`` for each file attempted
        :type update: dict
        :return: items returned by ``_retrieve_and_parse`` for all successfully parsed files
        :rtype: list
        :raises IngestFtpError: for any failure outside the per-file error handling
        """
        config = provider.get('config', {})
        do_move = config.get('move', False)
        last_processed_file_modify = provider.get(
            'private', {}).get('last_processed_file_modify')
        limit = app.config.get('FTP_INGEST_FILES_LIST_LIMIT', 100)
        registered_parser = self.get_feed_parser(provider)
        allowed_ext = getattr(registered_parser, 'ALLOWED_EXT',
                              self.ALLOWED_EXT_DEFAULT)

        try:
            with ftp_connect(config) as ftp:
                items = []
                files_to_process = []
                # NOTE(review): presumably sorted by modification time, oldest first - confirm
                files = self._sort_files(self._list_files(ftp, provider))

                if do_move:
                    move_path, move_path_error = self._create_move_folders(
                        config, ftp)

                for filename, modify in files:
                    # filter by extension
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info(
                            'ignoring file {filename} because of file extension'
                            .format(filename=filename))
                        continue

                    # filter by modify datetime
                    file_modify = datetime.strptime(
                        modify, self.DATE_FORMAT).replace(tzinfo=utc)
                    if last_processed_file_modify:
                        # ignore limit and add files for processing
                        if last_processed_file_modify == file_modify:
                            files_to_process.append((filename, file_modify))
                        elif last_processed_file_modify < file_modify:
                            # even if we have reached a limit, we must add at least one file to increment
                            # a `last_processed_file_modify` in provider
                            files_to_process.append((filename, file_modify))
                            # limit amount of files to process per ingest update
                            if len(files_to_process) >= limit:
                                break
                    else:
                        # limit amount of files to process per ingest update
                        if len(files_to_process) >= limit:
                            break
                        # add files for processing
                        files_to_process.append((filename, file_modify))

                # process files
                for filename, file_modify in files_to_process:
                    try:
                        # Record the marker before retrieving the file: if it were only
                        # stored after a successful parse, a batch consisting solely of
                        # failing files would never advance it and the same batch would
                        # be selected again on every run.
                        update['private'] = {
                            'last_processed_file_modify': file_modify
                        }
                        items += self._retrieve_and_parse(
                            ftp, config, filename, provider, registered_parser)

                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # a single bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))

                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_path_error, filename)
                            self._move(ftp, filename,
                                       move_dest_file_path_error)

            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 26
0
    def _update(self, provider, update):
        """Fetch files listed on the FTP server, parse them and collect the resulting items.

        A file is skipped when its extension is not allowed or when the provider has a
        ``last_updated`` value and the file's modification time is not newer than it. When
        ``config['move']`` is set, processed files are renamed on the server into the success
        or failure directory.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :param update: receives ``LAST_UPDATED`` set to the newest modification time seen
        :return: items accumulated from ``_retrieve_and_parse``
        :raises IngestFtpError: for any failure outside the per-file error handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        registered_parser = self.get_feed_parser(provider)
        try:
            allowed_ext = registered_parser.ALLOWED_EXT
        except AttributeError:
            # parser does not restrict extensions itself
            allowed_ext = self.ALLOWED_EXT_DEFAULT

        newest_seen = None
        do_move = True if config.get('move', False) else False
        if do_move:
            success_name = config.get('ftp_move_path')
            if not success_name:
                logger.debug('missing move_path, default will be used')
            success_dir = os.path.join(config.get('path', ''), success_name or DEFAULT_SUCCESS_PATH)

            failure_name = config.get('move_path_error')
            if not failure_name:
                logger.debug('missing move_path_error, default will be used')
            failure_dir = os.path.join(config.get('path', ''), failure_name or DEFAULT_FAILURE_PATH)

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        for remote_dir in (success_dir, failure_dir):
                            self._create_if_missing(ftp, remote_dir)
                    except ftplib.all_errors as create_err:
                        logger.warning("Can't create move directory, files will not be moved: {reason}".format(
                            reason=create_err))
                        do_move = False

                items = []
                for filename, modified in self._list_items(ftp, provider):
                    try:
                        if not self._is_allowed(filename, allowed_ext):
                            logger.info('ignoring file {filename} because of file extension'.format(filename=filename))
                            continue

                        if last_updated:
                            modified_at = datetime.strptime(modified, self.DATE_FORMAT).replace(tzinfo=utc)
                            if modified_at <= last_updated:
                                continue
                            if not newest_seen or modified_at > newest_seen:
                                newest_seen = modified_at

                        items.extend(self._retrieve_and_parse(ftp, config, filename, provider, registered_parser))
                        if do_move:
                            self._move(ftp, filename, os.path.join(success_dir, filename))
                    except Exception as file_err:
                        # keep going with the remaining files
                        logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=file_err))
                        if do_move:
                            self._move(ftp, filename, os.path.join(failure_dir, filename))

            if newest_seen:
                update[LAST_UPDATED] = newest_seen
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 27
0
class FTPService(IngestService):
    """FTP Ingest Service.

    Lists the configured remote directory with ``MLSD``, downloads new ``.xml`` files to a
    local directory and parses them with the XML parser matching their content.
    """

    # Format of the ``modify`` fact returned by MLSD, as parsed in ``_update``.
    DATE_FORMAT = '%Y%m%d%H%M%S'
    # Only files whose lower-cased name ends with this suffix are ingested.
    FILE_SUFFIX = '.xml'

    PROVIDER = 'ftp'

    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]

    def config_from_url(self, url):
        """Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with ``username``, ``password``, ``host`` and ``path`` keys;
                 the leading ``/`` is stripped from the path
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _update(self, provider):
        """Download and parse new XML files from the FTP server.

        Files that already exist in ``config['dest_path']`` are treated as already ingested
        and skipped.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :return: list with one list of parsed items per downloaded file
        :raises IngestFtpError: if no parser matches a file, or on any other error
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))
                ftp.set_pasv(config.get('passive', False))

                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'],
                            self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    try:
                        # 'x' mode fails if the file exists, which is how
                        # previously downloaded files are detected
                        with open(dest, 'xb') as f:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                    except FileExistsError:
                        continue
                    except ftplib.all_errors:
                        # A truncated download left on disk would make every later
                        # run take the FileExistsError branch and skip this file
                        # forever, so remove it before reporting the failure.
                        if os.path.exists(dest):
                            os.remove(dest)
                        raise

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(
                            Exception('Parser not found'), provider, filename)
                    parsed = parser.parse_message(xml, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 28
0
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.

    Files in the remote working directory are listed with ``MLSD``; new ``.xml`` files are
    downloaded to a local directory, parsed with the registered feed parser and, when
    configured, moved on the server to a success or failure directory.
    """

    NAME = 'ftp'
    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]

    # Only files whose lower-cased name ends with this suffix are ingested.
    FILE_SUFFIX = '.xml'
    # Format of the ``modify`` fact returned by MLSD, as parsed in ``_update``.
    DATE_FORMAT = '%Y%m%d%H%M%S'

    def config_from_url(self, url):
        """
        Parse given url into ftp config.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with ``username``, ``password``, ``host`` and ``path`` keys;
                 the leading ``/`` is stripped from the path
        """
        url_parts = urlparse(url)
        return {
            'username': url_parts.username,
            'password': url_parts.password,
            'host': url_parts.hostname,
            'path': url_parts.path.lstrip('/'),
        }

    def _test(self, provider):
        """Check that a connection can be opened and the working directory listed.

        :param provider: ingest provider whose ``config`` holds the connection settings
        :raises IngestFtpError: if connecting or listing fails
        """
        config = provider.get('config', {})
        try:
            with ftp_connect(config) as ftp:
                # ftplib's ``mlsd()`` is a generator: nothing is sent to the server
                # until it is iterated, so pull one entry to really issue the listing.
                next(iter(ftp.mlsd()), None)
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)

    def _move(self, ftp, src, dest):
        """Move distant file

        Failures are logged as warnings and not propagated.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param src: source path of the file to move
        :type src: str
        :param dest: dest path of the file to move
        :type dest: str
        """
        try:
            ftp.rename(src, dest)
        except ftplib.all_errors as e:
            logger.warning(
                "Can't move file from {src} to {dest}: {reason}".format(
                    src=src, dest=dest, reason=e))

    def _create_if_missing(self, ftp, path):
        """Check if a dir exists, and create it else

        The current working directory is restored afterwards.

        :param ftp: FTP instance to use
        :type ftp: ftplib.FTP
        :param path: dir path to check
        :type path: str
        """
        base_path = ftp.pwd()
        try:
            ftp.cwd(path)
        except ftplib.all_errors:
            # path probably doesn't exist
            # catching all_errors is a bit overkill,
            # but ftplib doesn't really have precise error
            # for missing directory
            ftp.mkd(path)
        finally:
            ftp.cwd(base_path)

    def _update(self, provider, update):
        """Download new XML files from the FTP server and parse them into items.

        Files are skipped when their name does not end with ``FILE_SUFFIX`` or, if the
        provider has a ``last_updated`` value, when their ``modify`` fact is older than it.
        Each remaining file is downloaded to ``config['dest_path']``, parsed, and - when
        ``config['move']`` is set - renamed on the server into the success or failure directory.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :type provider: dict
        :param update: receives ``LAST_UPDATED`` set to the newest ``modify`` time seen
        :type update: dict
        :return: list with one list of parsed items per successfully parsed file
        :rtype: list
        :raises IngestFtpError: for any failure outside the per-file error handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            if not config.get('move_path'):
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                config.get('path', ''),
                config.get('move_path') or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                config.get('path', ''),
                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not filename.lower().endswith(self.FILE_SUFFIX):
                            # Not a file this service handles: skip it. (A bare
                            # ``raise`` here has no active exception and would
                            # surface as a RuntimeError reported as a parse error.)
                            continue

                        if last_updated:
                            item_last_updated = datetime.strptime(
                                facts['modify'],
                                self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated < last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated

                        local_file_path = os.path.join(config['dest_path'],
                                                       filename)
                        try:
                            # 'x' mode fails if the file was already downloaded
                            with open(local_file_path, 'xb') as f:
                                try:
                                    ftp.retrbinary('RETR %s' % filename,
                                                   f.write)
                                except ftplib.all_errors:
                                    # don't leave a truncated download behind
                                    os.remove(local_file_path)
                                    raise Exception(
                                        'Exception retrieving file from FTP server ({filename})'
                                        .format(filename=filename))
                        except FileExistsError:
                            raise Exception(
                                'Exception retrieving from FTP server, file already exists ({filename})'
                                .format(filename=local_file_path))

                        registered_parser = self.get_feed_parser(provider)
                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(
                                provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)

                        if isinstance(parsed, dict):
                            parsed = [parsed]

                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # a single bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_dest_path_error, filename)
                            self._move(ftp, filename,
                                       move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 29
0
class FTPFeedingService(FeedingService):
    """
    Feeding Service class which can read article(s) which exist in a file system and accessible using FTP.

    New ``.xml`` files found in the remote working directory are downloaded to a local
    directory and parsed with the registered feed parser.
    """

    NAME = 'ftp'
    ERRORS = [
        IngestFtpError.ftpUnknownParserError().get_error_description(),
        IngestFtpError.ftpError().get_error_description()
    ]

    # Only files whose lower-cased name ends with this suffix are ingested.
    FILE_SUFFIX = '.xml'
    # Format of the ``modify`` fact returned by MLSD, as parsed in ``_update``.
    DATE_FORMAT = '%Y%m%d%H%M%S'

    def config_from_url(self, url):
        """
        Build an ftp config dict from a url.

        :param url: url in form `ftp://username:password@host:port/dir`
        :return: dict with ``username``, ``password``, ``host`` and ``path`` keys;
                 the leading ``/`` is stripped from the path
        """
        parts = urlparse(url)
        return dict(
            username=parts.username,
            password=parts.password,
            host=parts.hostname,
            path=parts.path.lstrip('/'),
        )

    def _update(self, provider):
        """Download and parse new XML files from the FTP server.

        Files already present in ``config['dest_path']`` and files whose download fails
        are skipped.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :return: list with one list of parsed items per downloaded file
        :raises IngestFtpError: on any error not handled per file
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    # only regular files carrying the expected suffix are of interest
                    is_file = facts.get('type', '') == 'file'
                    if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if last_updated:
                        modified_at = datetime.strptime(facts['modify'], self.DATE_FORMAT)
                        if modified_at.replace(tzinfo=utc) < last_updated:
                            continue

                    target = os.path.join(config['dest_path'], filename)
                    try:
                        # 'x' mode fails if the file was already downloaded
                        with open(target, 'xb') as out:
                            try:
                                ftp.retrbinary('RETR %s' % filename, out.write)
                            except ftplib.all_errors:
                                os.remove(target)
                                logger.exception(
                                    'Exception retrieving from FTP server')
                                continue
                    except FileExistsError:
                        continue

                    # XML parsers get the parsed root element, other parsers the file path
                    if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                        source = etree.parse(target).getroot()
                    else:
                        source = target
                    parsed = self.get_feed_parser(provider, source).parse(source, provider)

                    items.append([parsed] if isinstance(parsed, dict) else parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 30
0
    def _update(self, provider, update):
        """Download new files from the FTP server and parse them into items.

        Files are skipped when their extension is not allowed or, if the provider has a
        ``last_updated`` value, when their ``modify`` fact is not newer than it. Each remaining
        file is downloaded to ``config['dest_path']``, parsed, and - when ``config['move']`` is
        set - renamed on the server into the success or failure directory.

        :param provider: ingest provider; ``config`` and ``last_updated`` are read from it
        :type provider: dict
        :param update: receives ``LAST_UPDATED`` set to the newest ``modify`` time seen
        :type update: dict
        :return: list with one list of parsed items per successfully parsed file
        :rtype: list
        :raises IngestFtpError: for any failure outside the per-file error handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        registered_parser = self.get_feed_parser(provider)
        try:
            allowed_ext = registered_parser.ALLOWED_EXT
        except AttributeError:
            allowed_ext = self.ALLOWED_EXT_DEFAULT
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            # NOTE(review): the provider form appears to store the success directory
            # under ``ftp_move_path``; ``move_path`` is still honoured so configurations
            # written with the old key keep working - confirm against the form fields.
            move_path = config.get('ftp_move_path') or config.get('move_path')
            if not move_path:
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                config.get('path', ''),
                move_path or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                config.get('path', ''),
                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not self._is_allowed(filename, allowed_ext):
                            logger.info(
                                'ignoring file {filename} because of file extension'
                                .format(filename=filename))
                            continue

                        if last_updated:
                            item_last_updated = datetime.strptime(
                                facts['modify'],
                                self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated <= last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated

                        local_file_path = os.path.join(config['dest_path'],
                                                       filename)
                        with open(local_file_path, 'wb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors:
                                # don't leave a truncated download behind
                                os.remove(local_file_path)
                                raise Exception(
                                    'Exception retrieving file from FTP server ({filename})'
                                    .format(filename=filename))

                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(
                                provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)

                        if isinstance(parsed, dict):
                            parsed = [parsed]

                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(
                                move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        # a single bad file must not abort the whole run
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(
                                move_dest_path_error, filename)
                            self._move(ftp, filename,
                                       move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemplo n.º 31
0
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

import os
import ftplib
import tempfile
from datetime import datetime
from superdesk.utc import utc
from superdesk.etree import etree
from superdesk.io import get_xml_parser, register_provider
from .ingest_service import IngestService
from superdesk.errors import IngestFtpError
# Descriptions of the two FTP ingest error kinds, collected at import time.
errors = []
errors.append(IngestFtpError.ftpUnknownParserError().get_error_description())
errors.append(IngestFtpError.ftpError().get_error_description())

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


class FTPService(IngestService):
    """FTP Ingest Service."""

    DATE_FORMAT = '%Y%m%d%H%M%S'
    FILE_SUFFIX = '.xml'