def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
    """Download ``filename`` through ``ftp`` and parse the local copy.

    The file is written to ``config['dest_path']``; when that key is missing a
    new temporary directory is created and stored back into ``config``.

    :param ftp: connection object exposing ``retrbinary``
    :param config: provider configuration dict (gains ``dest_path`` if absent)
    :param filename: remote file name, reused as the local file name
    :param provider: ingest provider, forwarded to ``get_feed_parser`` and ``parse``
    :param registered_parser: decides whether the file is handed to the parser
        as a parsed XML root (``XMLFeedParser``) or as a local path
    :return: a one-element list holding the parse result; a ``dict`` result
        is wrapped in its own list first
    :raises Exception: when the transfer fails; the partial local file is
        deleted and the FTP error is attached as ``__cause__``
    """
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    local_file_path = os.path.join(config['dest_path'], filename)

    download_error = None
    with open(local_file_path, 'wb') as f:
        try:
            ftp.retrbinary('RETR %s' % filename, f.write)
        except ftplib.all_errors as err:
            download_error = err

    if download_error is not None:
        # Delete only once the handle is closed: removing a file that is
        # still open raises on Windows.
        os.remove(local_file_path)
        raise Exception(
            'Exception retrieving file from FTP server ({filename})'.format(filename=filename)
        ) from download_error

    if isinstance(registered_parser, XMLFeedParser):
        xml = etree.parse(local_file_path).getroot()
        parser = self.get_feed_parser(provider, xml)
        parsed = parser.parse(xml, provider)
    else:
        parser = self.get_feed_parser(provider, local_file_path)
        parsed = parser.parse(local_file_path, provider)

    if isinstance(parsed, dict):
        parsed = [parsed]
    return [parsed]
def _update(self, provider):
    """Download new files from the provider's FTP folder and parse them as XML.

    Connects with ``ftplib.FTP`` using ``host``, ``username``, ``password`` and
    ``path`` from the provider config. Every listed entry whose name ends with
    ``self.FILE_SUFFIX`` is downloaded into ``config['dest_path']`` and parsed.
    Entries modified before ``provider['last_updated']`` and entries whose
    local copy already exists are skipped.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list with one ``parse_message`` result per downloaded file
    """
    config = provider.get('config', {})
    since = provider.get('last_updated')
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    parsed_items = []
    with ftplib.FTP(config.get('host')) as ftp:
        ftp.login(config.get('username'), config.get('password'))
        ftp.cwd(config.get('path', ''))

        for name, facts in ftp.mlsd():
            if not name.endswith(self.FILE_SUFFIX):
                continue

            if since:
                modified = datetime.strptime(facts['modify'], self.DATE_FORMAT)
                if modified.replace(tzinfo=utc) < since:
                    continue

            target = '%s/%s' % (config['dest_path'], name)
            try:
                # 'xb' refuses to overwrite, so an existing local copy skips the entry.
                with open(target, 'xb') as out:
                    ftp.retrbinary('RETR %s' % name, out.write)
            except FileExistsError:
                continue

            root = etree.parse(target).getroot()
            parsed_items.append(get_xml_parser(root).parse_message(root))

    return parsed_items
def _update(self, provider, update):
    """Download and parse new files from the provider's FTP folder.

    Only listing entries of type ``file`` whose lower-cased name ends with
    ``self.FILE_SUFFIX`` are considered. When the provider has a
    ``last_updated`` value, entries modified before it are skipped and the
    newest modification time seen is written to ``update[LAST_UPDATED]``.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :param update: dict that receives ``LAST_UPDATED`` when a newer file was seen
    :return: list of parse results, one entry per downloaded file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: for any error other than a per-file transfer
        failure or an already existing local file, which are logged and skipped
    """
    config = provider.get('config', {})
    since = provider.get('last_updated')
    newest_seen = None
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    try:
        with ftp_connect(config) as ftp:
            items = []
            for filename, facts in ftp.mlsd():
                is_file = facts.get('type', '') == 'file'
                if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                    continue

                if since:
                    modified = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if modified < since:
                        continue
                    if not newest_seen or modified > newest_seen:
                        newest_seen = modified

                local_file_path = os.path.join(config['dest_path'], filename)
                try:
                    with open(local_file_path, 'xb') as out:
                        try:
                            ftp.retrbinary('RETR %s' % filename, out.write)
                        except ftplib.all_errors:
                            # Drop the partial download so the name is free again.
                            os.remove(local_file_path)
                            logger.exception('Exception retrieving from FTP server')
                            continue
                except FileExistsError:
                    logger.exception('Exception retrieving from FTP server, file already exists')
                    continue

                # XML parsers get the parsed root element, all others the local path.
                if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                    payload = etree.parse(local_file_path).getroot()
                else:
                    payload = local_file_path
                parsed = self.get_feed_parser(provider, payload).parse(payload, provider)
                items.append([parsed] if isinstance(parsed, dict) else parsed)

        if newest_seen:
            update[LAST_UPDATED] = newest_seen
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Download and parse new files from the provider's FTP folder.

    Listing entries are processed only when they are of type ``file``, their
    lower-cased name ends with ``self.FILE_SUFFIX`` and, if the provider has a
    ``last_updated`` value, they were not modified before it.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list of parse results, one entry per downloaded file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: for any error other than a per-file transfer
        failure (logged, file skipped) or an already existing local file (skipped)
    """
    config = provider.get('config', {})
    threshold = provider.get('last_updated')
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    try:
        with ftp_connect(config) as ftp:
            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file' or not filename.lower().endswith(self.FILE_SUFFIX):
                    continue

                if threshold:
                    modified = datetime.strptime(facts['modify'], self.DATE_FORMAT)
                    if modified.replace(tzinfo=utc) < threshold:
                        continue

                local_file_path = os.path.join(config['dest_path'], filename)
                try:
                    with open(local_file_path, 'xb') as out:
                        try:
                            ftp.retrbinary('RETR %s' % filename, out.write)
                        except ftplib.all_errors:
                            # Remove the partial file before moving on to the next entry.
                            os.remove(local_file_path)
                            logger.exception('Exception retrieving from FTP server')
                            continue
                except FileExistsError:
                    continue

                # XML parsers receive the parsed root element, others the local path.
                if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                    payload = etree.parse(local_file_path).getroot()
                else:
                    payload = local_file_path
                parsed = self.get_feed_parser(provider, payload).parse(payload, provider)
                items.append([parsed] if isinstance(parsed, dict) else parsed)
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Download and parse new files from the provider's FTP folder.

    Opens an ``ftplib.FTP`` connection from the provider config (``host``,
    ``username``, ``password``, ``path``, ``passive``). Listing entries are
    processed only when they are of type ``file``, their lower-cased name ends
    with ``self.FILE_SUFFIX`` and, if the provider has a ``last_updated``
    value, they were not modified before it.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list of parse results, one entry per downloaded file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: for any error other than a per-file transfer
        failure (logged, file skipped) or an already existing local file (skipped)
    """
    config = provider.get("config", {})
    cutoff = provider.get("last_updated")
    if "dest_path" not in config:
        config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")

    try:
        with ftplib.FTP(config.get("host")) as ftp:
            ftp.login(config.get("username"), config.get("password"))
            ftp.cwd(config.get("path", ""))
            ftp.set_pasv(config.get("passive", False))

            items = []
            for filename, facts in ftp.mlsd():
                if facts.get("type", "") != "file":
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if cutoff:
                    stamp = datetime.strptime(facts["modify"], self.DATE_FORMAT).replace(tzinfo=utc)
                    if stamp < cutoff:
                        continue

                local_file_path = os.path.join(config["dest_path"], filename)
                try:
                    with open(local_file_path, "xb") as sink:
                        try:
                            ftp.retrbinary("RETR %s" % filename, sink.write)
                        except ftplib.all_errors:
                            # Discard the partial download and carry on with the next entry.
                            os.remove(local_file_path)
                            logger.exception("Exception retrieving from FTP server")
                            continue
                except FileExistsError:
                    continue

                # XML parsers receive the parsed root element, others the local path.
                source = local_file_path
                if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                    source = etree.parse(local_file_path).getroot()
                result = self.get_feed_parser(provider, source).parse(source, provider)
                if isinstance(result, dict):
                    result = [result]
                items.append(result)
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Download new files from the provider's FTP folder and parse them as XML.

    Opens an ``ftplib.FTP`` connection from the provider config (``host``,
    ``username``, ``password``, ``path``, ``passive``). Listing entries are
    processed only when they are of type ``file``, their lower-cased name ends
    with ``self.FILE_SUFFIX`` and, if the provider has a ``last_updated``
    value, they were not modified before it. Entries whose local copy already
    exists are skipped.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list of ``parse_message`` results, one entry per downloaded file
        (a ``dict`` result is wrapped in a list)
    :raises IngestFtpError: when no parser matches a file or on any other error
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    items = []
    try:
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))
            ftp.set_pasv(config.get('passive', False))

            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(
                        facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue

                dest = os.path.join(config['dest_path'], filename)
                try:
                    local_file = open(dest, 'xb')
                except FileExistsError:
                    continue
                try:
                    with local_file:
                        ftp.retrbinary('RETR %s' % filename, local_file.write)
                except BaseException:
                    # A truncated file left behind would trigger FileExistsError
                    # on every later run and the entry would never be retried.
                    os.remove(dest)
                    raise

                xml = etree.parse(dest).getroot()
                parser = get_xml_parser(xml)
                if not parser:
                    raise IngestFtpError.ftpUnknownParserError(
                        Exception('Parser not found'), provider, filename)
                parsed = parser.parse_message(xml, provider)
                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Download and parse new files from the provider's FTP folder.

    Opens an ``ftplib.FTP`` connection from the provider config (``host``,
    ``username``, ``password``, ``path``, ``passive``). Listing entries are
    processed only when they are of type ``file``, their lower-cased name ends
    with ``self.FILE_SUFFIX`` and, if the provider has a ``last_updated``
    value, they were not modified before it. Entries whose local copy already
    exists are skipped.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list of parse results, one entry per downloaded file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: on any error, including a failed transfer
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    try:
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))
            ftp.set_pasv(config.get('passive', False))

            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue

                local_file_path = os.path.join(config['dest_path'], filename)
                try:
                    local_file = open(local_file_path, 'xb')
                except FileExistsError:
                    continue
                try:
                    with local_file:
                        ftp.retrbinary('RETR %s' % filename, local_file.write)
                except BaseException:
                    # A truncated file left behind would trigger FileExistsError
                    # on every later run and the entry would never be retried.
                    os.remove(local_file_path)
                    raise

                # XML parsers receive the parsed root element, others the local path.
                registered_parser = self.get_feed_parser(provider)
                if isinstance(registered_parser, XMLFeedParser):
                    xml = etree.parse(local_file_path).getroot()
                    parser = self.get_feed_parser(provider, xml)
                    parsed = parser.parse(xml, provider)
                else:
                    parser = self.get_feed_parser(provider, local_file_path)
                    parsed = parser.parse(local_file_path, provider)

                if isinstance(parsed, dict):
                    parsed = [parsed]
                items.append(parsed)
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider):
    """Download new files from the provider's FTP folder and parse them as XML.

    Opens an ``ftplib.FTP`` connection from the provider config (``host``,
    ``username``, ``password``, ``path``) and walks the ``(filename, facts)``
    pairs returned by ``self._get_items(ftp)``. Entries are processed only when
    they are of type ``file``, their lower-cased name ends with
    ``self.FILE_SUFFIX`` and, if the provider has a ``last_updated`` value,
    they were not modified before it. Entries whose local copy already exists
    are skipped.

    :param provider: ingest provider dict; its ``config`` gains a temporary
        ``dest_path`` when none is set
    :return: list of ``parse_message`` results, one per downloaded file
    :raises IngestFtpError: when no parser matches a file or on any other error
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    items = []
    try:
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))

            # Keep the remote listing apart from the result list: appending
            # parsed items to the sequence being iterated would feed them back
            # into the loop and return listing tuples to the caller.
            listing = self._get_items(ftp)
            for filename, facts in listing:
                if facts.get('type', '') != 'file':
                    continue
                if not filename.lower().endswith(self.FILE_SUFFIX):
                    continue
                if last_updated:
                    item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue

                dest = os.path.join(config['dest_path'], filename)
                try:
                    local_file = open(dest, 'xb')
                except FileExistsError:
                    continue
                try:
                    with local_file:
                        ftp.retrbinary('RETR %s' % filename, local_file.write)
                except BaseException:
                    # A truncated file left behind would trigger FileExistsError
                    # on every later run and the entry would never be retried.
                    os.remove(dest)
                    raise

                xml = etree.parse(dest).getroot()
                parser = get_xml_parser(xml)
                if not parser:
                    raise IngestFtpError.ftpUnknownParserError(Exception('Parser not found'), provider, filename)
                items.append(parser.parse_message(xml, provider))
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
    """Download ``filename`` through ``ftp``, parse it and log timings.

    The file is written to ``config['dest_path']``; when that key is missing a
    new temporary directory is created and stored back into ``config``.
    Elapsed times reported through ``self._log_msg`` come from the
    ``'retrieve_parse'`` entry of ``self._timer``.

    :param ftp: connection object exposing ``retrbinary``
    :param config: provider configuration dict (gains ``dest_path`` if absent)
    :param filename: remote file name, reused as the local file name
    :param provider: ingest provider, forwarded to ``get_feed_parser`` and ``parse``
    :param registered_parser: decides whether the file is handed to the parser
        as a parsed XML root (``XMLFeedParser``) or as a local path
    :return: the parse result; a ``dict`` result is wrapped in a list
    :raises Exception: when the transfer fails; the partial local file is
        deleted and the FTP error is attached as ``__cause__``
    """
    self._timer.start('retrieve_parse')

    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    local_file_path = os.path.join(config['dest_path'], filename)

    download_error = None
    with open(local_file_path, 'wb') as f:
        try:
            ftp.retrbinary('RETR %s' % filename, f.write)
            # Flush first: bytes still held in the write buffer are not yet
            # counted by getsize(), which would under-report the size.
            f.flush()
            self._log_msg(
                "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}.".format(
                    self._timer.split('retrieve_parse'), os.path.getsize(local_file_path), filename
                )
            )
        except ftplib.all_errors as err:
            self._log_msg(
                "Download failed. Exec time: {:.4f} secs. File: {}.".format(
                    self._timer.stop('retrieve_parse'), filename
                )
            )
            download_error = err

    if download_error is not None:
        # Delete only once the handle is closed: removing a file that is
        # still open raises on Windows.
        os.remove(local_file_path)
        raise Exception('Exception retrieving file from FTP server ({filename})'.format(
            filename=filename)) from download_error

    if isinstance(registered_parser, XMLFeedParser):
        xml = etree.parse(local_file_path).getroot()
        parser = self.get_feed_parser(provider, xml)
        parsed = parser.parse(xml, provider)
    else:
        parser = self.get_feed_parser(provider, local_file_path)
        parsed = parser.parse(local_file_path, provider)

    self._log_msg(
        "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
            self._timer.stop('retrieve_parse'), filename
        )
    )
    return [parsed] if isinstance(parsed, dict) else parsed
def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
    """Download ``filename`` through ``ftp``, parse it and log timings.

    The file is written to ``config["dest_path"]``; when that key is missing a
    new temporary directory is created and stored back into ``config``.
    Elapsed times reported through ``self._log_msg`` come from the
    ``"retrieve_parse"`` entry of ``self._timer``.

    :param ftp: connection object exposing ``retrbinary``
    :param config: provider configuration dict (gains ``dest_path`` if absent)
    :param filename: remote file name, reused as the local file name
    :param provider: ingest provider, forwarded to ``get_feed_parser`` and ``parse``
    :param registered_parser: decides whether the file is handed to the parser
        as a parsed XML root (``XMLFeedParser``) or as a local path
    :return: the parse result; a ``dict`` result is wrapped in a list
    :raises EmptyFile: when ``self._is_empty`` reports the downloaded file as empty
    :raises Exception: when the transfer fails; the partial local file is
        deleted and the FTP error is attached as ``__cause__``
    """
    self._timer.start("retrieve_parse")

    if "dest_path" not in config:
        config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")
    local_file_path = os.path.join(config["dest_path"], filename)

    download_error = None
    with open(local_file_path, "wb") as f:
        try:
            ftp.retrbinary("RETR %s" % filename, f.write)
            # Flush first: bytes still held in the write buffer are not yet
            # counted by getsize(), which would under-report the size.
            f.flush()
            self._log_msg(
                "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}.".format(
                    self._timer.split("retrieve_parse"), os.path.getsize(local_file_path), filename))
        except ftplib.all_errors as err:
            self._log_msg(
                "Download failed. Exec time: {:.4f} secs. File: {}.".format(
                    self._timer.stop("retrieve_parse"), filename))
            download_error = err

    if download_error is not None:
        # Delete only once the handle is closed: removing a file that is
        # still open raises on Windows.
        os.remove(local_file_path)
        raise Exception(
            "Exception retrieving file from FTP server ({filename})".format(filename=filename)
        ) from download_error

    if self._is_empty(local_file_path):
        logger.info("ignoring empty file {filename}".format(filename=filename))
        raise EmptyFile(local_file_path)

    if isinstance(registered_parser, XMLFeedParser):
        xml = etree.parse(local_file_path).getroot()
        parser = self.get_feed_parser(provider, xml)
        parsed = parser.parse(xml, provider)
    else:
        parser = self.get_feed_parser(provider, local_file_path)
        parsed = parser.parse(local_file_path, provider)

    self._log_msg(
        "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
            self._timer.stop("retrieve_parse"), filename))
    return [parsed] if isinstance(parsed, dict) else parsed
def _update(self, provider, update):
    """Download and parse new files from the provider's FTP folder.

    Files are fetched into ``config['dest_path']`` (a temporary directory is
    created when the key is missing), parsed, and optionally moved on the
    server: to the success path after parsing, to the error path when
    anything fails for that file.

    :param provider: ingest provider dict (``config`` and ``last_updated`` are read)
    :param update: dict that receives ``LAST_UPDATED`` when a newer file was seen
    :return: list of parse results, one entry per processed file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: for errors raised outside the per-file handling
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    registered_parser = self.get_feed_parser(provider)
    # Parsers may declare their own ALLOWED_EXT; otherwise use the service default.
    try:
        allowed_ext = registered_parser.ALLOWED_EXT
    except AttributeError:
        allowed_ext = self.ALLOWED_EXT_DEFAULT
    crt_last_updated = None
    if config.get('move', False):
        do_move = True
        if not config.get('move_path'):
            logger.debug('missing move_path, default will be used')
        move_dest_path = os.path.join(
            config.get('path', ''),
            config.get('move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_dest_path_error = os.path.join(
            config.get('path', ''),
            config.get('move_path_error') or DEFAULT_FAILURE_PATH)
    else:
        # NOTE: move_dest_path / move_dest_path_error stay unbound here; they
        # are only read behind ``if do_move`` checks below.
        do_move = False
    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
    try:
        with ftp_connect(config) as ftp:
            if do_move:
                try:
                    self._create_if_missing(ftp, move_dest_path)
                    self._create_if_missing(ftp, move_dest_path_error)
                except ftplib.all_errors as e:
                    # Moving is best-effort: ingest continues without it.
                    logger.warning(
                        "Can't create move directory, files will not be moved: {reason}"
                        .format(reason=e))
                    do_move = False
            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue
                # Everything below is per-file: an exception is logged, the file
                # is moved to the error path (if moving is on) and the loop goes on.
                try:
                    if not self._is_allowed(filename, allowed_ext):
                        logger.info(
                            'ignoring file {filename} because of file extension'
                            .format(filename=filename))
                        continue
                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        # Files modified at or before the stored timestamp are skipped.
                        if item_last_updated <= last_updated:
                            continue
                        elif not crt_last_updated or item_last_updated > crt_last_updated:
                            # Remember the newest modification time seen in this run.
                            crt_last_updated = item_last_updated
                    local_file_path = os.path.join(config['dest_path'], filename)
                    # 'wb' overwrites any previous local copy with the same name.
                    with open(local_file_path, 'wb') as f:
                        try:
                            ftp.retrbinary('RETR %s' % filename, f.write)
                        except ftplib.all_errors:
                            # NOTE(review): the file is removed while still open;
                            # this may fail on Windows - confirm target platforms.
                            os.remove(local_file_path)
                            raise Exception(
                                'Exception retrieving file from FTP server ({filename})'
                                .format(filename=filename))
                    # XML parsers receive the parsed root element, others the local path.
                    if isinstance(registered_parser, XMLFeedParser):
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        parser = self.get_feed_parser(
                            provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]
                    items.append(parsed)
                    if do_move:
                        move_dest_file_path = os.path.join(
                            move_dest_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    logger.error(
                        "Error while parsing {filename}: {msg}".format(
                            filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(
                            move_dest_path_error, filename)
                        # An error raised by this move is not handled per file;
                        # it reaches the outer handlers and aborts the update.
                        self._move(ftp, filename, move_dest_file_path_error)
        if crt_last_updated:
            update[LAST_UPDATED] = crt_last_updated
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)
def _update(self, provider, update):
    """Download and parse new files from the provider's FTP folder.

    Files are fetched into ``config['dest_path']`` (a temporary directory is
    created when the key is missing), parsed, and optionally moved on the
    server: to the success path after parsing, to the error path when
    anything fails for that file (wrong suffix, transfer error, an already
    existing local copy, parse error).

    :param provider: ingest provider dict (``config`` and ``last_updated`` are read)
    :param update: dict that receives ``LAST_UPDATED`` when a newer file was seen
    :return: list of parse results, one entry per processed file (a ``dict``
        result is wrapped in a list)
    :raises IngestFtpError: for errors raised outside the per-file handling
    """
    config = provider.get('config', {})
    last_updated = provider.get('last_updated')
    crt_last_updated = None

    if config.get('move', False):
        do_move = True
        if not config.get('move_path'):
            logger.debug('missing move_path, default will be used')
        move_dest_path = os.path.join(config.get('path', ''), config.get('move_path') or DEFAULT_SUCCESS_PATH)
        if not config.get('move_path_error'):
            logger.debug('missing move_path_error, default will be used')
        move_dest_path_error = os.path.join(config.get('path', ''),
                                            config.get('move_path_error') or DEFAULT_FAILURE_PATH)
    else:
        # move_dest_path / move_dest_path_error are only read behind ``if do_move``.
        do_move = False

    if 'dest_path' not in config:
        config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

    try:
        with ftp_connect(config) as ftp:
            if do_move:
                try:
                    self._create_if_missing(ftp, move_dest_path)
                    self._create_if_missing(ftp, move_dest_path_error)
                except ftplib.all_errors as e:
                    # Moving is best-effort: ingest continues without it.
                    logger.warning("Can't create move directory, files will not be moved: {reason}".format(
                        reason=e))
                    do_move = False

            items = []
            for filename, facts in ftp.mlsd():
                if facts.get('type', '') != 'file':
                    continue

                # Everything below is per-file: an exception is logged, the file
                # is moved to the error path (if moving is on) and the loop goes on.
                try:
                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        # Raise explicitly so the file still goes through the error
                        # handling below, but with a message naming the real reason
                        # (a bare ``raise`` here produced an unrelated RuntimeError).
                        raise Exception('File does not have the expected suffix ({filename})'.format(
                            filename=filename))

                    if last_updated:
                        item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue
                        elif not crt_last_updated or item_last_updated > crt_last_updated:
                            # Remember the newest modification time seen in this run.
                            crt_last_updated = item_last_updated

                    local_file_path = os.path.join(config['dest_path'], filename)
                    try:
                        with open(local_file_path, 'xb') as f:
                            try:
                                ftp.retrbinary('RETR %s' % filename, f.write)
                            except ftplib.all_errors as ex:
                                os.remove(local_file_path)
                                raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                                    filename=filename)) from ex
                    except FileExistsError as e:
                        # The placeholder was previously written as ``{filename]``,
                        # which made ``format`` raise ValueError instead.
                        raise Exception('Exception retrieving from FTP server, file already exists ({filename})'
                                        .format(filename=local_file_path)) from e

                    # XML parsers receive the parsed root element, others the local path.
                    registered_parser = self.get_feed_parser(provider)
                    if isinstance(registered_parser, XMLFeedParser):
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        parser = self.get_feed_parser(provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)

                    if isinstance(parsed, dict):
                        parsed = [parsed]
                    items.append(parsed)

                    if do_move:
                        move_dest_file_path = os.path.join(move_dest_path, filename)
                        self._move(ftp, filename, move_dest_file_path)
                except Exception as e:
                    logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=e))
                    if do_move:
                        move_dest_file_path_error = os.path.join(move_dest_path_error, filename)
                        self._move(ftp, filename, move_dest_file_path_error)

        if crt_last_updated:
            update[LAST_UPDATED] = crt_last_updated
        return items
    except IngestFtpError:
        raise
    except Exception as ex:
        raise IngestFtpError.ftpError(ex, provider)