Example #1
    def restore_all_tables_for_all_dbs(self,
                                       ts: t.Union[dt.datetime, str],
                                       job_dir: t.Optional[t.Union[Path, str]] = None,
                                       truncate_first: bool = False) -> None:
        """Restore all tables for all databases"""
        ts = parse_timestamp(ts, raise_parse_error=True)

        for dbt in DBType:
            td = None
            try:
                if job_dir:
                    job_dir = Path(job_dir).resolve()
                    restore_job_dir = Path(job_dir, dbt.value)
                    # parents=True creates job_dir if needed; exist_ok=False guards
                    # against mixing files with an earlier run for the same db type
                    restore_job_dir.mkdir(parents=True, exist_ok=False)
                else:
                    td = TemporaryDirectory()
                    restore_job_dir = Path(td.name)

                self.restore_all_tables(
                    db_type=dbt,
                    ts=ts,
                    job_dir=restore_job_dir,
                    truncate_first=truncate_first
                )
            finally:
                if td:
                    td.cleanup()
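This method, like the backup/restore methods in later examples, uses a caller-supplied job_dir when given and otherwise falls back to a TemporaryDirectory that is cleaned up in the finally block. A minimal standalone sketch of that pattern (the actual work is elided):

from pathlib import Path
from tempfile import TemporaryDirectory
import typing as t


def run_with_job_dir(job_dir: t.Optional[t.Union[Path, str]] = None) -> None:
    """Sketch of the job_dir / TemporaryDirectory fallback used above."""
    td = None
    try:
        if job_dir:
            # use (and create, if needed) the directory the caller asked for
            job_dir = Path(job_dir).resolve()
            job_dir.mkdir(exist_ok=True)
        else:
            # no directory given: work in a temp dir and clean it up afterwards
            td = TemporaryDirectory()
            job_dir = Path(td.name)
        ...  # do the actual work inside job_dir
    finally:
        if td:
            td.cleanup()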
Example #2
    def backup_all_tables(self,
                          db_type: t.Union[DBType, str],
                          ts: t.Union[dt.datetime, str],
                          job_dir: t.Optional[t.Union[Path, str]] = None) -> None:
        """Backup all tables for given db_type to S3
        :param db_type: type of db - web/orch
        :param ts: backup timestamp that determines s3 backup path
        :param job_dir: directory used to store downloaded files - temp dir by default
        """
        db_type = DBType(db_type)
        ts = parse_timestamp(ts, raise_parse_error=True)
        backup_prefix = self.get_backup_prefix(db_type=db_type, ts=ts)

        if self.s3u.prefix_exists(backup_prefix):
            raise ValueError(f"Cannot backup to given timestamped prefix because it already exists: {backup_prefix}")

        try:
            td = None
            if job_dir:
                job_dir = Path(job_dir).resolve()
                job_dir.mkdir(exist_ok=True)
            else:
                td = TemporaryDirectory()
                job_dir = Path(td.name)

            print(f"Backing up tables to S3 {backup_prefix}.", file=sys.stderr)
            self.export_all_tables(db_type=db_type, export_base_dir=job_dir)
            self.s3u.upload_dir(local_dir=job_dir, prefix_path=backup_prefix)
        finally:
            if td:
                td.cleanup()
Example #3
    def restore_current_snapshot(
            self, snapshot_type: t.Union[SnapshotType, str],
            snapshot_ts: t.Union[dt.datetime, str]) -> str:
        """Restore current raw/parsed snapshot from one corresponding to a timestamp
        :param snapshot_type: type of snapshot - raw/parsed
        :param snapshot_ts: timestamp to use when figuring out what prefix to restore from
        :return: s3 prefix to the current snapshot
        """
        snapshot_type = SnapshotType(snapshot_type)
        snapshot_ts = parse_timestamp(snapshot_ts, raise_parse_error=True)
        current_prefix = self.get_current_prefix(snapshot_type)
        backup_prefix = self.get_backup_prefix_for_ts(
            snapshot_type=snapshot_type, ts=snapshot_ts)

        if not self.s3u.prefix_exists(backup_prefix):
            raise ValueError(
                f"Cannot restore backup prefix, it doesn't exist: {backup_prefix}"
            )
        if self.s3u.prefix_exists(current_prefix):
            print(
                f"Deleting current prefix prior to restore: {current_prefix} ...",
                file=sys.stderr)
            for obj_path in self.s3u.iter_object_paths_at_prefix(
                    current_prefix):
                self.s3u.delete_object(obj_path)

        print(
            f"Restoring current prefix - {current_prefix} - from backup at {backup_prefix} ...",
            file=sys.stderr)
        self.s3u.copy_prefix(src_prefix=backup_prefix,
                             dst_prefix=current_prefix)
        return current_prefix
Example #4
    def backup_current_snapshot(
        self,
        snapshot_type: t.Union[SnapshotType, str],
        snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
    ) -> t.Optional[str]:
        """Backup current raw/parsed snapshot to timestamped location in S3
        :param snapshot_type: type of snapshot - raw/parsed
        :param snapshot_ts: timestamp used to construct the backup prefix - defaults to now
        :return: s3 prefix to the backup
        """
        snapshot_type = SnapshotType(snapshot_type)
        # default to "now" at call time; a dt.datetime.now() default argument
        # would be evaluated just once, when the function is defined
        snapshot_ts = parse_timestamp(snapshot_ts or dt.datetime.now(),
                                      raise_parse_error=True)
        current_prefix = self.get_current_prefix(snapshot_type)
        backup_prefix = self.get_backup_prefix_for_ts(
            snapshot_type=snapshot_type, ts=snapshot_ts)

        if self.s3u.prefix_exists(backup_prefix):
            raise ValueError(
                f"Cannot backup current snapshot because corresponding prefix already exists: {backup_prefix}"
            )
        if not self.s3u.prefix_exists(current_prefix):
            print(
                f"Cannot backup current snapshot because there's nothing there: {current_prefix}",
                file=sys.stderr)
            return None

        print(
            f"Backing up current prefix {current_prefix} to archive {backup_prefix} ...",
            file=sys.stderr)
        self.s3u.copy_prefix(src_prefix=current_prefix,
                             dst_prefix=backup_prefix)
        return backup_prefix
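A hedged usage sketch combining the two snapshot methods above, wrapped in a helper so it stays self-contained; `manager` stands in for an instance of the class these methods belong to, which is not shown in these excerpts:

import datetime as dt


def backup_then_restore(manager, snapshot_type: str = "raw") -> None:
    """Back up the current snapshot, then restore it from that same timestamp."""
    ts = dt.datetime.now()
    backup_prefix = manager.backup_current_snapshot(snapshot_type=snapshot_type,
                                                    snapshot_ts=ts)
    if backup_prefix:
        # the same timestamp resolves to the same backup prefix on restore
        manager.restore_current_snapshot(snapshot_type=snapshot_type,
                                         snapshot_ts=ts)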
Example #5
    def process_db_doc_updates(self, idgs: t.Iterable[IngestableDocGroup],
                               ts: t.Union[dt.datetime, str]) -> None:
        """Process versioned_doc table db updates using given docs"""
        ts = parse_timestamp(ts=ts, raise_parse_error=True)

        with Config.connection_helper.orch_db_session_scope('rw') as session:
            for idg in idgs:
                if not idg.metadata_idoc:
                    continue

                metadata = idg.metadata_idoc.metadata
                existing_doc = VersionedDoc.get_existing_from_doc(
                    doc=metadata, session=session)
                if existing_doc:
                    session.add(existing_doc)
                else:
                    pub = Publication.get_or_create_from_document(
                        doc=metadata, session=session)
                    if pub:
                        session.add(pub)
                        vdoc = VersionedDoc.create_from_document(
                            doc=metadata,
                            pub=pub,
                            filename=idg.raw_idoc.local_path.name,
                            doc_location=idg.raw_idoc.s3_path or "",
                            batch_timestamp=ts)
                        session.add(vdoc)
            session.commit()
Example #6
def get_publication_date(doc_dict):
    """Return the publication date formatted as %Y-%m-%dT%H:%M:%S, or "" if missing/unparseable"""
    try:
        parsed_date = parse_timestamp(doc_dict.get("publication_date", None))
        if parsed_date:
            return parsed_date.strftime('%Y-%m-%dT%H:%M:%S')
    except Exception:
        pass
    return ""
Example #7
    def get_backup_prefix(self, db_type: t.Union[DBType, str], ts: t.Union[dt.datetime, str]) -> str:
        """Get S3 backup prefix for given db_type and timestamp"""
        db_type = DBType(db_type)
        ts = parse_timestamp(ts, raise_parse_error=True)
        ts_str = ts.strftime(Config.TIMESTAMP_FORMAT)

        return {
            DBType.WEB: self.db_backup_base_prefix + DBType.WEB.value + '/' + ts_str,
            DBType.ORCH: self.db_backup_base_prefix + DBType.ORCH.value + '/' + ts_str
        }[db_type]
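For illustration, the resulting prefix is simply the base prefix, the db type value, and the formatted timestamp concatenated. A standalone sketch using hypothetical stand-ins for db_backup_base_prefix and Config.TIMESTAMP_FORMAT:

import datetime as dt

# hypothetical values - the real ones come from the instance and Config
db_backup_base_prefix = "backups/db/"
timestamp_format = "%Y-%m-%dT%H-%M-%S"

ts = dt.datetime(2023, 5, 1, 12, 30, 0)
prefix = db_backup_base_prefix + "web" + "/" + ts.strftime(timestamp_format)
print(prefix)  # backups/db/web/2023-05-01T12-30-00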
Example #8
 def restore_all_current_snapshots(
     self, snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
 ) -> t.List[str]:
     """Restore current snapshots for all snapshot types"""
     # default to "now" at call time; a dt.datetime.now() default argument
     # would be evaluated just once, when the function is defined
     snapshot_ts = parse_timestamp(ts=snapshot_ts or dt.datetime.now(),
                                   raise_parse_error=True)
     restored_current_prefixes: t.List[str] = []
     for st in SnapshotType:
         s3_path = self.restore_current_snapshot(snapshot_type=st,
                                                 snapshot_ts=snapshot_ts)
         restored_current_prefixes.append(s3_path)
     return restored_current_prefixes
Example #9
 def backup_all_current_snapshots(
     self, snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
 ) -> t.List[str]:
     """Backup current snapshots for all snapshot types"""
     # default to "now" at call time; a dt.datetime.now() default argument
     # would be evaluated just once, when the function is defined
     snapshot_ts = parse_timestamp(ts=snapshot_ts or dt.datetime.now(),
                                   raise_parse_error=True)
     backed_up_snapshot_paths: t.List[str] = []
     for st in SnapshotType:
         s3_path = self.backup_current_snapshot(snapshot_type=st,
                                                snapshot_ts=snapshot_ts)
         if s3_path:
             backed_up_snapshot_paths.append(s3_path)
     return backed_up_snapshot_paths
Example #10
    def upload_docs_to_s3(self, idgs: t.Iterable[IngestableDocGroup],
                          ts: t.Union[dt.datetime, str],
                          max_threads: int) -> List[IngestableDocGroup]:
        """Upload all raw/parsed/metadata/thumbnail docs in each group to S3
        and return the doc groups with their s3_path fields filled in"""
        ts = parse_timestamp(ts, raise_parse_error=True)
        uploaded_doc_groups: List[IngestableDocGroup] = []

        def _upload_to_s3(idoc: GenericIngestableDoc, ts: dt.datetime) -> str:
            print(f"Uploading doc {idoc.local_path!s} to S3 ... ",
                  file=sys.stderr)
            s3_location = Config.s3_utils.upload_file(
                file=idoc.local_path,
                object_prefix=self.get_timestamped_archive_prefix_for_idoc(
                    idoc=idoc, ts=ts))
            return s3_location

        # max_threads < 0 means "use every available core".
        # NOT recommended: this grabs all computing power at once and may
        # overwhelm the machine when uploading a large directory.
        if max_threads < 0:
            max_workers = multiprocessing.cpu_count()

        # max_threads == 1 disables multithreading; larger values do
        # partitioned multithreading with that many workers
        elif max_threads >= 1:
            max_workers = max_threads

        # anything else (i.e. max_threads == 0) is invalid
        else:
            raise ValueError(
                f"Invalid max_threads value given: {max_threads}")

        def upload_group(idg: IngestableDocGroup) -> IngestableDocGroup:
            # runs in a worker thread; `ts` is captured from the enclosing
            # scope so that idgs only needs to be iterated once
            idg.raw_idoc.s3_path = _upload_to_s3(idg.raw_idoc, ts=ts)
            if idg.parsed_idoc:
                idg.parsed_idoc.s3_path = _upload_to_s3(idg.parsed_idoc, ts=ts)
            if idg.metadata_idoc:
                idg.metadata_idoc.s3_path = _upload_to_s3(idg.metadata_idoc,
                                                          ts=ts)
            if idg.thumbnail_idoc:
                idg.thumbnail_idoc.s3_path = _upload_to_s3(idg.thumbnail_idoc,
                                                           ts=ts)
            # return (not yield) so the uploads actually run inside the pool
            return idg

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for idg in executor.map(upload_group, idgs):
                uploaded_doc_groups.append(idg)

        return uploaded_doc_groups
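The upload loop above depends on executor.map actually running each upload inside a pool thread; a generator-style worker (one whose body contains yield) would defer all of its work to whichever thread later iterates the result. A small self-contained illustration of the return-based pattern:

from concurrent.futures import ThreadPoolExecutor


def work(item: int) -> int:
    # a plain return means this body executes in the worker thread;
    # a `yield` here would turn the body into lazily-run generator code
    return item * 2


with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(work, range(5)))

print(results)  # [0, 2, 4, 6, 8]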
Example #11
    def load(self, raw_dir: t.Union[Path, str],
             metadata_dir: t.Optional[t.Union[Path, str]],
             parsed_dir: t.Optional[t.Union[Path, str]],
             thumbnail_dir: t.Optional[t.Union[Path, str]],
             ingest_ts: t.Union[dt.datetime, str], max_threads: int,
             update_s3: bool, update_db: bool) -> None:
        """Process all doc/pub updates for eligible files"""
        ingest_ts = parse_timestamp(ts=ingest_ts, raise_parse_error=True)
        print(
            f"Running load:\n\traw_dir={raw_dir}\n\tmetadata_dir={metadata_dir}\n\tparsed_dir={parsed_dir}\n\t"
            f"thumbnail_dir={thumbnail_dir}")

        idgs = list(
            self.get_ingestable_docs(raw_dir=raw_dir,
                                     metadata_dir=metadata_dir,
                                     parsed_dir=parsed_dir,
                                     thumbnail_dir=thumbnail_dir))

        if update_db:
            print("Updating pub entries in 'publications' table ...",
                  file=sys.stderr)
            self.process_db_pub_updates(idgs=idgs)
        else:
            print("Skipping updates to 'publications' table ...",
                  file=sys.stderr)

        uploaded_idgs = None
        if update_s3:
            print("Uploading docs to S3 ...", file=sys.stderr)
            uploaded_idgs = list(
                self.upload_docs_to_s3(idgs=idgs,
                                       ts=ingest_ts,
                                       max_threads=max_threads))
        else:
            print("Skipping s3 uploads of docs ...", file=sys.stderr)

        if update_db:
            print("Updating pub entries in 'versioned_docs' table ...",
                  file=sys.stderr)
            self.process_db_doc_updates(idgs=uploaded_idgs or idgs,
                                        ts=ingest_ts)
        else:
            print("Skipping updates to 'versioned_docs' table ...",
                  file=sys.stderr)
Example #12
 def create_from_document(doc: Dict[str, Any], doc_location: str,
                          filename: str, batch_timestamp: dt.datetime,
                          pub: Publication) -> 'VersionedDoc':
     """Generate VersionedDoc from Document obj. and associated Publication"""
     return VersionedDoc(
         publication=pub,
         name=doc['doc_name'],
         type=doc['doc_type'],
         number=doc['doc_num'],
         # TODO: Pass actual filename using ProcessedDoc instead of Doc
         # TODO: Tweak for clones?
         filename=filename,
         doc_location=doc_location,
         batch_timestamp=batch_timestamp,
         publication_date=parse_timestamp(doc['publication_date']),
         json_metadata=doc,
         version_hash=doc['version_hash'],
         md5_hash="",
         is_ignored=False)
Example #13
    def get_checkpoint_ts(self,
                          checkpoint_path: str,
                          bucket: Optional[str] = None) -> Optional[datetime.datetime]:
        """Get timestamp from the checkpoint file
        :param checkpoint_path: Path to timestamp checkpoint file
        :param bucket: Bucket name
        :return: Timestamp from the checkpoint file, if one exists
        """

        bucket_name = bucket or self.bucket
        s3_resource = self.ch.s3_resource

        if not self.object_exists(object_path=checkpoint_path, bucket=bucket_name):
            return None

        response: Dict[str, Any] = s3_resource.Object(
            bucket_name,
            checkpoint_path
        ).get()

        ts_str = response['Body'].read().decode(encoding="utf-8")

        return parse_timestamp(ts_str)
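A hedged sketch of the corresponding write side, assuming the same boto3 resource handle; the real project may already have its own helper for this:

import datetime


def save_checkpoint_ts(s3_resource, bucket_name: str, checkpoint_path: str,
                       ts: datetime.datetime,
                       fmt: str = "%Y-%m-%dT%H:%M:%S") -> None:
    """Write a timestamp string to the checkpoint object (hypothetical helper)."""
    s3_resource.Object(bucket_name, checkpoint_path).put(
        Body=ts.strftime(fmt).encode("utf-8"))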
Example #14
    def restore_all_tables(self,
                           db_type: t.Union[DBType, str],
                           ts: t.Union[dt.datetime, str],
                           job_dir: t.Optional[t.Union[Path, str]] = None,
                           truncate_first: bool = False) -> None:
        """Restore all tables for db_type from given backup timestamp
        :param ts: backup timestamp that determines s3 backup path
        :param db_type: DB type - web/orch
        :param job_dir: directory used to store downloaded files - temp dir by default
        :param truncate_first: whether to truncate target db tables before importing data
        """
        db_type = DBType(db_type)
        ts = parse_timestamp(ts, raise_parse_error=True)
        backup_prefix = self.get_backup_prefix(db_type=db_type, ts=ts)

        if not self.s3u.prefix_exists(backup_prefix):
            raise ValueError(f"There is no backup at given prefix to import: {backup_prefix}")

        try:
            td = None
            if job_dir:
                job_dir = Path(job_dir).resolve()
                job_dir.mkdir(exist_ok=True)
            else:
                td = TemporaryDirectory()
                job_dir = Path(td.name)

            print(f"Restoring from backups at {backup_prefix}.", file=sys.stderr)
            self.s3u.download_dir(local_dir=job_dir, prefix_path=backup_prefix)
            if truncate_first:
                print("Truncating tables before import ...", file=sys.stderr)
                self.truncate_backup_tables(db_type=db_type)
            self.import_all_tables(db_type=db_type, import_base_dir=job_dir)
        finally:
            if td:
                td.cleanup()
Example #15
 def get_prefix_at_ts(base_prefix: str, ts: t.Union[dt.datetime, str], ts_fmt: str = TIMESTAMP_FORMAT) -> str:
     """Get prefix for a given timestamp"""
     ts = parse_timestamp(ts=ts, raise_parse_error=True)
     base_prefix = S3Utils.format_as_prefix(base_prefix)
     return S3Utils.path_join(base_prefix, ts.strftime(ts_fmt))
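A hedged example call; the exact output depends on TIMESTAMP_FORMAT and on how S3Utils.format_as_prefix and S3Utils.path_join join the pieces, neither of which is shown in these excerpts:

import datetime as dt

print(get_prefix_at_ts("archive/raw_docs", dt.datetime(2023, 5, 1, 12, 0, 0)))
# e.g. "archive/raw_docs/2023-05-01T12-00-00" with an assumed "%Y-%m-%dT%H-%M-%S" format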