Example #1
    def extract_and_populate_data(
            self,
            content: Union[str, Iterable[str]],
            ingest_info: Optional[IngestInfo] = None) -> IngestInfo:
        """Parses the CSV |content| row by row, merging each row into
        |ingest_info| (a new IngestInfo is created if none is provided)."""
        if not isinstance(content, str):
            raise DirectIngestError(
                msg=f"{content} is not a string",
                error_type=DirectIngestErrorType.READ_ERROR)

        if not ingest_info:
            ingest_info = IngestInfo()

        rows = csv.DictReader(content.splitlines())
        for row_index, row in enumerate(rows):
            row_ii = IngestInfo()
            for k, v in row.items():
                if k not in self.all_keys:
                    raise ValueError("Unmapped key: [%s]" % k)

                if not v:
                    continue

                self._set_value_if_key_exists(k, v, row_ii, defaultdict(set),
                                              {})

            try:
                self._merge_row_into_ingest_info(ingest_info, row_ii)
            except DirectIngestError as e:
                raise DirectIngestError(
                    msg=f"While parsing CSV row {row_index + 1}: " + str(e),
                    error_type=DirectIngestErrorType.READ_ERROR)

        return ingest_info
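For context, a minimal usage sketch for this extractor; the CSV columns and ids below are invented, and `extractor` stands for an already-configured CsvDataExtractor instance (see Example #23 for how one is built in practice):

# Hypothetical usage sketch; column names and ids are placeholders.
csv_content = (
    "person_id,booking_id,charge_name\n"
    "123,b-1,THEFT\n"
    "123,b-1,FRAUD\n"
)
ingest_info = extractor.extract_and_populate_data(csv_content)
# Rows sharing ids are merged via _merge_row_into_ingest_info, so the result
# holds one person with one booking and two charges.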
Example #2
    def _postprocess_ingest_info(self, ingest_info):
        """Validate the ingest info and extract some fields (e.g., charge)
        that are packed as HTML into a single field.
        """
        def replace_html_tags(in_str, replacement=''):
            return re.sub(r'<[^>]*>', replacement, in_str)

        for person in ingest_info.people:
            if len(person.bookings) != 1:
                raise DirectIngestError(
                    msg="Person did not have exactly one booking as expected.",
                    error_type=DirectIngestErrorType.PARSE_ERROR)

            booking = person.bookings[0]

            if booking.arrest and booking.arrest.agency:
                booking.arrest.agency = replace_html_tags(
                    booking.arrest.agency, '/')

            if not booking.charges:
                continue

            if len(booking.charges) != 1:
                raise DirectIngestError(
                    msg="Booking did not have exactly one charge as expected.",
                    error_type=DirectIngestErrorType.PARSE_ERROR)

            charge = booking.charges[0]
            if charge.name:
                charge_html = charge.name

                bond = charge.bond

                booking.charges = []
                charges = charge_html.split('<TR>')[1:]
                for charge_row in charges:
                    try:
                        (_, case_number, charge_date, charge_status,
                         charge_names,
                         *last) = re.sub(r'(<>)\1+', '<>',
                                         replace_html_tags(charge_row,
                                                           '<>')).split('<>')
                        if len(last) > 1:
                            raise DirectIngestError(
                                msg="Found more columns than expected in "
                                "charge row",
                                error_type=DirectIngestErrorType.PARSE_ERROR)

                    except ValueError as e:
                        # A charge field right at the 254/255-character column
                        # limit was likely truncated, so a final unparseable
                        # row is expected here; skip it.
                        if len(charge_html) in (254, 255):
                            continue
                        raise e

                    if charge_status != 'In County':
                        booking.create_hold(jurisdiction_name=charge_status)
                    for charge_name in charge_names.split(';'):
                        booking.create_charge(name=charge_name,
                                              offense_date=charge_date,
                                              case_number=case_number,
                                              bond=bond)
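As a quick illustration of the `replace_html_tags` helper above (the sample strings are made up):

# Hypothetical inputs, only to show what replace_html_tags does.
replace_html_tags('<b>THEFT</b>')             # -> 'THEFT'
replace_html_tags('COUNTY<BR>SHERIFF', '/')   # -> 'COUNTY/SHERIFF'
# The charge parsing above uses the same helper with '<>' as the replacement
# so that each charge row can be split into its columns.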
Example #3
    def _parse_and_persist_contents(self, args: IngestArgsType,
                                    contents_handle: ContentsHandleType):
        """
        Runs the full ingest process for this controller for files with
        non-empty contents.
        """
        ingest_info = self._parse(args, contents_handle)
        if not ingest_info:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="No IngestInfo after parse.")

        logging.info("Successfully parsed data for ingest run [%s]",
                     self._job_tag(args))

        ingest_info_proto = \
            ingest_utils.convert_ingest_info_to_proto(ingest_info)

        logging.info(
            "Successfully converted ingest_info to proto for ingest "
            "run [%s]", self._job_tag(args))

        ingest_metadata = self._get_ingest_metadata(args)
        persist_success = persistence.write(ingest_info_proto, ingest_metadata)

        if not persist_success:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
                msg="Persist step failed")

        logging.info("Successfully persisted for ingest run [%s]",
                     self._job_tag(args))
Example #4
def controller_for_region_code(
        region_code: str,
        allow_unlaunched: bool = False) -> BaseDirectIngestController:
    """Returns an instance of the region's controller, if one exists."""
    if region_code not in get_supported_direct_ingest_region_codes():
        raise DirectIngestError(
            msg=
            f"Unsupported direct ingest region [{region_code}] in project [{metadata.project_id()}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    try:
        region = regions.get_region(region_code, is_direct_ingest=True)
    except FileNotFoundError as e:
        raise DirectIngestError(
            msg=f"Region [{region_code}] has no registered manifest",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        ) from e

    if not allow_unlaunched and not region.is_ingest_launched_in_env():
        check_is_region_launched_in_env(region)

    controller = region.get_ingestor()

    if not isinstance(controller, BaseDirectIngestController):
        raise DirectIngestError(
            msg=
            f"Controller for direct ingest region [{region_code}] has unexpected type [{type(controller)}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    return controller
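A hedged usage sketch; `'us_nd'` below is only an example of a supported region code, and the isinstance check mirrors the pattern used by the request handlers in the later examples:

# Sketch only: 'us_nd' stands in for any supported direct ingest region code.
controller = controller_for_region_code('us_nd', allow_unlaunched=True)
if isinstance(controller, GcsfsDirectIngestController):
    controller.kick_scheduler(just_finished_job=False)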
Example #5
    def _merge_row_into_ingest_info(self, ingest_info, row_ii):
        """Merges the single-row IngestInfo |row_ii| into the accumulated
        |ingest_info|, matching on person id and booking id."""
        row_person = scraper_utils.one('person', row_ii)
        existing_person = ingest_info.get_person_by_id(row_person.person_id)
        if not existing_person:
            ingest_info.people.append(row_person)
            return

        if len(row_person.bookings) != 1:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="Exactly one booking must be on each row.")
        row_booking = row_person.bookings[0]

        existing_booking = existing_person.get_booking_by_id(
            row_booking.booking_id)
        if not existing_booking:
            existing_person.bookings.append(row_booking)
            return

        if len(row_booking.charges) != 1:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="Exactly one charge must be on each row.")
        row_charge = row_booking.charges[0]
        existing_booking.charges.append(row_charge)
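To make the merge semantics concrete, a small hypothetical sketch; the ids are invented, the IngestInfo builder helpers (create_person, create_booking, create_charge) are assumed to be available, and `extractor` stands for the extractor instance:

# Sketch only: two single-row IngestInfo objects for the same person.
row_1 = IngestInfo()
person = row_1.create_person(person_id='123')
booking = person.create_booking(booking_id='b-1')
booking.create_charge(name='THEFT')

row_2 = IngestInfo()
person = row_2.create_person(person_id='123')
booking = person.create_booking(booking_id='b-1')
booking.create_charge(name='FRAUD')

merged = IngestInfo()
extractor._merge_row_into_ingest_info(merged, row_1)
extractor._merge_row_into_ingest_info(merged, row_2)
# merged now holds one person with one booking and two charges; a row with
# more (or fewer) than one booking or charge would raise a PARSE_ERROR.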
Example #6
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all file paths in the ingest buckets for all direct ingest states have properly normalized
    file names, to ensure that repeat uploads of files into those buckets don't fail or overwrite data."""
    logging.info(
        "Received request for direct ingest ensure_all_file_paths_normalized: "
        "%s",
        request.values,
    )

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            try:
                controller = controller_for_region_code(region_code,
                                                        allow_unlaunched=True)
            except DirectIngestError as e:
                raise e
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region, can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
Example #7
    def _read_contents(self, args: GcsfsIngestArgs) -> Optional[Iterable[str]]:
        if not args.file_path:
            raise DirectIngestError(
                msg=f"File path not set for job [{self._job_tag(args)}]",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        if not self.fs.exists(args.file_path):
            logging.info(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted", args.file_path)
            return None

        # TODO(1840): Turn this into a generator that only reads / yields lines
        #  one at a time so we don't hold entire large files in memory. NOTE:
        #  calling fp.readline() does a GET request every time, so this impl
        #  would have to be smarter about calling read() in chunks.
        with self.fs.open(args.file_path) as fp:
            now = datetime.datetime.now()
            logging.info(
                "Opened path [%s] - now reading contents (time: [%s]).",
                args.file_path, now.isoformat())
            binary_contents = fp.read()
            now = datetime.datetime.now()
            logging.info(
                "Finished reading binary contents for path [%s] (time: [%s]), "
                "now decoding.",
                args.file_path, now.isoformat())

        return binary_contents.decode('utf-8').splitlines()
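A rough sketch of the chunked-read generator the TODO above describes; the helper name and chunk size are illustrative rather than part of the existing code, and it only assumes the file object supports read(size):

def _iter_lines_in_chunks(fp, chunk_size=1024 * 1024):
    """Illustrative only: yields decoded lines without holding the whole file
    in memory, by reading the GCS file object in large chunks (one GET per
    read) and carrying any partial trailing line into the next chunk."""
    remainder = b''
    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            break
        lines = (remainder + chunk).split(b'\n')
        remainder = lines.pop()  # possibly incomplete last line
        for line in lines:
            yield line.decode('utf-8')
    if remainder:
        yield remainder.decode('utf-8')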
Example #8
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info('Received request to process direct ingest job: [%s]',
                 request.values)
    region_code = get_str_param_value('region', request.values)

    if not region_code:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _get_ingest_args(json_data)

        if not ingest_args:
            return 'Could not parse ingest args', HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                if not ingest_args:
                    raise DirectIngestError(
                        msg=f"process_job was called with no IngestArgs.",
                        error_type=DirectIngestErrorType.INPUT_ERROR)

                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.run_ingest_job_and_kick_scheduler_on_completion(
                ingest_args)
    return '', HTTPStatus.OK
Example #9
def handle_sftp_files() -> Tuple[str, HTTPStatus]:
    """Schedules the SFTP downloads into the appropriate cloud task queue."""
    logging.info("Received request for handling SFTP files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code, ingest_instance=None):
        try:
            region = _region_for_region_code(region_code)
            direct_ingest_cloud_task_manager = DirectIngestCloudTaskManagerImpl(
            )
            direct_ingest_cloud_task_manager.create_direct_ingest_sftp_download_task(
                region)
        except FileNotFoundError as e:
            raise DirectIngestError(
                msg=f"Region [{region_code}] has no registered manifest",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            ) from e

    return "", HTTPStatus.OK
Example #10
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    logging.info(
        'Received request for direct ingest ensure_all_file_paths_normalized: '
        '%s', request.values)

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            try:
                controller = controller_for_region_code(region_code,
                                                        allow_unlaunched=True)
            except DirectIngestError as e:
                raise e
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR)

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.\
                create_direct_ingest_handle_new_files_task(
                    controller.region, can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK
Example #11
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Will trigger a job that deals with normalizing and splitting the
    file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value('region', request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value('bucket', request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value('relative_file_path',
                                             request.args,
                                             preserve_case=True)
    start_ingest = \
        get_bool_param_value('start_ingest', request.args, default=False)

    if not region_code or not bucket \
            or not relative_file_path or start_ingest is None:
        return f'Bad parameters [{request.args}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        controller = controller_for_region_code(region_code,
                                                allow_unlaunched=True)
        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return '', HTTPStatus.OK
Example #12
def filename_parts_from_path(file_path: GcsfsFilePath) -> GcsfsFilenameParts:
    match = re.match(_FILEPATH_REGEX, file_path.file_name)
    if not match:
        raise DirectIngestError(
            msg=f"Could not parse upload_ts, file_tag, extension "
            f"from path [{file_path.abs_path()}]",
            error_type=DirectIngestErrorType.INPUT_ERROR)

    full_upload_timestamp_str = match.group(2)
    utc_upload_datetime = \
        datetime.datetime.fromisoformat(full_upload_timestamp_str)

    filename_suffix = match.group(6)
    is_file_split = False
    file_split_size = None
    if filename_suffix:
        filename_suffix_file_split_match = \
            re.match(_FILENAME_SUFFIX_REGEX, filename_suffix)
        if filename_suffix_file_split_match is not None:
            is_file_split = True
            file_split_size_str = filename_suffix_file_split_match.group(3)
            file_split_size = \
                int(file_split_size_str) if file_split_size_str else None

    return GcsfsFilenameParts(
        processed_state=match.group(1),
        utc_upload_datetime=utc_upload_datetime,
        date_str=utc_upload_datetime.date().isoformat(),
        file_tag=match.group(3),
        filename_suffix=filename_suffix,
        extension=match.group(8),
        is_file_split=is_file_split,
        file_split_size=file_split_size,
    )
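A brief usage sketch; `path` below stands for a GcsfsFilePath whose file name matches the normalized pattern that _FILEPATH_REGEX expects (the regex itself is not shown in this example):

# Sketch only: `path` is assumed to point at a properly normalized file.
parts = filename_parts_from_path(path)
logging.info(
    "tag=[%s] uploaded=[%s] file_split=[%s]",
    parts.file_tag,
    parts.utc_upload_datetime.isoformat(),
    parts.is_file_split,
)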
Example #13
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info('Received request for direct ingest handle_new_files: %s',
                 request.values)
    region_code = get_str_param_value('region', request.values)
    can_start_ingest = \
        get_bool_param_value('can_start_ingest', request.values, default=False)

    if not region_code or can_start_ingest is None:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        try:
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
        except DirectIngestError as e:
            if e.is_bad_request():
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK
Example #14
    def build(
        cls, *, ingest_bucket_path: GcsfsBucketPath, allow_unlaunched: bool
    ) -> BaseDirectIngestController:
        """Retrieve a direct ingest GcsfsDirectIngestController associated with a
        particular ingest bucket.

        Returns:
            An instance of the region's direct ingest controller class (e.g.,
             UsNdController) that can run ingest operations for the ingest instance
             associated with the input bucket.
        """
        region_code = get_region_code_from_direct_ingest_bucket(
            ingest_bucket_path.bucket_name
        )

        if (
            region_code is None
            or region_code not in get_supported_direct_ingest_region_codes()
        ):
            raise DirectIngestError(
                msg=f"Unsupported direct ingest region [{region_code}] in "
                f"project [{metadata.project_id()}]",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        region = cls._region_for_bucket(ingest_bucket_path)
        if not allow_unlaunched and not region.is_ingest_launched_in_env():
            check_is_region_launched_in_env(region)

        controller_class = cls.get_controller_class(region)
        controller = controller_class(ingest_bucket_path=ingest_bucket_path)
        if not isinstance(controller, BaseDirectIngestController):
            raise ValueError(f"Unexpected controller class type [{type(controller)}]")

        return controller
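A minimal usage sketch, assuming an ingest bucket whose name follows the direct ingest naming convention; the bucket name below is invented:

# Hypothetical bucket; the project and region in the name are made up.
bucket_path = GcsfsBucketPath('recidiviz-staging-direct-ingest-state-us-xx')
controller = DirectIngestControllerFactory.build(
    ingest_bucket_path=bucket_path, allow_unlaunched=False)
# e.g. restart scheduling for this ingest instance (see Example #25).
controller.kick_scheduler(just_finished_job=False)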
Example #15
def _region_for_region_code(region_code: str) -> Region:
    try:
        return regions.get_region(region_code.lower(), is_direct_ingest=True)
    except FileNotFoundError as e:
        raise DirectIngestError(
            msg=f"Region [{region_code}] has no registered manifest",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        ) from e
Example #16
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=
                f"ingest_view_export was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example #17
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
Example #18
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, IngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not ingest_args:
            return "Could not parse ingest args", HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example #19
def check_is_region_launched_in_env(region: Region) -> None:
    """Checks if direct ingest has been launched for the provided |region| in the current GAE env and throws if it has
    not."""
    if not region.is_ingest_launched_in_env():
        gae_env = environment.get_gae_environment()
        error_msg = f'Bad environment [{gae_env}] for region [{region.region_code}].'
        logging.error(error_msg)
        raise DirectIngestError(
            msg=error_msg, error_type=DirectIngestErrorType.ENVIRONMENT_ERROR)
Example #20
    def _parse_and_persist_contents(self, args: IngestArgsType,
                                    contents: ContentsType):
        """
        Runs the full ingest process for this controller for files with
        non-empty contents.
        """
        ingest_info = self._parse(args, contents)
        # TODO(1738): implement retry on fail.
        if not ingest_info:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="No IngestInfo after parse.")

        logging.info("Successfully parsed data for ingest run [%s]",
                     self._job_tag(args))

        ingest_info_proto = \
            ingest_utils.convert_ingest_info_to_proto(ingest_info)

        logging.info(
            "Successfully converted ingest_info to proto for ingest "
            "run [%s]", self._job_tag(args))

        ingest_metadata = IngestMetadata(self.region.region_code,
                                         self.region.jurisdiction_id,
                                         args.ingest_time,
                                         self.get_enum_overrides(),
                                         self.system_level)
        persist_success = persistence.write(ingest_info_proto, ingest_metadata)

        if not persist_success:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
                msg="Persist step failed")

        logging.info("Successfully persisted for ingest run [%s]",
                     self._job_tag(args))
Example #21
    def _get_next_job_args(self) -> Optional[IngestArgs]:
        df = pd.read_sql_query('SELECT MIN(export_time) FROM booking',
                               self._create_engine())
        # The query returns a single MIN(export_time) value.
        ingest_time = df.iloc[0, 0]
        if not ingest_time:
            logging.info("No more export times - successfully persisted all "
                         "data exports.")
            return None
        if ingest_time in self.scheduled_ingest_times:
            raise DirectIngestError(
                msg=f"Received a second job for ingest time [{ingest_time}]. "
                "Did the previous job delete this export from the database?",
                error_type=DirectIngestErrorType.CLEANUP_ERROR)

        return IngestArgs(ingest_time=ingest_time)
Example #22
def gcsfs_direct_ingest_directory_path_for_region(
        region_code: str, system_level: SystemLevel) -> str:
    project_id = metadata.project_id()
    if not project_id:
        raise ValueError("Project id not set")

    if system_level == SystemLevel.COUNTY:
        bucket = f'{project_id}-direct-ingest-county'
        return os.path.join(bucket, region_code)
    if system_level == SystemLevel.STATE:
        normalized_region_code = region_code.replace('_', '-')
        return f'{project_id}-direct-ingest-state-{normalized_region_code}'

    raise DirectIngestError(
        msg=f"Cannot determine ingest directory path for region: "
        f"[{region_code}]",
        error_type=DirectIngestErrorType.INPUT_ERROR)
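For illustration, the paths this helper would produce under a hypothetical project id (the region codes are only examples):

# Assuming metadata.project_id() returns 'my-project' (hypothetical):
gcsfs_direct_ingest_directory_path_for_region('us_nd', SystemLevel.STATE)
# -> 'my-project-direct-ingest-state-us-nd'
gcsfs_direct_ingest_directory_path_for_region('us_tx_brazos',
                                              SystemLevel.COUNTY)
# -> 'my-project-direct-ingest-county/us_tx_brazos'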
Example #23
    def _parse(self, args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
        file_tag = self.file_tag(args.file_path)
        gating_context = IngestGatingContext(
            file_tag=file_tag, ingest_instance=self.ingest_instance)

        if file_tag not in self.get_file_tag_rank_list():
            raise DirectIngestError(
                msg=f"No mapping found for tag [{file_tag}]",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        file_mapping = self._yaml_filepath(file_tag)

        row_pre_processors = self._get_row_pre_processors_for_file(file_tag)
        row_post_processors = self._get_row_post_processors_for_file(file_tag)
        file_post_processors = self._get_file_post_processors_for_file(
            file_tag)
        # pylint: disable=assignment-from-none
        primary_key_override_callback = self._get_primary_key_override_for_file(
            file_tag)
        # pylint: disable=assignment-from-none
        ancestor_chain_overrides_callback = (
            self._get_ancestor_chain_overrides_callback_for_file(file_tag))
        should_set_with_empty_values = (
            gating_context.file_tag
            in self._get_files_to_set_with_empty_values())

        data_extractor = CsvDataExtractor(
            file_mapping,
            gating_context,
            row_pre_processors,
            row_post_processors,
            file_post_processors,
            ancestor_chain_overrides_callback,
            primary_key_override_callback,
            self.system_level,
            should_set_with_empty_values,
        )

        return data_extractor.extract_and_populate_data(
            contents_handle.get_contents_iterator())
Example #24
def filename_parts_from_path(file_path: str) -> GcsfsFilenameParts:
    _, filename = os.path.split(file_path)
    match = re.match(_FILEPATH_REGEX, filename)
    if not match:
        raise DirectIngestError(
            msg=f"Could not parse upload_ts, file_tag, extension "
            f"from path [{file_path}]",
            error_type=DirectIngestErrorType.INPUT_ERROR)

    full_upload_timestamp_str = match.group(1)
    utc_upload_datetime = \
        datetime.datetime.fromisoformat(full_upload_timestamp_str)

    return GcsfsFilenameParts(
        utc_upload_datetime=utc_upload_datetime,
        date_str=utc_upload_datetime.date().isoformat(),
        file_tag=match.group(2),
        filename_suffix=match.group(4),
        extension=match.group(5),
    )
Example #25
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest"""
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        with monitoring.push_region_tag(region_code):
            region = region_for_region_code(region_code=region_code)
            if not region.is_ingest_launched_in_env():
                continue
            try:
                controller = controller_for_region_code(region_code,
                                                        allow_unlaunched=False)
            except DirectIngestError as e:
                raise e
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            controller.kick_scheduler(just_finished_job=False)
Example #26
    def get_admission_reason(self, person_dict: Dict) -> Optional[str]:
        """Returns the appropriate admission reason from the booking admission
        document's types."""
        admission_reasons = {
            admission['booking_admission_doc_type']
            for admission in person_dict['admission']
            if admission['booking_admission_doc_type']
        }
        if len(admission_reasons) == 1:
            return admission_reasons.pop()

        admission_reason_hierarchy = [
            'SENTENCE MITTIMUS',
            '15-DAY PAROLE DETAINER',
            'PERMANENT PAROLE DETAINER',
            'BAIL MITTIMUS',
            'MITTIMUS FOR FINES',
            'CRIMINAL COMPLAINT',
            'CIVIL CAPIAS',
            'CONTEMPT OF COURT',
            'GOVERNORS WARRANT',
            'WARRANT MANAGEMENT SYSTEM',
            'FEDERAL DETAINER',
        ]

        for reason in admission_reasons:
            if reason not in admission_reason_hierarchy:
                raise DirectIngestError(
                    error_type=DirectIngestErrorType.PARSE_ERROR,
                    msg=f"Unknown admission document type seen for person with "
                    f"sysid [{person_dict['booking']['sysid']}]: [{reason}]")

        for reason in admission_reason_hierarchy:
            if reason in admission_reasons:
                return reason

        return None
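A small sketch of how the hierarchy resolves a booking with multiple admission documents; the dict below is a made-up fragment showing only the fields this method reads:

# Hypothetical person_dict fragment (sysid and doc types are invented data).
person_dict = {
    'booking': {'sysid': '0000001'},
    'admission': [
        {'booking_admission_doc_type': 'BAIL MITTIMUS'},
        {'booking_admission_doc_type': 'CRIMINAL COMPLAINT'},
        {'booking_admission_doc_type': None},
    ],
}
# Two distinct known reasons remain after filtering the None entry, so the
# hierarchy decides: get_admission_reason(person_dict) -> 'BAIL MITTIMUS'.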
Example #27
    def _parse(self, args: GcsfsIngestArgs,
               contents: Iterable[str]) -> IngestInfo:
        file_tag = self.file_tag(args.file_path)

        if file_tag not in self._get_file_tag_rank_list():
            raise DirectIngestError(
                msg=f"No mapping found for tag [{file_tag}]",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        file_mapping = self._yaml_filepath(file_tag)

        row_pre_processors = self._get_row_pre_processors_for_file(file_tag)
        row_post_processors = self._get_row_post_processors_for_file(file_tag)
        file_post_processors = self._get_file_post_processors_for_file(
            file_tag)
        # pylint: disable=assignment-from-none
        primary_key_override = self._get_primary_key_override_for_file(
            file_tag)
        # pylint: disable=assignment-from-none
        ancestor_key_override = \
            self._get_ancestor_key_override_for_file(file_tag)
        should_set_with_empty_values = \
            file_tag in self._get_files_to_set_with_empty_values()
        should_cache = self._get_should_cache()

        data_extractor = self.csv_data_extractor_cls(
            file_mapping,
            row_pre_processors,
            row_post_processors,
            file_post_processors,
            ancestor_key_override,
            primary_key_override,
            self.system_level,
            should_set_with_empty_values,
            should_cache=should_cache)

        return data_extractor.extract_and_populate_data(contents)
Example #28
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg=
                "raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != data_import_args.raw_data_file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=data_import_args.raw_data_file_path.
                    bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
Example #29
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value("output_bucket",
                                             request.values,
                                             preserve_case=True)

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                GcsfsBucketPath(output_bucket_name)).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg=
                "ingest_view_export was called with no GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=
                f"ingest_view_export was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg=
                f"Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example #30
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK