def fork_and_store(self):
        """Fork and then store input collection

        Need to reopen mongo connection after fork
        """
        self.logger.debug("Process id before forking: {}".format(os.getpid()))

        child_pid_list = []

        # submit a new process
        try:
            pid = os.fork()
        except OSError as exc:
            raise OSError("Could not create a child process.") from exc

        if pid == 0:
            self.logger.debug("In child process with PID {}".format(
                os.getpid()))

            # Need to open separate mongo connection after each fork
            settings = process_manager.service(ConfigObject)
            mongo_connection = MongoConnection()
            mongo_connection.set_config_info(settings)
            mdb = mongo_connection.database

            ds = process_manager.service(DataStore)
            docs = ds[self.read_key]

            # store docs
            etl_utils.dostorage(mdb, docs, self.store_collections,
                                self.clearFirst, self.logger, self.read_key)

            # close connection
            mongo_connection.close()

            # use os._exit so the child exits immediately without running parent cleanup (safe under Jupyter)
            os._exit(os.EX_OK)
        else:
            self.logger.debug(
                "Back in parent process after forking child {}".format(pid))
            child_pid_list.append(pid)

        # optionally wait for the forked child to finish; otherwise continue immediately.
        if self.wait_after_fork:
            # check that fork is finished
            while child_pid_list:
                self.logger.debug("Waiting for child process to finish.")
                finished = os.waitpid(0, 0)
                if finished[0] in child_pid_list:
                    self.logger.debug(
                        "Finished child process {} with status {}".format(
                            finished[0], finished[1]))
                    child_pid_list.remove(finished[0])

        self.logger.debug('Finished fork.')
    def execute(self):
        """Execute the link.

        :returns: status code of execution
        :rtype: StatusCode
        """
        settings = process_manager.service(ConfigObject)
        ds = process_manager.service(DataStore)

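        # fetch the input object; assert_type/assert_len enforce a non-empty dict or list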
        doc = ds.get(self.read_key, assert_type=(dict, list), assert_len=True)

        self.logger.debug('Started doing storage')
        etl_utils.dostorage(self.mdb, doc, self.store_collections,
                            self.clear_first, self.logger, self.read_key)

        return StatusCode.Success
    def execute(self):
        """ Execute MongoDataToCollection """

        ds = process_manager.service(DataStore)
        docs = ds.get(self.read_key, None)
        if docs is None:
            self.logger.warning('Object with key {} is empty; skipping.'.format(
                self.read_key))
            return StatusCode.Success

        if self.minimal_input_size > 0:
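            # a truthy force-move key in the datastore overrides the minimum-size requirement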
            invoke = any(ds.get(f, False) for f in self.force_move_keys)
            if len(docs) < self.minimal_input_size and not invoke:
                self.logger.debug(
                    'Input collection <%s> has fewer than %d records. Not stored.'
                    % (self.read_key, self.minimal_input_size))
                return StatusCode.Success
            else:
                self.logger.debug(
                    'Storing input collection <%s> with size %d' %
                    (self.read_key, len(docs)))

        if len(docs) == 0:
            self.logger.info(
                'Input collection <%s> has zero length. Nothing to store.' %
                self.read_key)
            return StatusCode.Success

        if not self.fork:
            # default
            etl_utils.dostorage(self.mdb, docs, self.store_collections,
                                self.clearFirst, self.logger, self.read_key)
        else:
            self.fork_and_store()

        if self.clear_input:
            del ds[self.read_key]

        return StatusCode.Success
    def execute(self):
        """ Execute MongoDFToCollection """

        ds = process_manager.service(DataStore)

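        # nestedFields can be set directly as a dict, or as a datastore key pointing to one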
        if isinstance(self.nestedFields, dict):
            # assume set correctly
            pass
        elif isinstance(self.nestedFields, str) and self.nestedFields:
            assert self.nestedFields in ds, 'Key %s not in DataStore.' % self.nestedFields
            self.nestedFields = ds[self.nestedFields]

        df_orig = ds[self.read_key]

        if len(df_orig) == 0:
            self.logger.warning(
                'Source collection "{key}" has zero length. Nothing to store in Mongo.',
                key=self.read_key)
        else:
            # MB: May have to do some index magic below for nested fields (pandas v0.18 and greater)
            # We don't want to modify the user's DataFrame here, so we make a shallow copy
            if self.copy:
                df = df_orig.copy(deep=False)
            else:
                df = df_orig

            if self.columns:
                df = df[self.columns]

            if self.checkFieldsAPI:
                df = etl_utils.check_fields_api(df)

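            # restructure flat columns into nested sub-documents before converting to records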
            if isinstance(self.nestedFields, dict):
                flattened_nested_columns = etl_utils.flatten_columns(
                    self.nestedFields)
                for c in flattened_nested_columns:
                    if c not in df.columns:
                        raise Exception(
                            'Field %s from nestedFields not present in dataframe %s'
                            % (c, self.read_key))
                other_columns = [
                    c for c in df.columns if c not in flattened_nested_columns
                ]

                # MB: utils.create_nested_df() screws up the original index in pandas v0.18
                # index_name is added to nested_df, so it can be merged later by original index
                index_name = '__index__' if df.index.name is None else '__index__' + df.index.name
                df[index_name] = df.index
                nestedFields = deepcopy(self.nestedFields)
                nestedFields[index_name] = [index_name]
                nested_df = etl_utils.create_nested_df(nestedFields, df)
                nested_df[index_name] = nested_df[index_name].apply(
                    etl_utils.orig_index, key=index_name)
                nested_df.index = nested_df[index_name]
                nested_df.drop(index_name, axis=1, inplace=True)
                # ... and we have the original index again, so merging with other_columns works

                nested_df = pd.concat([nested_df, df[other_columns]], axis=1)
                if self.columnsToAdd is not None:
                    for k, v in self.columnsToAdd.items():
                        nested_df[k] = v
                self.logger.debug('Getting values from nested dataframe.')
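                # transpose so to_dict() yields one {column: value} record per row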
                docs = list(nested_df.T.to_dict().values())
            else:
                if self.columnsToAdd is not None:
                    for k, v in self.columnsToAdd.items():
                        df[k] = v
                self.logger.debug('Getting values from dataframe.')
                docs = list(df.T.to_dict().values())

            self.logger.debug('Started doing storage')
            etl_utils.dostorage(self.mdb, docs, self.store_collections,
                                self.clearFirst, self.logger, self.read_key)
        return StatusCode.Success