def fork_and_store(self):
    """Fork and then store the input collection.

    The mongo connection needs to be reopened after the fork.
    """
    self.logger.debug("Process id before forking: {}".format(os.getpid()))

    child_pid_list = []

    # submit a new process
    try:
        pid = os.fork()
    except OSError as exc:
        raise OSError("Could not create a child process.") from exc

    if pid == 0:
        self.logger.debug("In child process with PID {}".format(os.getpid()))

        # need to open a separate mongo connection after each fork
        settings = process_manager.service(ConfigObject)
        mongo_connection = MongoConnection()
        mongo_connection.set_config_info(settings)
        mdb = mongo_connection.database

        ds = process_manager.service(DataStore)
        docs = ds[self.read_key]

        # store docs
        etl_utils.dostorage(mdb, docs, self.store_collections, self.clearFirst,
                            self.logger, self.read_key)

        # close connection
        mongo_connection.close()

        # safe jupyter exit when forking
        os._exit(os.EX_OK)
    else:
        self.logger.debug("Back in parent process after forking child {}".format(pid))
        child_pid_list.append(pid)

    # either wait for the fork to finish, or simply continue
    if self.wait_after_fork:
        # check that the fork is finished
        while child_pid_list:
            self.logger.debug("Waiting for child process to finish.")
            child_pid, status = os.waitpid(0, 0)
            if child_pid in child_pid_list:
                self.logger.debug("Finished child process {} with status {}".format(
                    child_pid, status))
                child_pid_list.remove(child_pid)

    self.logger.debug('Finished fork.')
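# Illustrative sketch (not part of the link above): PyMongo clients are not
# fork-safe, which is why fork_and_store() opens a fresh connection in the
# child. The bare-bones pattern, with a plain pymongo.MongoClient standing in
# for MongoConnection, and hypothetical database/collection names:

import os
import pymongo


def fork_and_write(docs, collection_name):
    """Fork, write docs to Mongo in the child, and wait for it in the parent."""
    pid = os.fork()
    if pid == 0:
        # Child: never reuse the parent's client after a fork; create a new one.
        client = pymongo.MongoClient('mongodb://localhost:27017')
        client['mydb'][collection_name].insert_many(docs)
        client.close()
        os._exit(os.EX_OK)
    # Parent: block until the child is done (mirrors wait_after_fork=True).
    os.waitpid(pid, 0)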
def execute(self):
    """Execute the link.

    :returns: status code of execution
    :rtype: StatusCode
    """
    ds = process_manager.service(DataStore)

    doc = ds.get(self.read_key, assert_type=(dict, list), assert_len=True)

    self.logger.debug('Started doing storage')
    etl_utils.dostorage(self.mdb, doc, self.store_collections, self.clear_first,
                        self.logger, self.read_key)

    return StatusCode.Success
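# Illustrative sketch: the read above leans on DataStore.get() to validate the
# payload before storage. Assuming get() checks type and non-zero length (an
# assumption about its semantics), a plain-Python equivalent with a
# hypothetical helper name would be:

def checked_get(store, key, assert_type, assert_len=False):
    """Fetch store[key] and fail fast on a wrong type or an empty payload."""
    value = store[key]  # raises KeyError when the key is missing
    if not isinstance(value, assert_type):
        raise TypeError('{} is not of type {}'.format(key, assert_type))
    if assert_len and len(value) == 0:
        raise AssertionError('{} has zero length'.format(key))
    return value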
def execute(self):
    """Execute MongoDataToCollection."""
    ds = process_manager.service(DataStore)

    docs = ds.get(self.read_key, None)
    if docs is None:
        self.logger.warning('Object with key {} is empty. Skipping.'.format(self.read_key))
        return StatusCode.Success

    if self.minimal_input_size > 0:
        # any force_move_key set in the datastore overrides the size check
        invoke = any(ds.get(f, False) for f in self.force_move_keys)
        if len(docs) < self.minimal_input_size and not invoke:
            self.logger.debug('Input collection <%s> has fewer than %d records. Not stored.' %
                              (self.read_key, self.minimal_input_size))
            return StatusCode.Success
        else:
            self.logger.debug('Storing input collection <%s> with size %d' %
                              (self.read_key, len(docs)))

    if len(docs) == 0:
        self.logger.info('Input collection <%s> has zero length. Nothing to store.' %
                         self.read_key)
        return StatusCode.Success

    if not self.fork:
        # default
        etl_utils.dostorage(self.mdb, docs, self.store_collections, self.clearFirst,
                            self.logger, self.read_key)
    else:
        self.fork_and_store()

    if self.clear_input:
        del ds[self.read_key]

    return StatusCode.Success
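# Illustrative sketch: the size gate above boils down to "store once the batch
# reaches minimal_input_size, unless a force flag in the datastore overrides
# it". As a pure function (hypothetical helper, not part of the link):

def should_store(n_docs, minimal_input_size, force_flags):
    """Return True when the batch is large enough or any force flag is set."""
    if minimal_input_size <= 0:
        return True
    return n_docs >= minimal_input_size or any(force_flags)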
def execute(self):
    """Execute MongoDFToCollection."""
    ds = process_manager.service(DataStore)

    if isinstance(self.nestedFields, dict):
        # assume it is set correctly
        pass
    elif isinstance(self.nestedFields, str) and self.nestedFields:
        assert self.nestedFields in ds, 'Key %s not in DataStore.' % self.nestedFields
        self.nestedFields = ds[self.nestedFields]

    df_orig = ds[self.read_key]
    if len(df_orig) == 0:
        self.logger.warning('Source collection "{key}" has zero length. Nothing to store in Mongo.',
                            key=self.read_key)
    else:
        # MB: may have to do some index magic below for nested fields (pandas v0.18 and greater).
        # We don't want to modify the user's dataframe here, so we make a shallow copy.
        df = df_orig.copy(deep=False) if self.copy else df_orig

        if self.columns:
            df = df[self.columns]
        if self.checkFieldsAPI:
            df = etl_utils.check_fields_api(df)

        if isinstance(self.nestedFields, dict):
            flattened_nested_columns = etl_utils.flatten_columns(self.nestedFields)
            for c in flattened_nested_columns:
                if c not in df.columns:
                    raise Exception('Field %s from nestedFields not present in dataframe %s' %
                                    (c, self.read_key))
            other_columns = [c for c in df.columns if c not in flattened_nested_columns]

            # MB: utils.create_nested_df() screws up the original index in pandas v0.18.
            # index_name is added to nested_df, so it can be merged later by original index.
            index_name = '__index__' if df.index.name is None else '__index__' + df.index.name
            df[index_name] = df.index
            nestedFields = deepcopy(self.nestedFields)
            nestedFields[index_name] = [index_name]

            nested_df = etl_utils.create_nested_df(nestedFields, df)
            nested_df[index_name] = nested_df[index_name].apply(etl_utils.orig_index,
                                                                key=index_name)
            nested_df.index = nested_df[index_name]
            nested_df.drop(index_name, axis=1, inplace=True)
            # ... and we have the original index again, so merging with other_columns works
            nested_df = pd.concat([nested_df, df[other_columns]], axis=1)

            if self.columnsToAdd is not None:
                for k, v in self.columnsToAdd.items():
                    nested_df[k] = v

            self.logger.debug('Getting values from nested dataframe.')
            docs = list(nested_df.T.to_dict().values())
        else:
            if self.columnsToAdd is not None:
                for k, v in self.columnsToAdd.items():
                    df[k] = v

            self.logger.debug('Getting values from dataframe.')
            docs = list(df.T.to_dict().values())

        self.logger.debug('Started doing storage')
        etl_utils.dostorage(self.mdb, docs, self.store_collections, self.clearFirst,
                            self.logger, self.read_key)

    return StatusCode.Success
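# Illustrative sketch: list(df.T.to_dict().values()) above converts each
# dataframe row into a Mongo-style document (one dict per row). A standalone
# check of that conversion, which also matches pandas' records orientation:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
docs = list(df.T.to_dict().values())
# -> [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
assert docs == df.to_dict(orient='records')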