def log_files_directory(self):
    """Return the log files directory path, creating it on disk if needed.

    When ``self.root_directory`` is set, the directory lives beneath it;
    otherwise the module-level ``LOG_FILES_DIRECTORY`` is used as-is.
    """
    root = self.root_directory
    path = LOG_FILES_DIRECTORY if root is None else root + '/' + LOG_FILES_DIRECTORY
    file_utils.make_directory(path)
    return path
def other_files_directory(self):
    """Return the 'other files' directory path, creating it on disk if needed.

    Prefixed with ``self.root_directory`` when one is configured.
    """
    root = self.root_directory
    path = OTHER_FILES_DIRECTORY if root is None else root + '/' + OTHER_FILES_DIRECTORY
    file_utils.make_directory(path)
    return path
def source_files_directory(self):
    """Return the source files directory path, creating it on disk if needed.

    Prefixed with ``self.root_directory`` when one is configured.
    """
    root = self.root_directory
    path = SOURCE_FILES_DIRECTORY if root is None else root + '/' + SOURCE_FILES_DIRECTORY
    file_utils.make_directory(path)
    return path
def generated_files_directory(self):
    """Return the generated files directory path, creating it on disk if needed.

    Prefixed with ``self.root_directory`` when one is configured.
    """
    root = self.root_directory
    path = GENERATED_FILES_DIRECTORY if root is None else root + '/' + GENERATED_FILES_DIRECTORY
    file_utils.make_directory(path)
    return path
def bulk_update_response_directory(self, data_source_batch_name, data_source_name=None):
    """Return (and create) the bulk-update-response subdirectory for a batch.

    Lives under the batch directory resolved by
    ``data_source_batch_directory``.
    """
    batch_dir = self.data_source_batch_directory(
        data_source_batch_name, data_source_name)
    path = batch_dir + '/' + BULK_UPDATE_RESPONSE_DIRECTORY
    file_utils.make_directory(path)
    return path
def loaded_docs_directory(self, data_source_batch_name, data_source_name=None):
    """Return (and create) the loaded-docs subdirectory for a batch.

    Lives under the batch directory resolved by
    ``data_source_batch_directory``.
    """
    batch_dir = self.data_source_batch_directory(
        data_source_batch_name, data_source_name)
    path = batch_dir + '/' + LOADED_DOCS_DIRECTORY
    file_utils.make_directory(path)
    return path
def failed_docs_directory(self, data_source_batch_name, data_source_name=None):
    """Return (and create) the failed-docs subdirectory for a batch.

    Lives under the batch directory resolved by
    ``data_source_batch_directory``.
    """
    batch_dir = self.data_source_batch_directory(
        data_source_batch_name, data_source_name)
    path = batch_dir + '/' + FAILED_DOCS_DIRECTORY
    file_utils.make_directory(path)
    return path
def data_source_directory(self, data_source_name=None):
    """Return (and create) the directory for a data source.

    The explicit ``data_source_name`` argument wins over the instance's
    ``self.data_source_name``; when both are ``None`` the bare
    generated-files directory is returned.
    """
    path = self.generated_files_directory()
    # Argument takes precedence over the configured instance attribute.
    name = data_source_name if data_source_name is not None else self.data_source_name
    if name is not None:
        path = path + '/' + name
    file_utils.make_directory(path)
    return path
def data_source_batch_directory(self, data_source_batch_name, data_source_name=None):
    """Return (and create) the directory for one batch of a data source.

    When ``data_source_batch_name`` is ``None`` the plain data-source
    directory is returned.
    """
    path = self.data_source_directory(data_source_name)
    if data_source_batch_name is not None:
        path = path + '/' + data_source_batch_name
    file_utils.make_directory(path)
    return path
def process_relations_rows(self, data_rows, data_source_batch_name, source_index_id):
    """Load relationship rows for one source index in batches.

    Skips ids already recorded as loaded (in retry / normal-load modes),
    archives the previous batch directory when not in normal-load mode,
    then dispatches the remaining ids to worker load processes in chunks
    of ``self.load_config.data_loader_batch_size`` and waits for them.

    :param data_rows: dict mapping _id -> row data; consumed (popped) as
        batches are dispatched.
    :param data_source_batch_name: name of the batch being processed.
    :param source_index_id: id of the source index; used as a subdirectory.
    """
    data_source_directory = self.load_config.data_source_directory()
    data_source_batch_directory = self.load_config.data_source_batch_directory(
        data_source_batch_name)
    data_source_batch_directory_for_source = data_source_batch_directory + '/' + source_index_id
    file_utils.make_directory(data_source_batch_directory_for_source)

    # Only consult the loaded-ids ledger in modes that resume prior work.
    if self.mode in (DataProcessor.MODE_RETRY_FAILED_DOCS, DataProcessor.MODE_NORMAL_LOAD):
        loaded_ids = self.get_loaded_ids(
            data_source_batch_directory_for_source)
    else:
        loaded_ids = {}

    # Filter out ids that were already loaded.
    filtered_ids = [_id for _id in data_rows if _id not in loaded_ids]

    self.load_config.log(LOG_LEVEL_INFO, 'source index', source_index_id)
    self.load_config.log(LOG_LEVEL_INFO, 'loaded ids', len(loaded_ids))
    self.load_config.log(LOG_LEVEL_INFO, 'ids to load', len(filtered_ids))

    # BUG FIX: was `is not`, an identity comparison; the mode checks above
    # use equality, so use `!=` here for a consistent, reliable comparison.
    if self.mode != DataProcessor.MODE_NORMAL_LOAD:
        # Archive the existing batch directory under a millisecond-stamped
        # 'old_' name so a fresh run starts from an empty directory.
        batch_id = str(int(round(time.time() * 1000)))
        old_data_source_batch_directory = data_source_directory + '/' + \
            'old_' + data_source_batch_name + '_' + batch_id
        os.rename(data_source_batch_directory,
                  old_data_source_batch_directory)

    batch = {}
    count = 0
    for _id in filtered_ids:
        # Pop so data_rows shrinks as batches are handed off (frees memory).
        batch[_id] = data_rows.pop(_id, None)
        count += 1
        if count % self.load_config.data_loader_batch_size == 0:
            self.start_relationship_load_process(
                batch, data_source_batch_name, source_index_id)
            batch = {}

    # Dispatch the final partial batch, then wait for all workers.
    if len(batch) > 0:
        self.start_relationship_load_process(
            batch, data_source_batch_name, source_index_id)
    self.join_processes()
def rename_failed_ids_directory(self):
    """Archive the current failed-docs directory and recreate it empty.

    The existing directory is renamed in place to
    ``old_<millisecond-timestamp>_<original-name>`` within the same parent,
    then a fresh empty directory is created at the original path.
    """
    failed_dir = self.load_config.failed_docs_directory(
        self.data_source_batch_name)
    parent_dir = os.path.dirname(failed_dir)
    dir_name = os.path.basename(failed_dir)

    # Millisecond timestamp keeps successive archives unique.
    stamp = str(int(round(time.time() * 1000)))
    archived_dir = parent_dir + '/' + 'old_' + stamp + '_' + dir_name

    os.rename(failed_dir, archived_dir)
    file_utils.make_directory(failed_dir)