def create_load_config():
    """Build a LoadConfig wired to the IRDB data mapper and extractor.

    Root directory, server, index and type are taken from ROOT_DIRECTORY,
    LOCAL_SERVER, INDEX and TYPE. The process count is whatever
    ``psutil.cpu_count()`` reports, and ``max_memory_percent`` is set to 75.

    Returns:
        The populated LoadConfig instance.
    """
    config = LoadConfig()

    config.root_directory = ROOT_DIRECTORY
    # NOTE: data_source_name is not assigned here; the earlier assignments
    # ('extended_relations' / file-name based) are disabled.
    config.process_count = psutil.cpu_count()

    # Target server and index.
    config.server = LOCAL_SERVER
    config.index, config.type = INDEX, TYPE

    # IRDB-specific transformation components.
    config.data_mapper = IRDBDataMapper()
    config.data_extractor = IRDBDataExtractor()

    config.max_memory_percent = 75
    return config
def get_load_config():
    """Derive a PubMed LoadConfig from the IRDB load configuration.

    Root directory, server and process count are copied from the config
    returned by ``irdb_load_config_getter.get_load_config()``. The index and
    type come from the ``ID_PUBMED`` entry of ``INDEX_MAPPING``, and the
    PubMed extractor and mapper are attached.

    Returns:
        The populated LoadConfig instance.
    """
    source_config = irdb_load_config_getter.get_load_config()

    config = LoadConfig()
    config.root_directory = source_config.root_directory
    config.server = source_config.server

    # Index name and document type are looked up from the PubMed mapping entry.
    pubmed_entry = INDEX_MAPPING[ID_PUBMED]
    config.index = pubmed_entry['index']
    config.type = pubmed_entry['type']

    config.data_extractor = PubmedDataExtractor()
    config.data_mapper = PubmedDataMapper()

    # NOTE: data_source_name and log_level are not assigned here; their
    # former assignments are disabled.
    config.process_count = source_config.process_count
    return config
def get_pubmed_load_config(self):
    """Build a LoadConfig for PubMed based on ``self.ct_load_config``.

    The index name and type are read from the ``'index'`` and
    ``'index_type'`` keys of ``es_utils.get_info_for_index_id(ID_PUBMED)``.
    Root directory and server are copied from ``self.ct_load_config``.

    Returns:
        The populated LoadConfig instance.
    """
    index_info = es_utils.get_info_for_index_id(ID_PUBMED)
    index_name, index_type = index_info['index'], index_info['index_type']

    config = LoadConfig()

    # Reuse the location and server of the existing config.
    parent_config = self.ct_load_config
    config.root_directory = parent_config.root_directory
    config.server = self.ct_load_config.server

    config.index = index_name
    config.type = index_type

    config.data_extractor = PubmedDataExtractor()
    config.data_mapper = PubmedDataMapper()
    return config
def get_load_config():
    """Build a LoadConfig wired to the CT data extractor and mapper.

    Every setting except ``max_memory_percent`` (fixed at 80) is taken from
    an upper-case name referenced here: ROOT_DIRECTORY, SERVER, INDEX, TYPE,
    PROCESS_COUNT and the four batch/bulk size settings.

    Returns:
        The populated LoadConfig instance.
    """
    config = LoadConfig()

    # Location and target index.
    config.root_directory = ROOT_DIRECTORY
    config.server = SERVER
    config.index, config.type = INDEX, TYPE

    # Parallelism and batching.
    config.process_count = PROCESS_COUNT
    config.bulk_data_size = BULK_DATA_SIZE
    config.data_loader_batch_size = DATA_LOADER_BATCH_SIZE
    config.data_source_batch_size = DATA_SOURCE_BATCH_SIZE
    config.doc_fetch_batch_size = DOC_FETCH_BATCH_SIZE

    # NOTE: log_level and data_source_name are not assigned here; their
    # former assignments are disabled.
    config.data_extractor = CTDataExtractor()
    config.data_mapper = CTDataMapper()

    config.max_memory_percent = 80
    return config
added_citations = update_history_item['added_citations'] citations_set = set(citations) added_citations_set = set(added_citations) citations = list(citations_set | added_citations_set) if 'removed_citations' in update_history_item: removed_citations = update_history_item[ 'removed_citations'] citations = list(set(citations) - set(removed_citations)) return citations load_config = LoadConfig() load_config.root_directory = '/data/data_loading/pubmed_2019/pubmed2019/fix_citations' # load_config.process_count = psutil.cpu_count() load_config.server = 'http://localhost:9200' load_config.server_username = '' load_config.server_password = '' load_config.index = "pubmed2019" load_config.type = "article" load_config.data_mapper = PubmedDataMapper() load_config.data_extractor = PubmedDataExtractor() load_config.max_memory_percent = 75 load_config.source = "" load_config.append_relations = False
def get_load_config(self):
    """Assemble a LoadConfig from this instance's settings.

    Root directory, server, credentials, index and type are copied from the
    attributes of the same name on ``self``. The process count is whatever
    ``psutil.cpu_count()`` reports. The mapper, extractor and memory limit
    come from ``get_data_mapper()``, ``get_data_extractor()`` and
    ``get_max_memory_percent()``.

    Returns:
        The populated LoadConfig instance.
    """
    config = LoadConfig()
    config.root_directory = self.root_directory
    config.process_count = psutil.cpu_count()

    # These attributes are mirrored one-to-one from the instance.
    mirrored_attrs = (
        'server',
        'server_username',
        'server_password',
        'index',
        'type',
    )
    for attr_name in mirrored_attrs:
        setattr(config, attr_name, getattr(self, attr_name))

    config.data_mapper = self.get_data_mapper()
    config.data_extractor = self.get_data_extractor()
    config.max_memory_percent = self.get_max_memory_percent()
    return config
"match": { "citations.index_id": ID_PUBMED } } ] } } ids = self.data_utils.batch_fetch_ids_for_query(base_url=self.load_config.server, query=query, index=self.load_config.index, type=self.load_config.type) return ids load_config = LoadConfig() load_config.root_directory = DIR # load_config.process_count = psutil.cpu_count() load_config.server = 'http://localhost:9200' load_config.server_username = '' load_config.server_password = '' load_config.index = "pubmed2019" load_config.type = "article" load_config.data_mapper = PubmedDataMapper() load_config.data_extractor = PubmedDataExtractor() load_config.max_memory_percent = 75 load_config.process_count = 4 load_config.process_spawn_delay = 1