import json
import time

# Project-local helpers. The module paths below are assumptions inferred from
# the names used in this file; adjust them to match the actual package layout.
from data_loader_utils import DataLoaderUtils
from data_utils import DataUtils
from batch_doc_processor import BatchDocProcessor
import file_utils
# Relationship field names are likewise assumed to live in a project
# constants module (ID_PUBMED, referenced in the usage comments below,
# would come from the same place).
from constants import (RELATIONSHIP_TYPE_CITATIONS,
                       RELATIONSHIP_TYPE_CITED_BYS,
                       RELATIONSHIP_TYPE_RELATIONS)


class CopyDocs(object):
    """Copies documents from a source Elasticsearch index to a destination
    index, streaming ids in batches and re-indexing via the bulk API."""

    def __init__(self, src_server, dest_server, src_index, src_type,
                 dst_index, dst_type):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)

    def docs_fetched(self, docs, index, type):
        # Collect the _source of every fetched doc, keyed by _id
        docs_to_copy = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                docs_to_copy[_id] = doc['_source']

        self.index_docs(docs_to_copy)

        # Update and report progress
        self.processed_doc_count += len(docs)
        progress = (self.processed_doc_count / float(self.total_doc_count)) * 100
        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%'
        print '---------------------------------------------------------------------------------------------'

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched)

    def ids_fetched(self, ids, index, type):
        self.copy_docs_batch(ids)

    def create_destination_index(self, mapping=None):
        if mapping is None:
            # Fall back to the mapping of the source index
            mapping = self.src_data_loader_utils.get_mapping_from_server()

        if not self.dest_data_loader_utils.index_exists():
            print 'Creating index'
            self.dest_data_loader_utils.put_mapping(mapping)
            # migrate_index(self.dest_data_loader_utils.index)
        else:
            print self.dest_data_loader_utils.index, 'exists'

    def copy_docs(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping=None)

        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

    def copy_docs_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping)

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch,
                                                3000, 16, 0.33)
        batch_doc_processor.run()

    def copy_docs_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched)

    def index_docs(self, docs_to_copy):
        # Build a bulk payload (action line + source line per doc) and flush
        # it to the destination whenever it grows past ~150 KB
        bulk_data = ''
        count = 0
        for es_id in docs_to_copy:
            count += 1
            doc = docs_to_copy[es_id]

            bulk_data += self.dest_data_loader_utils.bulk_index_header(es_id)
            bulk_data += '\n'
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            if len(bulk_data) >= 150000:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if not response:
            print 'Bulk data load failed'


# Example usage:
# src_server = 'http://localhost:9200'
# src_index = 'irdb_v3'
# src_type = 'grant'

# dest_server = 'http://localhost:9200'
# dest_index = 'irdb_v4'
# dest_type = 'grant'

# copy_docs = CopyDocs(src_server=src_server,
#                      dest_server=dest_server,
#                      src_index=src_index,
#                      src_type=src_type,
#                      dst_index=dest_index,
#                      dst_type=dest_type)
# copy_docs.copy_docs()
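# For reference, a minimal sketch of the NDJSON payload index_docs() builds,
# assuming bulk_index_header() emits a standard Elasticsearch bulk "index"
# action line. The index/type/doc values below are illustrative only.
def _demo_bulk_payload():
    docs = {'G1': {'title': 'Grant one'}, 'G2': {'title': 'Grant two'}}
    bulk_data = ''
    for es_id in docs:
        # One action line, then one source line, each newline-terminated
        bulk_data += json.dumps({'index': {'_index': 'irdb_v4',
                                           '_type': 'grant',
                                           '_id': es_id}})
        bulk_data += '\n'
        bulk_data += json.dumps(docs[es_id])
        bulk_data += '\n'
    return bulk_data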
class CopyRelationships(object):
    """Merges citation / cited-by / relation links from documents in a source
    index into the matching documents of a destination index."""

    def __init__(self, src_server, dest_server, src_index, src_type,
                 dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

        self.relations_to_exclude = []
        self.missing_destination_ids = []

        self.username = username
        self.password = password

        self.last_time_stamp = 0
        self.diff_average = 0

    def run(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        self.last_time_stamp = time.time()
        print 'Total doc count', self.total_doc_count

        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

        print 'Saving missing docs'
        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def run_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        self.last_time_stamp = time.time()
        print 'Total doc count', self.total_doc_count

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch,
                                                1000, 1, 0)
        batch_doc_processor.run()

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched,
                                                  batch_size=10000)

    def ids_fetched(self, ids, index, type):
        print 'Ids fetched', len(ids)
        self.copy_docs_batch(ids)

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched,
            batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                docs_to_copy[_id] = doc['_source']

        self.copy_relations(docs_to_copy)

        # Update progress and estimate the time remaining from a running
        # average of per-batch durations
        self.processed_doc_count += len(docs)
        progress = (self.processed_doc_count / float(self.total_doc_count)) * 100

        current_time_stamp = time.time()
        diff = current_time_stamp - self.last_time_stamp
        self.diff_average = float(diff + self.diff_average) / 2
        remaining_docs = self.total_doc_count - self.processed_doc_count
        time_remaining = self.diff_average * (float(remaining_docs) / len(docs))
        self.last_time_stamp = current_time_stamp

        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%', time_remaining, 'secs'
        print '---------------------------------------------------------------------------------------------'

    def get_src_relations(self, src_doc, relationship_type):
        # Relations from the source doc, minus any listed in relations_to_exclude
        src_relations = []
        if relationship_type in src_doc:
            for relation_item in src_doc[relationship_type]:
                exclude_relation_item = False
                for relation_to_exclude in self.relations_to_exclude:
                    if (relation_to_exclude['source'] == relation_item['source']
                            and relation_to_exclude['index_id'] == relation_item['index_id']):
                        exclude_relation_item = True
                        break
                if not exclude_relation_item:
                    src_relations.append(relation_item)
        return src_relations

    def get_dest_relations(self, dest_doc, relationship_type):
        if relationship_type in dest_doc:
            return dest_doc[relationship_type]
        return []

    def add_relations(self, append_ids, relation, relations_list):
        # Merge `relation` into `relations_list`, matching on (source, index_id);
        # when append_ids is set, union the id lists of matching entries
        relation_found = False
        for existing_relation in relations_list:
            if (existing_relation['source'] == relation['source']
                    and existing_relation['index_id'] == relation['index_id']):
                if append_ids:
                    existing_relation_ids = existing_relation['ids']
                    for _id in relation['ids']:
                        if _id not in existing_relation_ids:
                            existing_relation_ids.append(_id)
                    existing_relation['ids'] = existing_relation_ids
                relation_found = True
                break

        if not relation_found:
            relations_list.append(relation)

        return relations_list

    def merge_relations(self, src_doc, dest_doc, relationship_type):
        dest_relations = self.get_dest_relations(dest_doc, relationship_type)
        src_relations = self.get_src_relations(src_doc, relationship_type)

        combined_relations = []
        for relation in dest_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)
        for relation in src_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)
        return combined_relations

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Fetch the matching destination docs
        destination_ids = src_docs.keys()
        destination_docs_array = self.data_utils.fetch_docs_for_ids(
            base_url=self.dest_data_loader_utils.server,
            ids=destination_ids,
            index=self.dest_data_loader_utils.index,
            type=self.dest_data_loader_utils.type,
            username=self.username,
            password=self.password)

        # Key the destination docs by _id
        destination_docs = {}
        for doc in destination_docs_array:
            _id = doc['_id']
            if '_source' in doc:
                destination_docs[_id] = doc['_source']

        # Record source ids that have no destination doc
        for _id in destination_ids:
            if _id not in destination_docs:
                self.missing_destination_ids.append(_id)
        print 'Missing ids', len(self.missing_destination_ids)

        # Merge relations and queue bulk partial updates
        for _id in destination_docs:
            dest_doc = destination_docs[_id]
            src_doc = src_docs[_id]

            dest_relations = {}
            dest_relations[RELATIONSHIP_TYPE_CITATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITATIONS)
            dest_relations[RELATIONSHIP_TYPE_CITED_BYS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITED_BYS)
            dest_relations[RELATIONSHIP_TYPE_RELATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_RELATIONS)

            doc = {}
            if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_CITATIONS] = dest_relations[RELATIONSHIP_TYPE_CITATIONS]
            if len(dest_relations[RELATIONSHIP_TYPE_CITED_BYS]) > 0:
                doc[RELATIONSHIP_TYPE_CITED_BYS] = dest_relations[RELATIONSHIP_TYPE_CITED_BYS]
            if len(dest_relations[RELATIONSHIP_TYPE_RELATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_RELATIONS] = dest_relations[RELATIONSHIP_TYPE_RELATIONS]

            count += 1
            bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
            bulk_data += '\n'
            bulk_data += json.dumps({'doc': doc})
            bulk_data += '\n'

            # Flush the buffer once it grows past ~150 KB
            if len(bulk_data) >= 150000:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if not response:
            print 'Bulk data load failed'

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)


# Example usage:
# src_server = 'http://localhost:9200'
# src_index = 'pubmed2018_v5'
# src_type = 'article'

# dest_server = 'http://localhost:9200'
# dest_index = 'pubmed2019'
# dest_type = 'article'

# copy_relations = CopyRelationships(src_server=src_server,
#                                    dest_server=dest_server,
#                                    src_index=src_index,
#                                    src_type=src_type,
#                                    dst_index=dest_index,
#                                    dst_type=dest_type,
#                                    username='',
#                                    password='')

# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })

# copy_relations.run()
# copy_relations.run_for_ids([12620793])
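# A minimal sketch of the relation records merge_relations() combines.
# Entries are matched on ('source', 'index_id') and their 'ids' lists are
# unioned; the sample values below are illustrative only.
def _demo_relation_merge():
    dest = [{'source': '', 'index_id': 'pubmed', 'ids': [1, 2]}]
    src = [{'source': '', 'index_id': 'pubmed', 'ids': [2, 3]},
           {'source': 'xyz', 'index_id': 'other', 'ids': [9]}]

    combined = []
    for relation in dest + src:
        for existing in combined:
            if (existing['source'] == relation['source']
                    and existing['index_id'] == relation['index_id']):
                # Union the id lists of matching entries
                for _id in relation['ids']:
                    if _id not in existing['ids']:
                        existing['ids'].append(_id)
                break
        else:
            combined.append(dict(relation, ids=list(relation['ids'])))

    # combined == [{'source': '', 'index_id': 'pubmed', 'ids': [1, 2, 3]},
    #              {'source': 'xyz', 'index_id': 'other', 'ids': [9]}]
    return combined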