def run(self):
    old_ids = export_doc_ids(server=SERVER, src_index=OLD_INDEX, src_type=OLD_TYPE)
    new_ids = export_doc_ids(server=SERVER, src_index=NEW_INDEX, src_type=NEW_TYPE)

    # Ids present in the old index but missing from the new one
    for _id in old_ids:
        if _id not in new_ids:
            self.missing_ids[_id] = 0
            if len(self.missing_ids) % 1000 == 0:
                print 'Missing ids', len(self.missing_ids)

    # Ids present only in the new index
    for _id in new_ids:
        if _id not in old_ids:
            self.new_ids[_id] = 0
            if len(self.new_ids) % 1000 == 0:
                print 'New ids', len(self.new_ids)

    print 'Missing ids', len(self.missing_ids)
    print 'New ids', len(self.new_ids)

    file_utils.make_directory(missing_ids_directory)
    file_utils.save_file(missing_ids_directory, 'missing_ids.json', self.missing_ids.keys())
    file_utils.save_file(missing_ids_directory, 'new_ids.json', self.new_ids)
def save_docs_with_new_citations(self, docs_with_new_citations, update_file):
    update_record_file_name = self.get_docs_with_new_citations_file_name(update_file)
    file_utils.save_file(self.get_update_records_directory(DIR_DOCS_WITH_NEW_CITATION),
                         update_record_file_name,
                         docs_with_new_citations)
def process_batches(self):
    load_config = self.get_load_config()
    generated_files_directory = load_config.data_source_directory()
    other_files_directory = load_config.other_files_directory()

    batch_file_names = []
    for batch_file_name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    processed_batches = file_utils.load_file(other_files_directory,
                                             'processed_spires_pubmed_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(generated_files_directory, batch_file_name)
            self.process_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(other_files_directory,
                                 'processed_spires_pubmed_batches.json',
                                 processed_batches)
def get_updated_docs(self):
    load_config = self.get_load_config(updates_directory)
    ftp_manager = FTPManager(load_config)

    update_file_urls = ftp_manager.get_update_file_urls()
    update_file_urls = update_file_urls[:2]
    ftp_manager.download_missing_files(file_urls=update_file_urls, no_of_files=2)

    all_files = file_manager.get_all_files(load_config)
    files_to_process = all_files[:2]
    # files_to_process = file_manager.get_new_update_files(load_config, update_file_urls, 2)
    print files_to_process

    for update_file in files_to_process:
        file_name = os.path.basename(update_file)
        self.current_update_file = file_name  # file_name.split('.')[0]

        xml_data_source = XMLDataSource(update_file, 2)
        xml_data_source.process_rows(self.process_row)

    print 'Total updated ids:', len(self.updated_docs)

    file_utils.save_file(self.load_config.other_files_directory(),
                         'updated_docs.json', self.updated_docs)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'inverted_index_for_updated_docs.json',
                         self.inverted_index_for_updated_docs)
def process_batches(self):
    batch_file_names = []
    for batch_file_name in os.listdir(TEMP_DIR):
        file_path = os.path.join(TEMP_DIR, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    print len(batch_file_names)
    raw_input('Continue?')

    processed_batches = file_utils.load_file(TEMP_DIR,
                                             'processed_pubmed2018_docs_with_grants_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(TEMP_DIR, batch_file_name)
            self.copy_docs_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(TEMP_DIR,
                                 'processed_pubmed2018_docs_with_grants_batches.json',
                                 processed_batches)
def filter_and_split_ids_into_batches(load_config):
    other_files_directory = load_config.other_files_directory()
    generated_files_directory = load_config.data_source_directory()

    all_ids = export_doc_ids.get_doc_ids_for_load_config(load_config)
    # total_count = len(all_ids)

    max_batch_count = BATCH_DOC_COUNT
    batch_index = 0
    batch_ids = []

    # Splitting into batches
    for _id in all_ids:
        batch_ids.append(_id)

        if len(batch_ids) >= max_batch_count:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            file_utils.save_file(generated_files_directory, batch_file_name, batch_ids)
            batch_ids = []
            batch_index += 1

    # Write any remaining ids as the final batch
    if len(batch_ids) > 0:
        print 'Writing batch:', batch_index
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        file_utils.save_file(generated_files_directory, batch_file_name, batch_ids)
        batch_index += 1

    print batch_index, 'batches to process'
def split_to_batches(self):
    server = self.src_data_loader_utils.server
    src_index = self.src_data_loader_utils.index
    src_type = self.src_data_loader_utils.type

    print 'Fetching doc ids for', src_index, src_type

    # Match only docs that have at least one nested grant
    query = {
        "nested": {
            "path": "grants",
            "query": {
                "bool": {
                    "must": [
                        {
                            "exists": {
                                "field": "grants"
                            }
                        }
                    ]
                }
            }
        }
    }

    all_pubmed_ids = export_doc_ids.get_doc_ids(server,
                                                src_index,
                                                src_type,
                                                TEMP_DIR,
                                                'pubmed2018_docs_with_grants.json',
                                                query=query)
    # all_pubmed_ids = all_pubmed_ids.keys()
    # all_pubmed_ids.sort()
    self.total_doc_count = len(all_pubmed_ids)

    max_batch_count = 5000
    batch_file_names = []
    batch_index = 0
    batch_ids = []

    # Splitting into batches
    for _id in all_pubmed_ids:
        batch_ids.append(_id)

        if len(batch_ids) >= max_batch_count:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
            batch_ids = []
            batch_index += 1

    # Write any remaining ids as the final batch
    if len(batch_ids) > 0:
        print 'Writing batch:', batch_index
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        batch_file_names.append(batch_file_name)
        file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
        batch_index += 1

    return batch_file_names
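# For context: export_doc_ids.get_doc_ids is defined elsewhere and is not shown here.
# Purely as a sketch of how the nested "grants" query above could be run directly with
# the elasticsearch Python client's scroll helper — this is an assumption about the
# helper's behaviour, not its actual implementation, and fetch_doc_ids_with_grants is
# a hypothetical name.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def fetch_doc_ids_with_grants(server, index, doc_type):
    es = Elasticsearch(server)
    query = {
        "query": {
            "nested": {
                "path": "grants",
                "query": {"bool": {"must": [{"exists": {"field": "grants"}}]}}
            }
        },
        "_source": False  # only _id values are needed, skip document bodies
    }
    # scan() pages through all matching hits using the scroll API
    return [hit['_id'] for hit in scan(es, query=query, index=index, doc_type=doc_type)]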
def start_process_doc_batch(self, batch_file_name):
    print 'Loading batch', batch_file_name
    batch = file_utils.load_file(self.batch_docs_directory(), batch_file_name)

    batch_name = batch_file_name.split('.')[0]
    results = self.process_docs_batch(batch, batch_name)

    file_utils.save_file(self.batch_docs_directory(),
                         RESULTS_FILE_PREFIX + batch_file_name, results)
def create_mapping(self, data_directory, format):
    mapping = {}
    for signature in format:
        format_item = format[signature]
        parents = format_item['parents']
        mapping = self.add_mapping(parents, mapping)

    file_utils.save_file(data_directory, 'mapping.json', mapping)
def save_update_record_for_date(load_config, local_date, update_data, docs_with_new_citations):
    update_records_directory = get_update_records_directory(load_config)
    local_time = datetime.datetime.now().strftime("%H:%M:%S")

    update_record = {
        'local_date': local_date,
        'update_data': update_data,
        'docs_with_new_citations': docs_with_new_citations,
    }

    file_utils.save_file(update_records_directory,
                         'pubmed_update_record_' + local_date + '_' + local_time,
                         update_record)
def save_prospects(self, prospects):
    update_record = {}
    update_record['update_files'] = self.files_to_process
    update_record['prospects'] = prospects
    update_record['date'] = self.local_date_time

    update_records_directory = self.pubmed_updater.get_update_records_directory(DIR_PROSPECTS)
    prospects_file_name = self.get_prospects_file_name()

    file_utils.save_file(update_records_directory, prospects_file_name, update_record)
def save_update_record(self):
    update_records_name = 'pubmed_update_records.json'
    update_records = file_utils.load_file(
        self.pubmed_updater.get_update_records_directory(), update_records_name)
    if len(update_records) == 0:
        update_records = []

    # update_records_for_date = []
    # if self.local_date_time in update_records:
    #     update_records_for_date = update_records[self.local_date_time]
    # update_records_for_date.extend(self.files_to_process)
    # update_records[self.local_date_time] = update_records_for_date

    update_data = self.pubmed_updater.generate_update_summary(self.files_to_process)

    # Build a per-file summary of processed, new and updated articles
    update_file_records = []
    for update_file_path in update_data:
        update_file_name = os.path.basename(update_file_path)
        update_data_for_file = update_data[update_file_path]

        articles_processed = len(update_data_for_file['articles_processed'])
        new_articles = len(update_data_for_file['new_articles'])
        updated_articles = articles_processed - new_articles

        update_file_record_item = {
            'file_name': update_file_name,
            'file_path': update_file_path,
            'total_articles': articles_processed,
            'new_articles': new_articles,
            'updated_articles': updated_articles
        }
        update_file_records.append(update_file_record_item)

    update_record_item = {
        'date': self.local_date_time,
        'update_files': update_file_records
    }
    update_records.append(update_record_item)

    file_utils.save_file(self.pubmed_updater.get_update_records_directory(),
                         update_records_name, update_records)

    # Save processed files list
    file_manager.update_processed_files(self.get_load_config(), self.files_to_process)
def set_config(self):
    config = {}
    config['root_directory'] = self.root_directory
    config['index_id'] = self.index_id
    config['server'] = self.server
    config['server_username'] = self.server_username
    config['server_password'] = self.server_password
    config['index'] = self.index
    config['type'] = self.type
    config['src_data_exists'] = self.src_data_exists
    config['src_data_directory'] = self.src_data_directory
    config['local_date_time'] = self.local_date_time

    file_utils.save_file(DATA_LOADING_DIRECTORY, self.config_file, config)
    return config
def save_new_pmids(self, update_files):
    load_config = self.load_manager.get_load_config()
    self.existing_pubmed_ids = file_utils.load_file(
        load_config.other_files_directory(), ALL_PUBMED_IDS_FILE)

    update_summary = self.generate_update_summary(update_files)

    # Merge every processed article id into the known pubmed id set
    for update_file in update_summary:
        update_summary_for_file = update_summary[update_file]
        articles_processed = update_summary_for_file['articles_processed']
        for _id in articles_processed:
            self.existing_pubmed_ids[_id] = None

    file_utils.save_file(load_config.other_files_directory(),
                         ALL_PUBMED_IDS_FILE, self.existing_pubmed_ids)
def run_for_ids(self, doc_ids, mapping=None):
    self.processed_doc_count = 0
    self.total_doc_count = len(doc_ids)

    print 'Total doc count', self.total_doc_count
    print 'Fetching docs from source index'

    batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch, 1000, 1, 0)
    batch_doc_processor.run()

    file_utils.save_file('/data/data_loading/pubmed_2019',
                         'missing_docs_pubmed2019.json',
                         self.missing_destination_ids)
def get_all_members(self):
    result_items = []

    params = {}
    params['rows'] = self.no_of_rows

    url = 'members'
    self.cursor = None

    request = self.create_request(url, params)
    response = self.perform_request(request)
    result_items = self.get_result_items_from_response(response)

    print 'Total members', len(result_items)
    file_utils.save_file('/data_loading', 'crossref_members.json', result_items)

    return result_items
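# For illustration only: create_request / perform_request are defined elsewhere in this
# class. Assuming the standard CrossRef REST API at api.crossref.org, the request built
# above is roughly equivalent to the direct call sketched below; fetch_crossref_members
# is a hypothetical name, not part of the codebase.
import requests

def fetch_crossref_members(no_of_rows=100):
    response = requests.get('https://api.crossref.org/members',
                            params={'rows': no_of_rows},
                            timeout=30)
    response.raise_for_status()
    # CrossRef responses wrap results as {"message": {"items": [...], ...}}
    return response.json()['message']['items']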
def run(self):
    xml_data_source = XMLDataSource(self.baseline_file, 2)
    xml_data_source.process_rows(self.process_baseline_row)

    print len(self.original_docs), self.current_baseline_file

    if len(self.original_docs) > 0:
        file_utils.save_file(self.load_config.generated_files_directory(),
                             'original_docs_' + self.current_baseline_file + '.json',
                             self.original_docs)
        file_utils.save_file(self.load_config.generated_files_directory(),
                             'inverted_index_' + self.current_baseline_file + '.json',
                             self.inverted_index)
def split_to_batches(self):
    load_config = self.get_load_config()
    other_files_directory = load_config.other_files_directory()

    # appl_id__pmid__mapping = file_utils.unpickle_file(other_files_directory, 'appl_id__pmid__mapping.json')
    pmid__appl_id__mapping = file_utils.unpickle_file(other_files_directory,
                                                      'pmid__appl_id__mapping.json')

    pubmed_ids = {}
    pubmed_ids = load_config.data_mapper.reformat(
        reformatted_array=pubmed_ids,
        relations_array=pmid__appl_id__mapping,
        dest_index_id=ID_IRDB,
        relationship_type=RELATIONSHIP_TYPE_CITATIONS)

    generated_files_directory = load_config.data_source_directory()

    pubmed_id_keys = pubmed_ids.keys()
    pubmed_id_keys.sort()

    max_batch_count = 10000
    batch_index = 0
    batch_ids = {}
    batch_file_names = []

    # Splitting into batches
    for _id in pubmed_id_keys:
        batch_ids[_id] = pubmed_ids[_id]

        if len(batch_ids) >= max_batch_count:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(generated_files_directory, batch_file_name, batch_ids)
            batch_ids = {}
            batch_index += 1

    if len(batch_ids) > 0:
        print 'Writing batch:', batch_index
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        batch_file_names.append(batch_file_name)
        file_utils.save_file(generated_files_directory, batch_file_name, batch_ids)
        batch_index += 1

    return batch_file_names
def process_irdb_relations(self):
    generated_files_directory = self.load_config.data_source_directory()

    all_ids = file_utils.load_file(generated_files_directory, self.batch_file_name)

    processed_count = 0
    batch_count = 0

    ids_to_update = file_utils.load_file(generated_files_directory,
                                         'ids_to_update_' + self.batch_file_name)
    processed_ids = file_utils.load_file(generated_files_directory,
                                         'processed_ids_' + self.batch_file_name)

    # Skip ids that have already been processed
    filtered_ids = []
    for _id in all_ids:
        if _id not in processed_ids:
            filtered_ids.append(_id)

    print 'Processing', self.batch_file_name, len(filtered_ids), 'ids'

    self.batch_fetch_docs(filtered_ids, ID_IRDB)

    for _id in filtered_ids:
        processed_ids[_id] = ''
        processed_count += 1
        batch_count += 1

        if processed_count % 500 == 0:
            print 'Processing irdb', _id
            # print 'Processing', processed_count, '/', total_count

        derwent_ids = self.process_id(_id)
        if len(derwent_ids) > 0:
            # print ex_rl
            ids_to_update[_id] = derwent_ids

        # Periodically checkpoint progress
        if batch_count >= 500:
            batch_count = 0
            file_utils.save_file(generated_files_directory,
                                 'ids_to_update_' + self.batch_file_name, ids_to_update)
            file_utils.save_file(generated_files_directory,
                                 'processed_ids_' + self.batch_file_name, processed_ids)

    file_utils.save_file(generated_files_directory,
                         'ids_to_update_' + self.batch_file_name, ids_to_update)
    file_utils.save_file(generated_files_directory,
                         'processed_ids_' + self.batch_file_name, processed_ids)
    # file_utils.save_file(generated_files_directory, 'missing_pubmed_ids_' + self.batch_file_name, self.missing_pubmed_ids)

    print 'Docs to update:', len(ids_to_update)
def run(self):
    self.processed_doc_count = 0
    self.total_doc_count = self.get_total_doc_count()
    print 'Total doc count', self.total_doc_count

    # self.create_destination_index(mapping=None)

    self.export_doc_ids(server=self.src_data_loader_utils.server,
                        src_index=self.src_data_loader_utils.index,
                        src_type=self.src_data_loader_utils.type)

    print 'saving missing docs'
    file_utils.save_file('/data/data_loading/pubmed_2019',
                         'missing_docs_pubmed2019.json',
                         self.missing_destination_ids)
def combine_original_docs(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('original_docs_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Original docs', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'original_docs.json', combined)
def combine_inverted_index(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('inverted_index_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Inverted index', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'inverted_index.json', combined)
def process(self, data_directory):
    for name in os.listdir(data_directory):
        file_path = os.path.join(data_directory, name)
        if os.path.isfile(file_path) and name.endswith('.xml'):
            print 'Parsing file:', file_path
            xmltodict.parse(open(file_path),
                            item_depth=1,
                            item_callback=self.handle_row)

    file_utils.save_file(data_directory, 'schema.json', self.format)

    # ft = {
    #     "test": {
    #         "data": ["test"],
    #         "parents": ["clinical_results", "outcome_list", "outcome", "analysis_list", "analysis", "param_type"]
    #     }
    # }

    self.create_mapping(data_directory, self.format)
def split_to_batches(self):
    print 'Fetching doc ids for', self.load_config.index, self.load_config.type

    query = self.get_query()
    print json.dumps(query)

    all_ids = export_doc_ids.get_doc_ids(self.load_config.server,
                                         self.load_config.index,
                                         self.load_config.type,
                                         self.batch_docs_directory(),
                                         ALL_IDS_FILE,
                                         query=query)
    all_ids = all_ids.keys()
    all_ids.sort()

    print 'all_ids', len(all_ids)

    batch_file_names = []
    batch_index = 0
    batch_ids = []

    # Splitting into batches
    for _id in all_ids:
        batch_ids.append(_id)

        if len(batch_ids) >= self.batch_doc_count:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(self.batch_docs_directory(), batch_file_name, batch_ids)
            batch_ids = []
            batch_index += 1

    if len(batch_ids) > 0:
        print 'Writing batch:', batch_index
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        batch_file_names.append(batch_file_name)
        file_utils.save_file(self.batch_docs_directory(), batch_file_name, batch_ids)
        batch_index += 1

    return batch_file_names
def process_file(self):
    file_name = os.path.basename(self.update_file)
    self.load_config.data_source_name = file_name.split('.')[0]

    print self.update_file

    data_source = XMLDataSource(self.update_file, 2)
    data_source.process_rows(self.process_row)

    print self.update_file
    print 'Docs with citations:', len(self.docs_with_citations)
    print 'New Docs:', len(self.new_docs)
    print 'Total Docs:', len(self.total_ids)

    if len(self.docs_with_citations) > 0:
        file_utils.make_directory("docs_with_citations")
        file_utils.save_file("docs_with_citations",
                             self.load_config.data_source_name + '.json',
                             self.docs_with_citations)
def process_docs(self, docs):
    print 'Processing docs', len(docs)

    citation_errors = {}
    for _id in docs:
        # print 'Processing doc', _id
        doc = docs[_id]

        citations_from_update_history = self.get_citations_from_data(doc)
        current_citations = self.get_citations(doc)

        # Flag docs whose stored citations disagree with the update history
        if len(current_citations) != len(citations_from_update_history):
            citation_errors[_id] = citations_from_update_history
            print _id, 'current citations:', len(current_citations), \
                'citations from update history:', len(citations_from_update_history)

    file_utils.save_file(self.batch_docs_directory,
                         'citation_errors_' + self.batch_name + '.json',
                         citation_errors)
def run(self):
    # self.get_updated_docs()
    self.updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'updated_docs.json')
    print 'Updated docs:', len(self.updated_docs)

    # self.get_original_docs()
    # sys.exit(1)
    self.original_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'original_docs.json')
    self.inverted_index = file_utils.load_file(
        self.load_config.other_files_directory(), 'inverted_index.json')
    self.inverted_index_for_updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'inverted_index_for_updated_docs.json')

    print 'Updated docs:', len(self.updated_docs)
    print 'Original docs:', len(self.original_docs)
    print 'Inverted index:', len(self.inverted_index)
    print 'inverted_index_for_updated_docs:', len(self.inverted_index_for_updated_docs)

    # print json.dumps(self.inverted_index_for_updated_docs)
    # input = raw_input('Continue?')
    # if input.lower() in ['n', 'no', '0']:
    #     sys.exit(1)

    self.update_docs()

    print 'Docs with updates', len(self.docs_with_updates)
    # print json.dumps(self.docs_with_updates)

    print 'Missing docs'
    print json.dumps(self.missing_docs.keys())

    file_utils.save_file(self.load_config.other_files_directory(),
                         'docs_with_updates.json', self.docs_with_updates)
def send_notifications(self, prospects):
    load_config = self.get_load_config()
    email_client = EmailClient(load_config)

    # print 'Prospects', all_prospects
    # self.get_logger().info('Prospects ' + str(prospects))

    failed_prospects = []
    local_date = self.local_date_time.split(' ')[0]

    # Send email notifications
    for prospect in prospects:
        problems = email_client.send_notification_for_prospect(prospect, local_date)
        if len(problems) > 0:
            failed_prospects.append({
                'problems': problems,
                'prospect': prospect
            })

    # Dump failed prospects to file
    if len(failed_prospects) > 0:
        file_utils.save_file(self.get_load_config().other_files_directory(),
                             'failed_prospects.json', failed_prospects)
def load(self):
    generated_files_directory = self.load_config.data_source_directory()

    print 'Processing batch', self.batch_file_name

    processes_ids_file_name = 'processed_ids_' + self.batch_file_name
    ids_to_update_file_name = 'ids_to_update_' + self.batch_file_name

    # Get processed ids
    processed_ids = file_utils.load_file(generated_files_directory, processes_ids_file_name)
    if processed_ids is None or len(processed_ids) == 0:
        print 'Processed ids file not found, aborting...'
        return

    # Get batch ids
    batch_ids = file_utils.load_file(generated_files_directory, self.batch_file_name)
    if batch_ids is None or len(batch_ids) == 0:
        print 'Batch ids not found, aborting...'
        return

    # Continue processing
    print 'Processed ids count:', len(processed_ids), ' ~ ', len(batch_ids)
    if len(processed_ids) != len(batch_ids):
        print 'Processing not finished, aborting...'
        return
    else:
        print 'Processing complete for', self.batch_file_name, ', proceeding with data load...'

    ids_to_update = file_utils.load_file(generated_files_directory, ids_to_update_file_name)

    # Get the loaded ids
    loaded_ids = self.get_loaded_ids(self.reports_directory)

    total_count = len(ids_to_update)
    count = 0

    # Skip ids that have already been loaded
    filtered_ids = []
    for _id in ids_to_update:
        if _id not in loaded_ids:
            filtered_ids.append(_id)

    print 'Ids to update:', len(filtered_ids)

    reformatted_array = {}
    for _id in filtered_ids:
        count += 1

        derwent_ids = ids_to_update[_id]
        uspto_ids = []
        for derwent_id in derwent_ids:
            uspto_id = derwent_id.replace('DP', '')
            if uspto_id in self.uspto_ids:
                uspto_ids.append(uspto_id)

        if _id not in reformatted_array:
            reformatted_array[_id] = []

        if len(derwent_ids) > 0:
            relationship = {
                'index_id': ID_DERWENT_PATENTS,
                'ids': derwent_ids,
                'type': RELATIONSHIP_TYPE_RELATIONS
            }
            reformatted_array[_id].append(relationship)

        if len(uspto_ids) > 0:
            relationship = {
                'index_id': ID_USPTO,
                'ids': uspto_ids,
                'type': RELATIONSHIP_TYPE_RELATIONS
            }
            reformatted_array[_id].append(relationship)

        # print 'Reformatted ids', reformatted_array
        # time.sleep(10)
        # if len(uspto_ids) > 0:
        #     print _id, len(derwent_ids)
        #     print derwent_ids
        #     print uspto_ids
        #     time.sleep(4)

    # print extended_relations

    if len(reformatted_array) > 0:
        self.load_ids(reformatted_array)
        file_utils.save_file(self.reports_directory,
                             'loaded_ids_' + self.batch_file_name,
                             reformatted_array.keys())
def set_downloaded_files(other_files_directory, downloaded_files):
    file_utils.save_file(other_files_directory, USPTO_DOWNLOADED_FILES, downloaded_files)
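# The file_utils helpers used throughout these loaders (save_file, load_file,
# make_directory) are defined elsewhere. A rough sketch only, assuming they persist
# plain JSON (the '.json' file names above suggest this) and that a missing file
# loads as an empty dict; the real module may handle pickling, encoding or errors
# differently.
import json
import os

def make_directory(directory):
    # Create the directory if it does not already exist
    if not os.path.exists(directory):
        os.makedirs(directory)

def save_file(directory, file_name, data):
    # Serialize data as JSON under directory/file_name
    make_directory(directory)
    with open(os.path.join(directory, file_name), 'w') as output_file:
        json.dump(data, output_file)

def load_file(directory, file_name):
    # Return the parsed JSON contents, or an empty dict if the file is missing
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        return {}
    with open(file_path) as input_file:
        return json.load(input_file)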