def process_batches(self):
    # Collect previously generated batch files from the temp directory
    batch_file_names = []
    for batch_file_name in os.listdir(TEMP_DIR):
        file_path = os.path.join(TEMP_DIR, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    # Split the source data into batches if none exist yet
    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    print len(batch_file_names)
    raw_input('Continue?')

    # Skip batches that were already processed in a previous run
    processed_batches = file_utils.load_file(
        TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(TEMP_DIR, batch_file_name)
            self.copy_docs_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(
                TEMP_DIR,
                'processed_pubmed2018_docs_with_grants_batches.json',
                processed_batches)
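# Nearly every function in this file goes through file_utils.load_file /
# file_utils.save_file. That module is not shown here; the sketch below is a
# minimal, assumed implementation inferred from how the callers use it
# (load_file returns parsed JSON, or an empty dict when the file is missing;
# save_file writes JSON and creates the directory if needed). The real module
# may differ.
import json
import os


def load_file(directory, file_name):
    # Return the parsed JSON contents of directory/file_name, or {} if the
    # file does not exist (callers check `len(...) == 0` / `is None`).
    file_path = os.path.join(directory, file_name)
    if not os.path.isfile(file_path):
        return {}
    with open(file_path) as f:
        return json.load(f)


def save_file(directory, file_name, data):
    # Serialize data as JSON into directory/file_name.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, file_name), 'w') as f:
        json.dump(data, f)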
def process_batches(self):
    load_config = self.get_load_config()
    generated_files_directory = load_config.data_source_directory()
    other_files_directory = load_config.other_files_directory()

    batch_file_names = []
    for batch_file_name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    processed_batches = file_utils.load_file(
        other_files_directory, 'processed_spires_pubmed_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(generated_files_directory,
                                         batch_file_name)
            self.process_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(other_files_directory,
                                 'processed_spires_pubmed_batches.json',
                                 processed_batches)
def process_irdb_relations(self):
    generated_files_directory = self.load_config.data_source_directory()
    all_ids = file_utils.load_file(generated_files_directory,
                                   self.batch_file_name)

    processed_count = 0
    batch_count = 0

    # Resume support: ids already processed or queued for update in a previous run
    ids_to_update = file_utils.load_file(
        generated_files_directory, 'ids_to_update_' + self.batch_file_name)
    processed_ids = file_utils.load_file(
        generated_files_directory, 'processed_ids_' + self.batch_file_name)

    filtered_ids = []
    for _id in all_ids:
        if _id not in processed_ids:
            filtered_ids.append(_id)

    print 'Processing', self.batch_file_name, len(filtered_ids), 'ids'

    self.batch_fetch_docs(filtered_ids, ID_IRDB)

    for _id in filtered_ids:
        processed_ids[_id] = ''
        processed_count += 1
        batch_count += 1

        if processed_count % 500 == 0:
            print 'Processing irdb', _id
            # print 'Processing', processed_count, '/', total_count

        derwent_ids = self.process_id(_id)
        if len(derwent_ids) > 0:
            # print ex_rl
            ids_to_update[_id] = derwent_ids

        # Checkpoint progress every 500 ids
        if batch_count >= 500:
            batch_count = 0
            file_utils.save_file(generated_files_directory,
                                 'ids_to_update_' + self.batch_file_name,
                                 ids_to_update)
            file_utils.save_file(generated_files_directory,
                                 'processed_ids_' + self.batch_file_name,
                                 processed_ids)

    file_utils.save_file(generated_files_directory,
                         'ids_to_update_' + self.batch_file_name, ids_to_update)
    file_utils.save_file(generated_files_directory,
                         'processed_ids_' + self.batch_file_name, processed_ids)
    # file_utils.save_file(generated_files_directory,
    #                      'missing_pubmed_ids_' + self.batch_file_name,
    #                      self.missing_pubmed_ids)

    print 'Docs to update..............................................', len(ids_to_update)
def process_file(load_config, batch_file_name):
    generated_files_directory = load_config.generated_files_directory()
    # print 'Processing batch', batch_file_name

    processes_ids_file_name = 'processed_ids_' + batch_file_name
    ids_to_update_file_name = 'ids_to_update_' + batch_file_name

    # Get processed ids
    processed_ids = file_utils.load_file(generated_files_directory,
                                         processes_ids_file_name)
    if processed_ids is None or len(processed_ids) == 0:
        print 'Processed ids file not found, aborting...'
        return

    # Get batch ids
    batch_ids = file_utils.load_file(generated_files_directory, batch_file_name)
    if batch_ids is None or len(batch_ids) == 0:
        print 'Batch ids not found, aborting...'
        return

    # Continue processing
    # print batch_file_name, 'Processed ids count:', len(processed_ids), ' ~ ', len(batch_ids)
    # if len(processed_ids) != len(batch_ids):
    #     print 'Processing not finished, aborting...'
    #     return
    # else:
    #     print 'Processing complete for', self.batch_file_name, ', proceeding with data load...'

    ids_to_update = file_utils.load_file(generated_files_directory,
                                         ids_to_update_file_name)

    total_count = len(ids_to_update)
    count = 0

    reformatted_array = {}
    for _id in ids_to_update:
        count += 1
        derwent_ids = ids_to_update[_id]

        if _id not in reformatted_array:
            reformatted_array[_id] = []

        if len(derwent_ids) > 0:
            relationship = {
                'index_id': ID_DERWENT_PATENTS,
                'ids': derwent_ids,
                'type': RELATIONSHIP_TYPE_RELATIONS
            }
            reformatted_array[_id].append(relationship)

    # print 'Reformatted ids', len(reformatted_array)
    return reformatted_array, ids_to_update
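# For illustration only: the shape of one entry in the dict that process_file
# above returns. The doc id and Derwent ids are invented; the constants are the
# ones referenced in the function.
#
# {
#     'some_irdb_id': [
#         {
#             'index_id': ID_DERWENT_PATENTS,
#             'ids': ['derwent_id_1', 'derwent_id_2'],
#             'type': RELATIONSHIP_TYPE_RELATIONS
#         }
#     ]
# }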
def verify_citations():
    load_config = pubmed_load_config.get_load_config()
    load_config.process_count = 1

    FTPManager(load_config).download_new_update_files()

    print 'Loading pubmed ids...'
    doc_ids = file_utils.load_file(load_config.index,
                                   load_config.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids(load_config.server, load_config.index,
                                 load_config.type, load_config.index,
                                 load_config.index + '_ids.json')
    print len(doc_ids), 'Total pubmed ids'

    total_new_update_files = get_all_update_files(load_config)

    filtered_update_files = []
    for update_file in total_new_update_files:
        if '1010.xml' in update_file:
            filtered_update_files.append(update_file)

    print 'Total update files:', len(filtered_update_files)
    print filtered_update_files

    if len(filtered_update_files) > 0:
        for new_update_file in filtered_update_files:
            print 'Processing file:', new_update_file
            # Renamed from `verify_citations` to avoid shadowing this function
            verifier = VerifyCitations(new_update_file, doc_ids)
            verifier.process_file()
def load_update_summary(self, update_file):
    update_record_file_name = self.get_update_summary_file_name(update_file)
    # print 'update_record_file_name', update_record_file_name
    return file_utils.load_file(
        self.get_update_records_directory(DIR_UPDATE_SUMMARY),
        update_record_file_name)
def run():
    directory = '/data/data_loading/scripts/data-load-n/data_load/irdb/new_sample_data'
    file_format = '.json'

    missing_fields = {}
    for name in os.listdir(directory):
        file_path = os.path.join(directory, name)
        if os.path.isfile(file_path) and name.endswith(file_format):
            data = file_utils.load_file(directory, name)
            if name in fields_mapping:
                fields = fields_mapping[name]

                missing_fields_for_file = []
                for data_item in data:
                    for field in fields:
                        if field not in data_item:
                            missing_fields_for_file.append(field)
                            break

                missing_fields[name] = missing_fields_for_file

    for name in missing_fields:
        print name
        print missing_fields[name]
        print '-----------------------------'
def get_downloaded_files(load_config):
    other_files_directory = load_config.other_files_directory()
    downloaded_files = file_utils.load_file(other_files_directory,
                                            GRANTS_DOWNLOADED_FILES)
    if len(downloaded_files) == 0:
        downloaded_files = []
    return downloaded_files
def get_downloaded_update_file_urls(self):
    other_files_directory = self.load_config.other_files_directory()
    downloaded_update_file_urls = file_utils.load_file(
        other_files_directory, DOWNLOADED_UPDATE_FILES)
    if len(downloaded_update_file_urls) == 0:
        return []
    return downloaded_update_file_urls
def process_update_record(load_config, update_record_name):
    logger = log_utils.create_logger('pubmed2018_update',
                                     load_config.log_files_directory())
    logger.info('Loading update record: ' + str(update_record_name))

    update_records_directory = get_update_records_directory(load_config)
    update_record = file_utils.load_file(update_records_directory,
                                         update_record_name)

    local_date = update_record['local_date']
    update_data = update_record['update_data']
    docs_with_new_citations = update_record['docs_with_new_citations']

    logger.info('Update record loaded')
    logger.info('Date: ' + str(local_date))
    logger.info('Update files: ')
    for update_file in update_data:
        logger.info(update_file)
    logger.info('Docs with new citations: ' + str(len(docs_with_new_citations)))

    all_prospects = send_prospective_citations_notifications(
        logger, docs_with_new_citations)

    # Send update notification
    logger.info('Sending update status mail...')
    EmailClient.send_update_notifications(local_date, update_data, all_prospects)
    logger.info('Done')
def load_tasks_list(self):
    # print 'Loading tasks list', self.root_directory
    tasks_list = file_utils.load_file(self.root_directory, 'tasks_list.json')
    if len(tasks_list) == 0:
        tasks_list = []
    return tasks_list
def run():
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_PATENT_RELATIONS

    data_source_batch_name = 'loaded_ids'
    data_source_batch_directory = load_config.data_source_batch_directory(
        data_source_batch_name)

    all_updated_ids = {}
    all_indexed_ids = {}
    all_failed_ids = {}

    for name in os.listdir(data_source_batch_directory):
        file_path = os.path.join(data_source_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_LOADER_BATCH_PREFIX):
            # print 'processing file:', file_path
            batch_data = file_utils.load_file(data_source_batch_directory, name)

            updated_ids = batch_data['updated_ids']
            indexed_ids = batch_data['indexed_ids']
            failed_ids = batch_data['failed_ids']

            for _id in updated_ids:
                all_updated_ids[_id] = 0
            for _id in indexed_ids:
                all_indexed_ids[_id] = 0
            for _id in failed_ids:
                all_failed_ids[_id] = 0

    print len(all_failed_ids), 'all_failed_ids'
    print len(all_indexed_ids), 'all_indexed_ids'
    print len(all_updated_ids), 'all_updated_ids'
def process_completed(self):
    # if len(self.citation_errors) == 0:
    #     self.citation_errors = file_utils.load_file(self.batch_docs_directory(),
    #                                                 'citation_errors.json')
    # print len(self.citation_errors), 'citation errors'
    # print self.citation_errors.keys()
    # file_utils.save_file(self.batch_docs_directory(), 'citation_errors.json',
    #                      self.citation_errors)

    batch_file_names = []
    for batch_file_name in os.listdir(self.batch_docs_directory()):
        file_path = os.path.join(self.batch_docs_directory(), batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith(
                'citation_errors_batch_'):
            batch_file_names.append(batch_file_name)

    citation_errors = {}
    for batch_file_name in batch_file_names:
        print 'Loading batch', batch_file_name
        batch = file_utils.load_file(self.batch_docs_directory(), batch_file_name)
        for _id in batch:
            citation_errors[_id] = batch[_id]

    print len(citation_errors), 'citation errors'
    print citation_errors.keys()

    raw_input('Load Citations?')
def start_process_doc_batch(self, batch_file_name):
    print 'Loading batch', batch_file_name
    batch = file_utils.load_file(self.batch_docs_directory(), batch_file_name)
    batch_name = batch_file_name.split('.')[0]
    results = self.process_docs_batch(batch, batch_name)
    file_utils.save_file(self.batch_docs_directory(),
                         RESULTS_FILE_PREFIX + batch_file_name, results)
def get_processed_files(load_config):
    other_files_directory = load_config.other_files_directory()
    processed_file_urls = file_utils.load_file(other_files_directory,
                                               PROCESSED_UPDATE_FILES)
    if len(processed_file_urls) == 0:
        return []
    return processed_file_urls
def process_batch(self, load_config, batch_file_name):
    generated_files_directory = load_config.generated_files_directory()
    ids_to_update_file_name = 'ids_to_update_' + batch_file_name
    ids_to_update = file_utils.load_file(generated_files_directory,
                                         ids_to_update_file_name)
    return ids_to_update
def get_loaded_ids(self, reports_directory):
    loaded_ids = {}
    for name in os.listdir(reports_directory):
        file_path = os.path.join(reports_directory, name)
        if os.path.isfile(file_path) and name.startswith('loaded_ids_'):
            # print 'processing file:', name
            batch_data = file_utils.load_file(reports_directory, name)
            for _id in batch_data:
                loaded_ids[_id] = 0
    return loaded_ids
def save_update_record(self):
    update_records_name = 'pubmed_update_records.json'
    update_records = file_utils.load_file(
        self.pubmed_updater.get_update_records_directory(), update_records_name)
    if len(update_records) == 0:
        update_records = []

    # update_records_for_date = []
    # if self.local_date_time in update_records:
    #     update_records_for_date = update_records[self.local_date_time]
    # update_records_for_date.extend(self.files_to_process)
    # update_records[self.local_date_time] = update_records_for_date

    update_data = self.pubmed_updater.generate_update_summary(
        self.files_to_process)

    update_file_records = []
    for update_file_path in update_data:
        update_file_name = os.path.basename(update_file_path)
        update_data_for_file = update_data[update_file_path]

        articles_processed = len(update_data_for_file['articles_processed'])
        new_articles = len(update_data_for_file['new_articles'])
        updated_articles = articles_processed - new_articles

        update_file_record_item = {
            'file_name': update_file_name,
            'file_path': update_file_path,
            'total_articles': articles_processed,
            'new_articles': new_articles,
            'updated_articles': updated_articles
        }
        update_file_records.append(update_file_record_item)

    update_record_item = {
        'date': self.local_date_time,
        'update_files': update_file_records
    }
    update_records.append(update_record_item)

    file_utils.save_file(self.pubmed_updater.get_update_records_directory(),
                         update_records_name, update_records)

    # Save processed files list
    file_manager.update_processed_files(self.get_load_config(),
                                        self.files_to_process)
def run(self):
    # doc_ids = export_doc_ids(self.server, self.index,
    #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
    doc_ids = file_utils.load_file(self.ct_load_config.index,
                                   self.ct_load_config.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(self.ct_load_config.server,
                                                self.ct_load_config.index,
                                                self.ct_load_config.type)

    doc_ids = doc_ids.keys()

    self.data_utils.batch_fetch_docs_for_ids(
        base_url=self.ct_load_config.server,
        ids=doc_ids,
        index=self.ct_load_config.index,
        type=self.ct_load_config.type,
        docs_fetched=self.docs_fetched)

    print 'Total pubmed relations', len(self.pubmed_relations)
    print 'Total ct relations', len(self.ct_relations)

    # Load Pubmed relations
    pubmed_ids = {}
    pubmed_ids = data_mapper.reformat(reformatted_array=pubmed_ids,
                                      relations_array=self.pubmed_relations,
                                      dest_index_id=ID_CLINICAL_TRIALS,
                                      relationship_type=RELATIONSHIP_TYPE_CITATIONS)
    print 'Reformatted pubmed ids', len(pubmed_ids)

    self.pubmed_load_config.append_relations = True
    self.pubmed_load_config.source = 'ct_publications'
    self.pubmed_load_config.data_source_name = 'ct_publications_relations'

    data_load_batcher = DataLoadBatcher(self.pubmed_load_config,
                                        self.pubmed_load_config.index,
                                        self.pubmed_load_config.type)
    data_load_batcher.load_relationships = True
    data_load_batcher.process_data_rows('pubmed_ct_citations', pubmed_ids)

    # Load Clinical trials relations
    ct_ids = {}
    ct_ids = data_mapper.reformat(reformatted_array=ct_ids,
                                  relations_array=self.ct_relations,
                                  dest_index_id=ID_PUBMED,
                                  relationship_type=RELATIONSHIP_TYPE_CITED_BYS)
    print 'Reformatted ct ids', len(ct_ids)

    self.ct_load_config.append_relations = True
    self.ct_load_config.source = 'ct_publications'
    self.ct_load_config.data_source_name = 'ct_publications_relations'

    data_load_batcher = DataLoadBatcher(self.ct_load_config,
                                        self.ct_load_config.index,
                                        self.ct_load_config.type)
    data_load_batcher.load_relationships = True
    data_load_batcher.process_data_rows('ct_pubmed_cited_bys', ct_ids)
def run(self):
    # self.get_updated_docs()
    self.updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'updated_docs.json')

    # self.get_original_docs()
    # sys.exit(1)
    self.original_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'original_docs.json')

    print 'Updated docs:', len(self.updated_docs)
    print 'Original docs:', len(self.original_docs)

    self.inverted_index = file_utils.load_file(
        self.load_config.other_files_directory(), 'inverted_index.json')
    self.inverted_index_for_updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(),
        'inverted_index_for_updated_docs.json')

    print 'Inverted index:', len(self.inverted_index)
    print 'inverted_index_for_updated_docs:', len(
        self.inverted_index_for_updated_docs)

    # print json.dumps(self.inverted_index_for_updated_docs)
    # input = raw_input('Continue?')
    # if input.lower() in ['n', 'no', '0']:
    #     sys.exit(1)

    self.update_docs()

    print 'Docs with updates', len(self.docs_with_updates)
    # print json.dumps(self.docs_with_updates)

    print 'Missing docs'
    print json.dumps(self.missing_docs.keys())

    file_utils.save_file(self.load_config.other_files_directory(),
                         'docs_with_updates.json', self.docs_with_updates)
def save_new_pmids(self, update_files):
    load_config = self.load_manager.get_load_config()
    self.existing_pubmed_ids = file_utils.load_file(
        load_config.other_files_directory(), ALL_PUBMED_IDS_FILE)

    update_summary = self.generate_update_summary(update_files)
    for update_file in update_summary:
        update_summary_for_file = update_summary[update_file]
        articles_processed = update_summary_for_file['articles_processed']
        for _id in articles_processed:
            self.existing_pubmed_ids[_id] = None

    file_utils.save_file(load_config.other_files_directory(),
                         ALL_PUBMED_IDS_FILE, self.existing_pubmed_ids)
def get_doc_ids(load_config, index_id):
    other_files_directory = load_config.other_files_directory()
    file_name = 'DOC_IDS_' + index_id + '.json'
    print 'Loading', file_name

    doc_ids = file_utils.load_file(other_files_directory, file_name)
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(
            server=SERVER,
            src_index=INDEX_MAPPING[index_id]['index'],
            src_type=INDEX_MAPPING[index_id]['type'],
            dest_dir=other_files_directory,
            dest_file_name=file_name)

    return doc_ids
def load_prospects(self):
    prospects_file_name = self.get_prospects_file_name()
    update_records_directory = self.pubmed_updater.get_update_records_directory(
        DIR_PROSPECTS)

    self.get_logger().info('Loading prospects...' + update_records_directory +
                           ' ' + prospects_file_name)

    update_record = file_utils.load_file(update_records_directory,
                                         prospects_file_name)
    if 'prospects' in update_record:
        return update_record['prospects']
    else:
        return None
def check_tags_and_annotations(self):
    missing_ids = file_utils.load_file(missing_ids_directory, 'missing_ids.json')
    new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

    print 'Missing ids', len(missing_ids)
    print 'New ids', len(new_ids)

    docs_with_tags = self.fetch_ids()

    missing_docs_with_tags = []
    for _id in missing_ids:
        if _id in docs_with_tags:
            missing_docs_with_tags.append(_id)
            print 'Missing docs with tags', _id

    print 'Missing docs with tags', len(missing_docs_with_tags)
    print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

    for _id in missing_docs_with_tags:
        existing_doc = self.get_existing_doc(_id)
        if 'userTags' in existing_doc:
            user_tags = existing_doc['userTags']
            for user_tag in user_tags:
                added_by = user_tag['added_by']
                if added_by == '*****@*****.**':
                    self.docs_for_dolan[_id] = existing_doc
                    print _id
                    print user_tags
                    break

    print 'Docs for Dolan', len(self.docs_for_dolan)
    print 'Docs for Dolan', self.docs_for_dolan.keys()
def run(self):
    # doc_ids = export_doc_ids(self.server, self.index,
    #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
    doc_ids = file_utils.load_file(self.index, self.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(self.server, self.index,
                                                self.type)

    doc_ids = doc_ids.keys()

    batch_doc_processor = BatchDocProcessor(doc_ids, self.process_batch,
                                            self.batch_size,
                                            self.process_count,
                                            self.process_spawn_delay)
    batch_doc_processor.run()
def combine_original_docs(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('original_docs_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Original docs', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'original_docs.json', combined)
def run():
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_EXTENDED_RELATIONS

    generated_files_directory = load_config.generated_files_directory()

    batch_file_names = []
    for batch_file_name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    for batch_file_name in batch_file_names:
        batch = file_utils.load_file(generated_files_directory, batch_file_name)
        if len(batch) < 1000:
            print batch_file_name, len(batch)
def combine_inverted_index(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('inverted_index_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Inverted index', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'inverted_index.json', combined)
def analyse_failed_docs_in_batch(data_loader_batch_directory):
    # print '...Processing', data_loader_batch_directory
    for name in os.listdir(data_loader_batch_directory):
        file_path = os.path.join(data_loader_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith('failed_docs_'):
            failed_docs = file_utils.load_file(data_loader_batch_directory, name)
            print file_path, '- Failed docs', len(failed_docs)

            if len(failed_docs) > 0:
                a = raw_input('List docs? (y/n)')
                if a.lower() in ['y', 'yes']:
                    for _id in failed_docs:
                        reason = failed_docs[_id]['reason']
                        print 'Doc:', _id
                        print 'Reason', reason

                c = raw_input('Continue?')
                if c.lower() in ['n', 'no']:
                    break
def get_config(self):
    file_utils.make_directory(DATA_LOADING_DIRECTORY)
    config = file_utils.load_file(DATA_LOADING_DIRECTORY, self.config_file)
    if len(config) == 0:
        config = self.create_config()

    self.root_directory = config['root_directory']
    self.index_id = config['index_id']
    self.server = config['server']
    self.server_username = config['server_username']
    self.server_password = config['server_password']
    self.index = config['index']
    self.type = config['type']
    self.src_data_exists = config['src_data_exists']

    if 'src_data_directory' in config:
        self.src_data_directory = config['src_data_directory']
    if 'local_date_time' in config:
        self.local_date_time = config['local_date_time']

    return config
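# For illustration only: a config dict shaped the way get_config above reads
# it. All values here are invented placeholders; only the key names are taken
# from the function. 'src_data_directory' and 'local_date_time' are optional.
SAMPLE_CONFIG = {
    'root_directory': '/data/data_loading',       # placeholder path
    'index_id': 'PUBMED',                         # placeholder index id
    'server': 'http://localhost:9200',            # placeholder server URL
    'server_username': '',
    'server_password': '',
    'index': 'pubmed2018',
    'type': 'article',
    'src_data_exists': False,
    # 'src_data_directory': '/data/pubmed_src',
    # 'local_date_time': '2018-01-01 00:00:00'
}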