def convertAllFilesAndAddToDB(self, ALL_INPUT_FILES, inputdir, import_options):
    """
        Loads each XML file, saves it as a SciDoc JSON file and adds its
        metadata to the database.
    """
    progress = ProgressIndicator(True, self.num_files_to_process, dot_every_xitems=20)
    tasks = []

    for fn in ALL_INPUT_FILES[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
        corpus_id = self.generate_corpus_id(fn)
        match = cp.Corpus.getMetadataByField("metadata.filename", os.path.basename(fn))

        # import the file if it is new, or if reloading existing documents was requested
        if not match or import_options.get("reload_xml_if_doc_in_collection", False):
            if self.use_celery:
                # distributed path: queue one conversion task per file
                match_id = match["guid"] if match else None
                tasks.append(importXMLTask.apply_async(
                    args=[
                        os.path.join(inputdir, fn),
                        corpus_id,
                        self.import_id,
                        self.collection_id,
                        import_options,
                        match_id
                    ],
                    queue="import_xml"))
            else:
                # synchronous path: convert in the main loop. Note that this
                # path only imports files that are not yet in the collection.
                if not match:
                    try:
                        doc = convertXMLAndAddToCorpus(
                            os.path.join(inputdir, fn),
                            corpus_id,
                            self.import_id,
                            self.collection_id,
                            import_options)
                    except ValueError:
                        logging.exception("ERROR: Couldn't convert %s" % fn)
                        continue

                    progress.showProgressReport("Importing -- latest file %s" % fn)
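# The Celery path above only accumulates AsyncResult handles in `tasks`; it
# never waits on them. Below is a minimal sketch of blocking until every
# queued import has finished, using only the standard celery AsyncResult API
# (ready()); the helper name and polling interval are assumptions for
# illustration, not part of the pipeline.
def waitForImportTasks(tasks, poll_seconds=5.0):
    """
        Polls a list of celery AsyncResult objects until every queued
        importXMLTask has completed (illustrative sketch).
    """
    import time
    pending = list(tasks)
    while pending:
        pending = [t for t in pending if not t.ready()]
        time.sleep(poll_seconds)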
def reloadSciDocsOnly(self, conditions, inputdir, file_mask):
    """
        Iterates through the papers already in the collection that match the
        given condition and tries to load each one's SciDoc. If a KeyError
        occurs, the paper is broken and its XML is loaded and converted again.
    """
    # previous SQL-based approach:
    # filenames = cp.Corpus.SQLQuery("SELECT guid,metadata.filename FROM papers where %s limit 10000" % conditions)
    in_collection = [item["_source"] for item in cp.Corpus.unlimitedQuery(
        index="papers",
        doc_type="paper",
        _source=["metadata.corpus_id", "metadata.filename", "guid"],
        q=conditions)]

    print("Fixing broken SciDocs")
    print("Listing all loaded papers...")
    ALL_INPUT_FILES = self.loadListOrListAllFiles(inputdir, file_mask)

    # map each corpus_id back to the input file it was imported from
    files_to_process = []
    files_hash = {}
    for input_file in ALL_INPUT_FILES:
        corpus_id = self.generate_corpus_id(input_file)
        files_hash[corpus_id] = input_file

    print("Iterating over all papers trying to load them...")
    tasks = []
    import_options = {"reload_xml_if_doc_in_collection": True}

    progress = ProgressIndicator(True, len(in_collection))
    for item in in_collection:
        corpus_id = self.generate_corpus_id(item["metadata"]["filename"])
        assert corpus_id == item["metadata"]["corpus_id"]

        try:
            doc = cp.Corpus.loadSciDoc(item["guid"])
        except KeyError:
            print("File %s is broken" % item["guid"])
            if self.use_celery:
                tasks.append(importXMLTask.apply_async(
                    args=[
                        os.path.join(cp.Corpus.paths.inputXML, files_hash[corpus_id]),
                        corpus_id,
                        self.import_id,
                        self.collection_id,
                        import_options
                    ],
                    kwargs={"existing_guid": item["guid"]},
                    queue="import_xml"))
            else:
                files_to_process.append([files_hash[corpus_id], item["guid"]])

        progress.showProgressReport("Checking papers")

    if self.use_celery:
        return

    print("Processing all %s broken files..." % len(files_to_process))
    progress = ProgressIndicator(True, len(files_to_process))
    for fn, existing_guid in files_to_process:
        corpus_id = self.generate_corpus_id(fn)
        try:
            doc = convertXMLAndAddToCorpus(
                os.path.join(cp.Corpus.paths.inputXML, fn),
                corpus_id,
                self.import_id,
                self.collection_id,
                import_options,
                existing_guid=existing_guid)
        except ValueError:
            logging.exception("ERROR: Couldn't convert %s" % fn)
            continue

        progress.showProgressReport("Importing -- latest file %s" % fn)
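# Usage sketch (illustrative only): `conditions` is passed straight through as
# the `q=` query string to cp.Corpus.unlimitedQuery(), so it should be an
# Elasticsearch query-string expression. The wrapper below is an assumption
# for illustration, not part of the pipeline: the `importer` argument stands
# in for an instance of the enclosing importer class (not shown here), and the
# "metadata.collection_id" field name is a guess modelled on the metadata
# fields queried above.
def fixBrokenSciDocsForCollection(importer, collection_id):
    """
        Runs reloadSciDocsOnly() over the standard input directory for a
        single collection (sketch).
    """
    conditions = 'metadata.collection_id:"%s"' % collection_id
    importer.reloadSciDocsOnly(conditions, cp.Corpus.paths.inputXML, "*.xml")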