def importCorpus(self, root_input_dir, file_mask="*.xml", import_options=None, maxfiles=10000000000):
    """
        Does all that is necessary for the initial import of the corpus

        Creates and connects to the corpus database, optionally converts the
        input files and adds them to the database, optionally updates the
        in-collection references, and records start/end times on self.

        Args:
            root_input_dir: root directory holding the input files
            file_mask: mask passed to self.loadListOrListAllFiles to select files
            import_options: optional dict of options. Keys read here:
                "convert_and_import_docs" (default True) and
                "update_doc_references" (default True). The dict is also
                handed on to the conversion and reference-updating helpers.
                None (the default) means "no options".
            maxfiles: not used by this method; kept so existing callers
                keep working
    """
    # A fresh dict per call: the previous `import_options={}` default was a
    # single object shared by every call and is passed on to helpers.
    if import_options is None:
        import_options = {}

    inputdir = ensureTrailingBackslash(root_input_dir)

    print("Starting ingestion of corpus...")
    print("Creating database...")
    cp.Corpus.createAndInitializeDatabase()
    cp.Corpus.connectToDB()
    cp.Corpus.metadata_index = self.metadata_index
    cp.Corpus.FILES_TO_IGNORE = self.files_to_ignore

    self.start_time = datetime.datetime.now()
    ALL_INPUT_FILES = self.loadListOrListAllFiles(inputdir, file_mask)

    # NOTE: the cap comes from the module-level FILES_TO_PROCESS_FROM/TO
    # constants, not from `maxfiles`.
    self.num_files_to_process = min(
        len(ALL_INPUT_FILES),
        FILES_TO_PROCESS_TO - FILES_TO_PROCESS_FROM,
    )

    if import_options.get("convert_and_import_docs", True):
        print("Converting input files to SciDoc format and loading metadata...")
        self.convertAllFilesAndAddToDB(ALL_INPUT_FILES, inputdir, import_options)

    if import_options.get("update_doc_references", True):
        print("Updating in-collection links...")
        # Only papers belonging to this collection are considered.
        ALL_GUIDS = cp.Corpus.listPapers("metadata.collection_id:\"%s\"" % self.collection_id)
        self.updateInCollectionReferences(ALL_GUIDS, import_options)

    self.end_time = datetime.datetime.now()
    print("All done. Processed %d files. Took %s" % (self.num_files_to_process, str(self.end_time-self.start_time)))
def saveGraphForResults(filename,metric):
    """
        Draws the similarities graph for one results file and saves it as a
        .png image.

        The image path is built from getFileDir(filename) and
        getFileName(filename) with '.png' appended.

        Args:
            filename: path of the results file to plot
            metric: metric name, passed through to drawSimilaritiesGraph
    """
    out_dir = ensureTrailingBackslash(getFileDir(filename))
    # Third argument is passed as True exactly as before; its meaning is
    # defined by drawSimilaritiesGraph (not visible here).
    drawSimilaritiesGraph(filename, metric, True)
    image_path = out_dir + getFileName(filename) + '.png'
    plt.savefig(image_path, bbox_inches='tight')
    plt.close()
def makeAllGraphsForExperiment(exp_dir, metric="avg_precision"):
    """
        Iterates through all weight*.csv files in the experiment's directory and
        saves a graph for each

        Args:
            exp_dir: directory of the experiment; a trailing separator is
                added via ensureTrailingBackslash before globbing
            metric: name of the metric to plot, passed to
                saveGraphForResults. Defaults to "avg_precision", the value
                that used to be hard-coded. "avg_mrr" and "precision_total"
                were the commented-out alternatives — presumably also
                valid, confirm against drawSimilaritiesGraph.
    """
    exp_dir = ensureTrailingBackslash(exp_dir)
    for path in glob.glob(exp_dir + "weight*.csv"):
        saveGraphForResults(path, metric)
def connectCorpus(self, base_directory, initializing_corpus=False,suppress_error=False):
    """
        Points this corpus at its base directory and connects to its DB,
        creating and initializing the DB first when asked to.

        Args:
            base_directory: root dir of this corpus
            initializing_corpus: if True, create and initialize the DB
                before connecting
            suppress_error: forwarded to connectToDB; if true, db doesn't
                complain if it's connected already
    """
    corpus_root = ensureTrailingBackslash(base_directory)
    self.setPaths(corpus_root)

    # Creation is opt-in; connecting happens in either case.
    if initializing_corpus:
        self.createAndInitializeDatabase()

    self.connectToDB(suppress_error)