예제 #1
0
    def importCorpus(self, root_input_dir, file_mask="*.xml", import_options=None, maxfiles=10000000000):
        """
            Does all that is necessary for the initial import of the corpus

            Creates and connects to the database, lists the input files and
            then runs up to two stages, each one enabled by a key in
            import_options. Progress and total elapsed time are printed.

            Args:
                root_input_dir: root directory holding the input files
                file_mask: file mask forwarded to loadListOrListAllFiles
                import_options: dict of option flags, also passed on to the
                    stage methods. Keys read here (both default to True when
                    absent):
                        "convert_and_import_docs": convert the input files and
                            add them to the DB
                        "update_doc_references": update in-collection
                            references for the papers of this collection
                    None (the default) means no options are set.
                maxfiles: currently not used by this method
        """
        # Build a fresh dict per call: a shared `{}` default would be the same
        # object on every call and is handed to other methods below, so any
        # mutation downstream would leak into later imports.
        if import_options is None:
            import_options={}

        inputdir=ensureTrailingBackslash(root_input_dir)

        print("Starting ingestion of corpus...")
        print("Creating database...")

        cp.Corpus.createAndInitializeDatabase()
        cp.Corpus.connectToDB()

        # Share this importer's settings with the global corpus object
        cp.Corpus.metadata_index=self.metadata_index
        cp.Corpus.FILES_TO_IGNORE=self.files_to_ignore

        self.start_time=datetime.datetime.now()
        ALL_INPUT_FILES=self.loadListOrListAllFiles(inputdir,file_mask)

        # FILES_TO_PROCESS_FROM / FILES_TO_PROCESS_TO are defined outside this
        # method; the count is capped by the size of that window
        self.num_files_to_process=min(len(ALL_INPUT_FILES),FILES_TO_PROCESS_TO-FILES_TO_PROCESS_FROM)
        if import_options.get("convert_and_import_docs",True):
            print("Converting input files to SciDoc format and loading metadata...")
            self.convertAllFilesAndAddToDB(ALL_INPUT_FILES, inputdir, import_options)

        if import_options.get("update_doc_references",True):
            print("Updating in-collection links...")

            # Only the papers belonging to this importer's collection are updated
            ALL_GUIDS=cp.Corpus.listPapers("metadata.collection_id:\"%s\"" % self.collection_id)
            self.updateInCollectionReferences(ALL_GUIDS, import_options)

        self.end_time=datetime.datetime.now()
        print("All done. Processed %d files. Took %s" % (self.num_files_to_process, str(self.end_time-self.start_time)))
예제 #2
0
def saveGraphForResults(filename,metric):
    """
        Draws the similarities graph for one results file and saves it as a
        .png image

        The image is written to the directory given by getFileDir(filename),
        under the name given by getFileName(filename) plus ".png".

        Args:
            filename: path of the results file to plot
            metric: metric name forwarded to drawSimilaritiesGraph
    """
    source_dir=getFileDir(filename)
    output_dir=ensureTrailingBackslash(source_dir)

    # Third argument is passed as True, as before; its meaning is defined by
    # drawSimilaritiesGraph
    drawSimilaritiesGraph(filename,metric,True)

    base_name=getFileName(filename)
    image_path=output_dir+base_name+'.png'
    plt.savefig(image_path, bbox_inches='tight')
    # Close the figure once it has been written
    plt.close()
예제 #3
0
def makeAllGraphsForExperiment(exp_dir, metric="avg_precision"):
    """
        Iterates through all weight*.csv files in the experiment's directory and
        saves a graph for each

        Args:
            exp_dir: directory containing the experiment's weight*.csv files
            metric: name of the metric to plot, passed through unchanged to
                saveGraphForResults. Defaults to "avg_precision", the value
                that used to be hard-coded here; "avg_mrr" and
                "precision_total" were the alternatives previously selected
                by editing this function.
    """
    exp_dir=ensureTrailingBackslash(exp_dir)
    for path in glob.glob(exp_dir+"weight*.csv"):
        saveGraphForResults(path,metric)
예제 #4
0
    def connectCorpus(self, base_directory, initializing_corpus=False,suppress_error=False):
        """
            Points this corpus at base_directory and connects to its DB.

            The DB is created and initialized first only when
            initializing_corpus is true; otherwise this just connects.

            Args:
                base_directory: root dir of this corpus
                initializing_corpus: if True, call createAndInitializeDatabase
                    before connecting
                suppress_error: passed on to connectToDB; if true, db doesn't
                    complain if it's connected already
        """
        corpus_root=ensureTrailingBackslash(base_directory)
        self.setPaths(corpus_root)

        # Creation is opt-in and must happen before the connection is opened
        if initializing_corpus:
            self.createAndInitializeDatabase()

        self.connectToDB(suppress_error)