예제 #1
0
    def buildIndexes(self, testfiles, methods):
        """
            Build the per-test-file retrieval indexes.

            For every GUID in [testfiles]:
                load its SciDoc (the file is skipped, with a message, if that fails),
                open one index writer per index name derived from [methods],
                add the BOWs of every document listed in the test file's
                precomputed metadata "outlinks" to those indexes,
                close every writer that was opened, even if an error occurred.

            :param testfiles: sequence of test file GUIDs (len() is used for the progress message)
            :param methods: passed to getDictOfLuceneIndeces() to obtain the indexes to build
        """
        self.initializeIndexer()

        for count, guid in enumerate(testfiles, 1):
            print("Building index: paper ",count,"/",len(testfiles),":",guid)

            doc=cp.Corpus.loadSciDoc(guid)
            if not doc:
                print("Error loading SciDoc for", guid)
                continue

            indexNames=getDictOfLuceneIndeces(methods)

            fwriters={}
            try:
                for indexName in indexNames:
                    actual_dir=cp.Corpus.getRetrievalIndexPath(guid, indexName, full_corpus=False)
                    fwriters[indexName]=self.createIndexWriter(actual_dir)

                # The in-collection references are taken from the precomputed
                # metadata "outlinks" rather than by re-matching doc["references"],
                # so they agree with the method that selects resolvable citations.
                outlinks=cp.Corpus.getMetadataByGUID(guid)["outlinks"]
                for ref_guid in outlinks:
                    # NOTE(review): 9999 sits in the argument position where
                    # buildGeneralIndex passes index_max_year; presumably it
                    # means "no effective year limit" -- confirm.
                    addBOWsToIndex(ref_guid,indexNames,9999,fwriters)
                    # TODO integrate per-index max_year handling into addBOWsToIndex:
                    # when indexNames[indexName]["options"]["max_year"] is True, use
                    # the test file's metadata "year" as the limit, otherwise None.
            finally:
                # Always release the writers opened for this test file, so a
                # failure while indexing does not leave them open.
                for fwriter in fwriters.values():
                    fwriter.close()
예제 #2
0
    def buildGeneralIndex(self, exp, options):
        """
            Creates one index for each method and parameter, adding all files to each.

            Reads from [exp]:
                "index_max_year": upper bound on metadata.year of the papers to add (required)
                "prebuild_general_indexes": passed to getDictOfLuceneIndeces() to get the indexes
                "prebuild_bows": maps each entry's "bow_name" to a dict holding its "function_name"
            Reads from [options]:
                "max_files_to_process": optional cap on the number of papers listed

            If self.use_celery is false the papers are added in-process and the
            index writers are closed afterwards; otherwise one addToindexTask per
            paper is queued on the "add_to_index" queue.

            :raises TypeError: if exp has no "index_max_year" (or it is None)
        """
        print ("Building global index...")

        index_max_year=exp.get("index_max_year",None)
        if index_max_year is None:
            # Fail fast with a clear message: the paper query below is built with
            # "%d" % index_max_year, which would raise an opaque TypeError on None.
            raise TypeError("exp['index_max_year'] must be an integer year, got None")

        indexNames=getDictOfLuceneIndeces(exp["prebuild_general_indexes"])
        for entry in indexNames.values():
            entry["function_name"]=exp["prebuild_bows"][entry["bow_name"]]["function_name"]

        # sys.maxint only exists on Python 2; fall back to sys.maxsize elsewhere
        no_limit=getattr(sys, "maxint", sys.maxsize)
        max_results=options.get("max_files_to_process",no_limit)

        ALL_GUIDS=cp.Corpus.listPapers("metadata.year:<=%d" % index_max_year,  max_results=max_results)

        fwriters={}
        for indexName in indexNames:
            actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_GUIDS", indexName, full_corpus=True)
            fields=self.listFieldsToIndex(indexNames[indexName])
            self.createIndex(actual_dir,fields)
            fwriters[indexName]=self.createIndexWriter(actual_dir)

        print("Adding",len(ALL_GUIDS),"files:")

        if not self.use_celery:
            progress=ProgressIndicator(True, len(ALL_GUIDS), print_out=False)
            try:
                for guid in ALL_GUIDS:
                    addBOWsToIndex(guid, indexNames, index_max_year, fwriters)
                    progress.showProgressReport("Adding papers to index")
            finally:
                # Close the writers even if adding a paper fails part-way through
                for fwriter in fwriters.values():
                    fwriter.close()
        else:
            # NOTE(review): the writers opened above are neither passed to the
            # tasks nor closed on this path -- confirm whether that is intended.
            print("Queueing up files for import...")
            for guid in ALL_GUIDS:
                addToindexTask.apply_async(args=[
                                                guid,
                                                indexNames,
                                                index_max_year,
                                                ],
                                            queue="add_to_index")