Example #1
def loadAZLabels(annot_dir=""):
    """
        Loads generated AZ labels from AZPrime output
    """
    if annot_dir=="":
        annot_dir=cp.Corpus.paths.output

    papers=cp.Corpus.listPapers()

    print("Loading AZPrime labels...")
    progress=ProgressIndicator(True, len(papers),False)

    for guid in papers:
        filename=os.path.join(annot_dir, guid+".pred.txt")
        if os.path.exists(filename):

            doc=cp.Corpus.loadSciDoc(guid)
            with open(filename, "r") as f:
                lines=f.readlines()
            allsentences=[s for s in doc.allsentences if s.get("type","") == "s"]

            if len(lines) != len(allsentences):
                print("Number of tags mismatch! %d != %d -- %s" % (len(lines), len(allsentences), guid))
                lines=["" for n in range(len(allsentences))]
##            else:
##                print("No mismatch! %d != %d -- %s" % (len(lines), len(doc.allsentences), guid))

            for index,sent in enumerate(allsentences):
                sent["az"]=lines[index].strip()
            cp.Corpus.saveSciDoc(doc)
        else:
            print("Cannot find annotation file for guid %s" % guid)

        progress.showProgressReport("Loading labels -- %s" % guid)
Example #2
def ownAZannot(export_annots=False):
    """
        Annotates each sentence using our own trained AZ classifier
    """
    from minerva.az.az_cfc_classification import AZannotator

    annot=AZannotator("trained_az_classifier.pickle")

    papers=cp.Corpus.listPapers(max_results=sys.maxint)

    writer=AZPrimeWriter()
    writer.save_pos_tags=True
##    papers=papers[:1]
    progress=ProgressIndicator(True, len(papers),False)

    print("Producing annotations for SciDocs...")
    for guid in papers:
        doc=cp.Corpus.loadSciDoc(guid)
        annot.annotateDoc(doc)
        if export_annots:
            output_filename=os.path.join(cp.Corpus.paths.output, doc.metadata["guid"]+".annot.txt")
            output_file=open(output_filename,"w")
            for sentence in doc.allsentences:
                output_file.write(sentence.get("az","")+"\n")
            output_file.close()
        else:
            cp.Corpus.saveSciDoc(doc)

        progress.showProgressReport("Annotating -- %s" % guid)
Example #3
    def prebuildBOWsForTests(self, exp, options):
        """
            Generates BOWs for each document from its inlinks, stores them in a
            corpus cached file

            :param parameters: list of parameters
            :param maxfiles: max. number of files to process. Simple parameter for debug
            :param overwrite_existing_bows: should BOWs be rebuilt even if existing?

        """
        self.exp=exp
        self.options=options

        maxfiles=options.get("max_files_to_process",sys.maxint)

        if len(self.exp.get("rhetorical_annotations",[])) > 0:
            print("Loading AZ/CFC classifiers")
            cp.Corpus.loadAnnotators()

        print("Prebuilding BOWs for", min(len(cp.Corpus.ALL_FILES),maxfiles), "files...")
        numfiles=min(len(cp.Corpus.ALL_FILES),maxfiles)

        if self.use_celery:
            print("Queueing tasks...")
            tasks=[]
            for guid in cp.Corpus.ALL_FILES[:maxfiles]:
                for method_name in self.exp["prebuild_bows"]:
                    run_annotators=self.exp.get("rhetorical_annotations",[]) if self.exp.get("run_rhetorical_annotators",False) else []
                    tasks.append(prebuildBOWTask.apply_async(args=[
                        method_name,
                        self.exp["prebuild_bows"][method_name]["parameters"],
                        self.exp["prebuild_bows"][method_name]["function_name"],
                        guid,
                        self.options["overwrite_existing_bows"],
                        run_annotators],
                        queue="prebuild_bows"))

        else:
            progress=ProgressIndicator(True, numfiles, False)
            for guid in cp.Corpus.ALL_FILES[:maxfiles]:
                for method_name in self.exp["prebuild_bows"]:
                    run_annotators=self.exp.get("rhetorical_annotations",[]) if self.exp.get("run_rhetorical_annotators",False) else []
                    prebuildMulti(
                                  method_name,
                                  self.exp["prebuild_bows"][method_name]["parameters"],
                                  self.exp["prebuild_bows"][method_name]["function"],
                                  None,
                                  None,
                                  guid,
                                  self.options["overwrite_existing_bows"],
                                  run_annotators
                                  )
                progress.showProgressReport("Building BOWs")
Example #4
def aggregate_statistics(conditions=None, max_files=sys.maxint):
    """
        Aggregates all counts from all documents in the collection
    """
    res = {
        "csc_type_counts": {},
        "az_counts": {},
        "num_sentences": [],
        "num_sections": [],
        "num_paragraphs": [],
        "per_zone_citations": {},
        "num_files": 0,
    }

    print("Listing files...")

    papers = cp.Corpus.listRecords(conditions, max_results=max_files, table="papers", field="_id")
    print("Aggregating statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)

    num_files = 0
    for guid in papers:
        ##        try:
        ##            stats=cp.Corpus.getStatistics(guid)
        ##        except:
        computeAnnotationStatistics(guid)
        try:
            stats = cp.Corpus.getStatistics(guid)
        except Exception:  # skip papers whose statistics could not be retrieved
            continue

        for key in ["csc_type_counts", "az_counts", "per_zone_citations"]:
            for key2 in stats[key]:
                res[key][key2] = res[key].get(key2, 0) + stats[key][key2]

        for key in ["num_sentences", "num_sections", "num_paragraphs"]:
            res[key].append(stats[key])

        num_files += 1

        progress.showProgressReport("Aggregating statistics -- latest paper " + guid)

    if num_files == 0:
        print("No files found in db!")
        return

    for key in ["num_sentences", "num_sections", "num_paragraphs"]:
        res[key.replace("num", "avg")] = sum(res[key]) / float(num_files)

    res["num_files"] = num_files
    with open(os.path.join(cp.Corpus.paths.output, "stats.json"), "w") as f:
        json.dump(res, f)
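A hedged usage sketch; the conditions string follows the Elasticsearch-style queries used elsewhere in these examples, and the aggregated result is written to stats.json in the corpus output directory by the function above:

aggregate_statistics(conditions='metadata.collection_id:"AAC"', max_files=5000)  # illustrative query and cap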
Example #5
def add_statistics_to_all_files(use_celery=False, conditions=None, max_files=sys.maxint):
    """
        For each paper in the corpus, it computes and stores its statistics
    """
    print("Listing files...")
    papers = cp.Corpus.listPapers(conditions, max_results=max_files)
    ##    papers=cp.Corpus.listRecords(conditions, max_results=max_files, field="_id", table="papers")
    print("Computing statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)
    for guid in papers:
        if use_celery:
            computeAnnotationStatisticsTask.apply_async(args=[guid], kwargs={}, queue="compute_statistics")
        else:
            computeAnnotationStatistics(guid)
            progress.showProgressReport("Computing statistics -- latest paper " + guid)
Example #6
def fix_citation_parent_aac():
    """
    """
    from minerva.proc.results_logging import ProgressIndicator
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")
    guids=cp.Corpus.listPapers("metadata.collection_id:\"AAC\"")
    progress=ProgressIndicator(True, len(guids), True)
    for guid in guids:
        doc=cp.Corpus.loadSciDoc(guid)
        for cit in doc.citations:
            if "parent" in cit:
                cit["parent_s"]=cit.pop("parent")
        cp.Corpus.saveSciDoc(doc)
        progress.showProgressReport("Fixing badly imported PaperXML")
Example #7
def fix_authors_full_corpus():
    """
        Fixes authors in each metadata entry having a "papers" key which they
        shouldn't
    """
    from minerva.proc.results_logging import ProgressIndicator
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc")
    guids=cp.Corpus.listPapers()
    progress=ProgressIndicator(True, len(guids), True)
    for guid in guids:
        doc_meta=cp.Corpus.getMetadataByGUID(guid)
        for old_author in doc_meta.authors:
            old_author.pop("papers", None)
        cp.Corpus.updatePaper(doc_meta)
        progress.showProgressReport("Removing redundant author information")
Example #8
    def convertAllFilesAndAddToDB(self, ALL_INPUT_FILES, inputdir, import_options):
        """
            Loads each XML file, saves it as a SciDoc JSON file, adds its metadata to
            the database
        """
        progress=ProgressIndicator(True, self.num_files_to_process, dot_every_xitems=20)
        tasks=[]

        for fn in ALL_INPUT_FILES[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
            corpus_id=self.generate_corpus_id(fn)
            match=cp.Corpus.getMetadataByField("metadata.filename",os.path.basename(fn))
            if not match or import_options.get("reload_xml_if_doc_in_collection",False):
                if self.use_celery:
                    match_id=match["guid"] if match else None
                    tasks.append(importXMLTask.apply_async(
                        args=[
                            os.path.join(inputdir,fn),
                            corpus_id,
                            self.import_id,
                            self.collection_id,
                            import_options,
                            match_id
                            ],
                        queue="import_xml"
                        ))
                else:
                    # main loop over all files
                    filename=cp.Corpus.paths.inputXML+fn
                    corpus_id=self.generate_corpus_id(fn)

                    match=cp.Corpus.getMetadataByField("metadata.filename",os.path.basename(fn))
                    if not match:
                        try:
                            doc=convertXMLAndAddToCorpus(
                                os.path.join(inputdir,fn),
                                corpus_id,
                                self.import_id,
                                self.collection_id,
                                import_options
                                )
                        except ValueError:
                            logging.exception("ERROR: Couldn't convert %s" % fn)
                            continue

                        progress.showProgressReport("Importing -- latest file %s" % fn)
Example #9
def exportSciXML():
    """
        Exports all scidocs with the selected collection_id to AZPrime XML in the output dir of the corpus
    """
    papers=cp.Corpus.listPapers(max_results=sys.maxint)

    writer=AZPrimeWriter()
    writer.save_pos_tags=True
##    papers=papers[3894:]
    progress=ProgressIndicator(True, len(papers),False)
    print("Exporting SciXML files")
    for guid in papers:
        doc=cp.Corpus.loadSciDoc(guid)
        if len(doc.allsentences) < 1:
            continue
        writer.write(doc, os.path.join(cp.Corpus.paths.output, doc.metadata["guid"]+".pos.xml"))
        cp.Corpus.saveSciDoc(doc)
        progress.showProgressReport("Exporting -- %s" % guid)
Example #10
    def listAllFiles(self, start_dir, file_mask):
        """
            Creates an ALL_FILES list with relative paths from the start_dir
        """
        ALL_FILES=[]

        from minerva.proc.results_logging import ProgressIndicator
        progress=ProgressIndicator(True, 25000, False)

        for dirpath, dirnames, filenames in os.walk(start_dir):
            for filename in filenames:
                if fnmatch.fnmatch(filename,file_mask) and filename not in cp.Corpus.FILES_TO_IGNORE:
                    fn=os.path.join(dirpath,filename)
                    fn=fn.replace(start_dir,"")
                    ALL_FILES.append(fn)
                    progress.showProgressReport("listing")

        print("Total files:",len(ALL_FILES))
        return ALL_FILES
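A hedged usage sketch; the file mask is an assumption and `importer` again stands for the instance this method belongs to:

# Collect relative paths of all XML files under the corpus input directory;
# anything listed in cp.Corpus.FILES_TO_IGNORE is skipped by the method above.
xml_files = importer.listAllFiles(cp.Corpus.paths.inputXML, "*.xml")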
Example #11
    def buildGeneralIndex(self, exp, options):
        """
            Creates one index for each method and parameter, adding all files to each
        """
        print ("Building global index...")
        fwriters={}

        index_max_year=exp.get("index_max_year",None)

        indexNames=getDictOfLuceneIndeces(exp["prebuild_general_indexes"])
        for entry_name in indexNames:
            entry=indexNames[entry_name]
            entry["function_name"]=exp["prebuild_bows"][entry["bow_name"]]["function_name"]

        max_results=options.get("max_files_to_process",sys.maxint)

        ALL_GUIDS=cp.Corpus.listPapers("metadata.year:<=%d" % index_max_year,  max_results=max_results)
        for indexName in indexNames:
            actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_GUIDS", indexName, full_corpus=True)
            fields=self.listFieldsToIndex(indexNames[indexName])
            self.createIndex(actual_dir,fields)
            fwriters[indexName]=self.createIndexWriter(actual_dir)

        print("Adding",len(ALL_GUIDS),"files:")

        if not self.use_celery:
##            widgets = ['Adding file: ', SimpleProgress(), ' ', Bar(), ' ', ETA()]
##            progress = ProgressBar(widgets=widgets, maxval=100).start()
            progress=ProgressIndicator(True, len(ALL_GUIDS), print_out=False)
            for guid in ALL_GUIDS:
                addBOWsToIndex(guid, indexNames, index_max_year, fwriters)
                progress.showProgressReport("Adding papers to index")
            for fwriter in fwriters:
                fwriters[fwriter].close()
        else:
            print("Queueing up files for import...")
            for guid in ALL_GUIDS:
                addToindexTask.apply_async(args=[
                                                guid,
                                                indexNames,
                                                index_max_year,
                                                ],
                                            queue="add_to_index")
Example #12
    def precomputeQueries(self,exp):
        """
            Precompute all queries for all annotated citation contexts

            :param exp: experiment dict with all options
            :type exp: dict
        """
        self.exp=exp
        print("Precomputing queries...")
        logger=ProgressIndicator(True, numitems=len(exp["test_files"])) # init all the logging/counting
        logger.numchunks=exp.get("numchunks",10)

        cp.Corpus.loadAnnotators()

        # convert nested dict to flat dict where each method includes its parameters in the name
        self.all_doc_methods=getDictOfTestingMethods(exp["doc_methods"])

        self.precomputed_queries=[]
        self.files_dict=OrderedDict()

##        if exp["full_corpus"]:
##            files_dict["ALL_FILES"]={}
##            files_dict["ALL_FILES"]["doc_methods"]=all_doc_methods
##            files_dict["ALL_FILES"]["tfidf_models"]=[]
##            for method in all_doc_methods:
##                actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_FILES",all_doc_methods[method]["index_filename"],exp["full_corpus"])
##                files_dict["ALL_FILES"]["tfidf_models"].append({"method":method,"actual_dir":actual_dir})

        #===================================
        # MAIN LOOP over all testing files
        #===================================
        for guid in exp["test_files"]:
            try:
                self.processOneFile(guid)
            except ValueError:
                print("Can't load SciDoc ",guid)
                continue

            logger.showProgressReport(guid) # prints out info on how it's going

        self.saveAllQueries()
        print("Precomputed queries saved.")
Example #13
    def updateInCollectionReferences(self, ALL_GUIDS, import_options={}):
        """
            For every guid, it matches its in-collection references, and its
            resolvable citations

            Args:
                ALL_GUIDS: list of guids
        """
        print("Finding resolvable references, populating database...")
        progress=ProgressIndicator(True, len(ALL_GUIDS), dot_every_xitems=100)

        tasks=[]

        for doc_id in ALL_GUIDS[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
            if self.use_celery:
                tasks.append(updateReferencesTask.apply_async(
                    args=[doc_id, import_options],
                    kwargs={},
                    queue="update_references"
                    ))
            else:
                doc_meta=updatePaperInCollectionReferences(doc_id, import_options)
                filename=doc_meta["filename"] if doc_meta else "<ERROR>"
                progress.showProgressReport("Updating references -- latest paper "+filename)
Example #14
    def reloadSciDocsOnly(self, conditions, inputdir, file_mask):
        """
            Iterates through the papers already in the collection given the
            condition. Tries to load their scidoc. If KeyError occurs, it loads
            the XML again
        """
##        filenames=cp.Corpus.SQLQuery("SELECT guid,metadata.filename FROM papers where %s limit 10000" % conditions)
        in_collection=[item["_source"] for item in cp.Corpus.unlimitedQuery(
            index="papers",
            doc_type="paper",
            _source=["metadata.corpus_id","metadata.filename","guid"],
            q=conditions
            )]

        print("Fixing broken SciDocs")
        print("Listing all loaded papers...")
        ALL_INPUT_FILES=self.loadListOrListAllFiles(inputdir,file_mask)
        files_to_process=[]
        files_hash={}
        for input_file in ALL_INPUT_FILES:
            corpus_id=self.generate_corpus_id(input_file)
            files_hash[corpus_id]=input_file

        print("Iterating over all papers trying to load them...")
        tasks=[]
        import_options={"reload_xml_if_doc_in_collection": True,}
        progress=ProgressIndicator(True,len(in_collection))
        for item in in_collection:
            corpus_id=self.generate_corpus_id(item["metadata"]["filename"])
            assert corpus_id==item["metadata"]["corpus_id"]
            try:
                doc=cp.Corpus.loadSciDoc(item["guid"])
            except KeyError:
                print("File %s is broken" % item["guid"])
                if self.use_celery:
                    tasks.append(importXMLTask.apply_async(args=[
                            os.path.join(cp.Corpus.paths.inputXML,files_hash[corpus_id]),
                            corpus_id,
                            self.import_id,
                            self.collection_id,
                            import_options
                            ],
                            kwargs={"existing_guid":item["guid"]},
                            queue="import_xml"
                            ))
                else:
                    files_to_process.append([files_hash[corpus_id],item["guid"]])

            progress.showProgressReport("Checking papers")

        if self.use_celery:
            return

        print("Processing all %s broken files..." % len(files_to_process))
        progress=ProgressIndicator(True,len(files_to_process))

        for fn in files_to_process:
            corpus_id=self.generate_corpus_id(fn[0])
            try:
                doc=convertXMLAndAddToCorpus(
                    os.path.join(cp.Corpus.paths.inputXML,fn[0]),
                    corpus_id,
                    self.import_id,
                    self.collection_id,
                    import_options,
                    existing_guid=fn[1],
                    )
            except ValueError:
                logging.exception("ERROR: Couldn't convert %s" % fn)
                continue

            progress.showProgressReport("Importing -- latest file %s" % fn)