예제 #1
0
def aggregate_statistics(conditions=None, max_files=sys.maxint):
    """
        Aggretates all counts from all documents in the collection
    """
    res = {
        "csc_type_counts": {},
        "az_counts": {},
        "num_sentences": [],
        "num_sections": [],
        "num_paragraphs": [],
        "per_zone_citations": {},
        "num_files": 0,
    }

    print("Listing files...")

    papers = cp.Corpus.listRecords(conditions, max_results=max_files, table="papers", field="_id")
    print("Aggregating statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)

    num_files = 0
    for guid in papers:
        ##        try:
        ##            stats=cp.Corpus.getStatistics(guid)
        ##        except:
        computeAnnotationStatistics(guid)
        try:
            stats = cp.Corpus.getStatistics(guid)
        except:
            continue

        for key in ["csc_type_counts", "az_counts", "per_zone_citations"]:
            for key2 in stats[key]:
                res[key][key2] = res[key].get(key2, 0) + stats[key][key2]

        for key in ["num_sentences", "num_sections", "num_paragraphs"]:
            res[key].append(stats[key])

        num_files += 1

        progress.showProgressReport("Aggregating statistics -- latest paper " + guid)

    if num_files == 0:
        print("No files found in db!")
        return

    for key in ["num_sentences", "num_sections", "num_paragraphs"]:
        res[key.replace("num", "avg")] = sum(res[key]) / float(num_files)

    res["num_files"] = num_files
    json.dump(res, file(os.path.join(cp.Corpus.paths.output, "stats.json"), "w"))
예제 #2
0
def add_statistics_to_all_files(use_celery=False, conditions=None, max_files=sys.maxint):
    """
        For each paper in the corpus, it computes and stores its statistics
    """
    print("Listing files...")
    papers = cp.Corpus.listPapers(conditions, max_results=max_files)
    ##    papers=cp.Corpus.listRecords(conditions, max_results=max_files, field="_id", table="papers")
    print("Computing statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)
    for guid in papers:
        if use_celery:
            computeAnnotationStatisticsTask.apply_async(args=[guid], kwargs={}, queue="compute_statistics")
        else:
            computeAnnotationStatistics(guid)
            progress.showProgressReport("Computing statistics -- latest paper " + guid)