Пример #1
0
def ingest_single_parse(docket, deletions, insertions, parser):
    """Ingest one parse of a docket's documents into its corpus.

    Finds the corpus whose metadata matches this docket's id and the given
    parser name. If there is none, a new ``Corpus`` is created. If there is
    exactly one, the documents named in ``deletions`` plus those about to be
    (re-)inserted are deleted from it first. ``insertions`` are then fed to
    a ``DocumentIngester`` and ``delete_hierarchy_cache()`` is called on the
    corpus.

    Args:
        docket: object exposing ``id`` and ``agency`` attributes.
        deletions: list of document ids to delete from an existing corpus.
        insertions: documents to ingest; each must support
            ``d['metadata']['document_id']``.
        parser: ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not a supported parser name.
        RuntimeError: if more than one corpus matches the docket and parser;
            the extra corpora have to be removed by hand.
    """
    # String exceptions are not valid on Python 2.6+ (they surface as an
    # unrelated TypeError), so real exception classes are raised instead.
    if parser not in ('sentence', '4-gram'):
        raise ValueError("Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket.id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket.id))

    if not parsed_corpora:
        corpus = Corpus(metadata=dict(docket_id=docket.id, agency_id=docket.agency, parser=parser))
        print("Created new corpus #%s for %s parse." % (corpus.id, parser))
    else:
        corpus = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (corpus.id, parser))

        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so they are not duplicated.
        reinserted_ids = [d['metadata']['document_id'] for d in insertions]
        corpus.delete_by_metadata('document_id', deletions + reinserted_ids)

    print("Inserting documents at %s..." % datetime.now())
    if parser == 'sentence':
        ingester = DocumentIngester(corpus, parser=sentence_parse, compute_similarities=False)
    else:  # '4-gram' -- the only other value that passes validation above
        ingester = DocumentIngester(corpus, parser=ngram_parser(4), compute_similarities=True)
    ingester.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    corpus.delete_hierarchy_cache()
Пример #2
0
def repair_missing_docket(docket_id):
    """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres.

    Note that this is a very limited form of repair, corresponding to the particular
    situation in which some malformed dockets have been deleted from Postgres by
    hand, but not marked as such on the Mongo side. As other particular problems
    arise we may add different repair methods.

    Args:
        docket_id: id of the docket to check and, if needed, re-ingest.

    Raises:
        RuntimeError: if exactly one or more than two corpora exist for the
            docket; that state is unexpected and must be fixed by hand.
    """

    # only repair if MongoDB thinks that something should be in Postgres already
    if Doc.objects(docket_id=docket_id, in_cluster_db=True).count() == 0:
        return

    # does docket exist at all?
    corpora = get_corpora_by_metadata('docket_id', docket_id)
    corpus_count = len(corpora)

    if corpus_count == 0:
        # neither parse exists, mark as unclustered in Mongo
        update_count = Doc.objects(
            docket_id=docket_id,
            in_cluster_db=True).update(set__in_cluster_db=False)
        print("Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % (
            docket_id, update_count))
        ingest_docket(docket_id)
    elif corpus_count != 2:
        # we have a single or multiple parses...that's something unexpected that we can't fix automatically
        # A real exception class is raised: string exceptions are invalid on
        # Python 2.6+ and would surface as an unrelated TypeError.
        raise RuntimeError(
            "Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % (
                corpus_count, docket_id))
    # exactly two corpora: both parses are present, nothing to repair
Пример #3
0
def print_stats(docket_id):
    """Print public-submission counts for a docket, then per-corpus sizes.

    Two lines are printed from the ``Doc`` collection: counts of
    non-deleted documents with ``in_cluster_db`` True/False, then the same
    for deleted documents. After that, one line is printed for every corpus
    returned by ``get_corpora_by_metadata`` for this docket.
    """
    def count(in_cluster_db, deleted):
        # Number of public submissions in this docket with the given flags.
        return Doc.objects(docket_id=docket_id, in_cluster_db=in_cluster_db,
                           deleted=deleted, type='public_submission').count()

    live_template = "MongoDB has\t%s in_cluster_db=True, deleted=False;\t%s in_cluster_db=False,deleted=False"
    print(live_template % (count(True, False), count(False, False)))

    deleted_template = "\t\t%s in_cluster_db=True, deleted=True;\t%s in_cluster_db=False,deleted=True"
    print(deleted_template % (count(True, True), count(False, True)))

    for corpus in get_corpora_by_metadata('docket_id', docket_id):
        summary = (corpus.id, corpus.metadata, corpus.num_docs())
        print("Corpus %s (%s) has %s documents." % summary)
Пример #4
0
def print_stats(docket_id):
    """Report how many public submissions a docket has in each state.

    Prints one line for non-deleted and one for deleted documents, each
    giving the count with ``in_cluster_db=True`` followed by the count with
    ``in_cluster_db=False``. Then prints id, metadata and document count
    for every corpus associated with the docket.
    """
    # (deleted flag, output template) -- handled in this order.
    report_lines = (
        (False, "MongoDB has\t%s in_cluster_db=True, deleted=False;\t%s in_cluster_db=False,deleted=False"),
        (True, "\t\t%s in_cluster_db=True, deleted=True;\t%s in_cluster_db=False,deleted=True"),
    )
    for is_deleted, template in report_lines:
        clustered = Doc.objects(docket_id=docket_id, in_cluster_db=True,
                                deleted=is_deleted, type='public_submission').count()
        unclustered = Doc.objects(docket_id=docket_id, in_cluster_db=False,
                                  deleted=is_deleted, type='public_submission').count()
        print(template % (clustered, unclustered))

    for corpus in get_corpora_by_metadata('docket_id', docket_id):
        corpus_line = "Corpus %s (%s) has %s documents." % (
            corpus.id, corpus.metadata, corpus.num_docs())
        print(corpus_line)
Пример #5
0
def ingest_single_parse(docket_id, deletions, insertions, parser):
    """Ingest one parse of a docket's documents into its corpus.

    Finds the corpus whose metadata matches ``docket_id`` and the given
    parser name. If there is none, a new ``Corpus`` is created, taking the
    agency from the stored ``Docket`` when it has one and otherwise from
    the part of ``docket_id`` before the first ``"-"``. If there is exactly
    one, the documents named in ``deletions`` plus those about to be
    (re-)inserted are deleted from it first. ``insertions`` are then fed to
    a ``DocumentIngester`` and ``delete_hierarchy_cache()`` is called on the
    corpus.

    Args:
        docket_id: string id of the docket.
        deletions: list of document ids to delete from an existing corpus.
        insertions: documents to ingest; each must support
            ``d['metadata']['document_id']``.
        parser: ``'sentence'`` or ``'4-gram'``.

    Raises:
        ValueError: if ``parser`` is not a supported parser name.
        RuntimeError: if more than one corpus matches the docket and parser;
            the extra corpora have to be removed by hand.
    """
    # String exceptions are not valid on Python 2.6+ (they surface as an
    # unrelated TypeError), so real exception classes are raised instead.
    if parser not in ('sentence', '4-gram'):
        raise ValueError("Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser)

    corpora = get_corpora_by_metadata('docket_id', docket_id)
    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) > 1:
        raise RuntimeError(
            "More than one %s parse for docket %s found. Shouldn't happen--"
            "will need to manually remove extra corpora." % (parser, docket_id))

    if not parsed_corpora:
        dockets = list(Docket.objects(id=docket_id).only('agency'))
        docket = dockets[0] if dockets else Docket()
        # Fall back to the docket id's prefix when no agency is recorded.
        agency_id = docket.agency or docket_id.split("-")[0]
        corpus = Corpus(metadata=dict(docket_id=docket_id,
                                      agency_id=agency_id,
                                      parser=parser))
        print("Created new corpus #%s for %s parse." % (corpus.id, parser))
    else:
        corpus = parsed_corpora[0]
        print("Updating existing corpus #%s for %s parse." % (corpus.id, parser))

        print("Deleting documents at %s..." % datetime.now())
        # Documents being re-inserted are deleted too, so they are not duplicated.
        reinserted_ids = [d['metadata']['document_id'] for d in insertions]
        corpus.delete_by_metadata('document_id', deletions + reinserted_ids)

    print("Inserting documents at %s..." % datetime.now())
    if parser == 'sentence':
        ingester = DocumentIngester(corpus,
                                    parser=sentence_parse,
                                    compute_similarities=False)
    else:  # '4-gram' -- the only other value that passes validation above
        ingester = DocumentIngester(corpus,
                                    parser=ngram_parser(4),
                                    compute_similarities=True)
    ingester.ingest(insertions)

    print("Removing hierarchy, if cached, at %s..." % datetime.now())
    corpus.delete_hierarchy_cache()
Пример #6
0
def repair_missing_docket(docket):
    """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres.

    Note that this is a very limited form of repair, corresponding to the particular
    situation in which some malformed dockets have been deleted from Postgres by
    hand, but not marked as such on the Mongo side. As other particular problems
    arise we may add different repair methods.

    Args:
        docket: object exposing an ``id`` attribute; it is passed on to
            ``ingest_docket`` when a re-ingest is needed.

    Raises:
        RuntimeError: if exactly one or more than two corpora exist for the
            docket; that state is unexpected and must be fixed by hand.
    """

    # only repair if MongoDB thinks that something should be in Postgres already
    if Doc.objects(docket_id=docket.id, in_cluster_db=True).count() == 0:
        return

    # does docket exist at all?
    corpora = get_corpora_by_metadata('docket_id', docket.id)
    corpus_count = len(corpora)

    if corpus_count == 0:
        # neither parse exists, mark as unclustered in Mongo
        update_count = Doc.objects(docket_id=docket.id, in_cluster_db=True).update(
            safe_update=True, set__in_cluster_db=False)
        print("Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % (
            docket.id, update_count))
        ingest_docket(docket)
    elif corpus_count != 2:
        # we have a single or multiple parses...that's something unexpected that we can't fix automatically
        # A real exception class is raised: string exceptions are invalid on
        # Python 2.6+ and would surface as an unrelated TypeError.
        raise RuntimeError(
            "Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % (
                corpus_count, docket.id))
    # exactly two corpora: both parses are present, nothing to repair