Example #1
def repair_missing_docket(docket_id):
    """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres.

    Note that this is a very limited form of repair, corresponding to the particular
    situation in which some malformed dockets have been deleted from Postgres by
    hand, but not marked as such on the Mongo side. As other particular problems
    arise we may add different repair methods.
    """

    # only repair if MongoDB thinks that something should be in Postgres already
    if Doc.objects(docket_id=docket_id, in_cluster_db=True).count() == 0:
        return

    # does docket exist at all?
    corpora = get_corpora_by_metadata('docket_id', docket_id)

    if len(corpora) == 0:
        # neither parse exists, mark as unclustered in Mongo
        update_count = Doc.objects(
            docket_id=docket_id,
            in_cluster_db=True).update(set__in_cluster_db=False)
        print "Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % (
            docket_id, update_count)
        ingest_docket(docket_id)
    elif len(corpora) == 1 or len(corpora) > 2:
        # we have one parse or more than two: something unexpected that we can't fix automatically
        raise Exception("Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % (
            len(corpora), docket_id))
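A minimal usage sketch (an assumption, not part of the original example): walk every docket that MongoDB believes is already clustered and run the repair on each. Doc is the same MongoEngine model used above.

# Hypothetical batch repair of every docket Mongo already considers analyzed.
for bad_docket_id in Doc.objects(in_cluster_db=True, type='public_submission').distinct('docket_id'):
    repair_missing_docket(bad_docket_id)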
Example #2
def ingest_docket(docket):
    print "Loading docket %s at %s..." % (docket.id, datetime.now())

    deletions = list(Doc.objects(docket_id=docket.id, deleted=True, in_cluster_db=True, type='public_submission').scalar('id'))

    insertions = [
        dict(text=doc_text(d), metadata=doc_metadata(d))
        for d in Doc.objects(docket_id=docket.id, deleted=False, in_cluster_db=False, type='public_submission')]

    
    print "Found %s documents for deletion, %s documents for insertion or update." % (len(deletions), len(insertions))

    if not insertions and not deletions:
        return

    with transaction.commit_on_success():
        ingest_single_parse(docket, deletions, insertions, 'sentence')
        ingest_single_parse(docket, deletions, insertions, '4-gram')

    print "Marking MongoDB documents as analyzed at %s..." % datetime.now()
    update_count = Doc.objects(id__in=[d['metadata']['document_id'] for d in insertions]) \
                      .update(safe_update=True, set__in_cluster_db=True)
    if update_count != len(insertions):
        print "ERROR: %s documents inserted into Postgres, but only %s documents marked as analyzed in MongoDB." % (len(insertions), update_count)
    update_count = Doc.objects(id__in=deletions) \
                      .update(safe_update=True, set__in_cluster_db=False)
    if update_count != len(deletions):
        print "ERROR: %s documents deleted in Postgres, but only %s documents marked as deleted in MongoDB." % (len(deletions), update_count)
Example #3
def delete_analysis(docket):
    with transaction.commit_on_success():
        c = get_dual_corpora_by_metadata('docket_id', docket.id)
        if c:
            c.delete_corpus()
            print "Deleted docket %s (id=%s)." % (docket.id, c.id)
        else:
            print "Attempted deletion of %s. Docket not found." % docket.id
        Doc.objects(docket_id=docket.id).update(set__in_cluster_db=False)
Example #4
def delete_analysis(docket_id):
    with transaction.commit_on_success():
        c = get_dual_corpora_by_metadata('docket_id', docket_id)
        if c:
            c.delete_corpus()
            print "Deleted docket %s (id=%s)." % (docket_id, c.id)
        else:
            print "Attempted deletion of %s. Docket not found." % docket_id
        Doc.objects(docket_id=docket_id).update(set__in_cluster_db=False)
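Combined with ingest_docket (the docket_id-based variant shown later), this enables a full per-docket rebuild: drop both parses from Postgres, which also flips in_cluster_db back to False, then re-ingest everything from MongoDB. A sketch of that flow, not code taken from the project:

def rebuild_analysis(docket_id):
    # Hypothetical helper, not from the project: full rebuild of one docket.
    delete_analysis(docket_id)   # drops the Postgres corpora, clears in_cluster_db
    ingest_docket(docket_id)     # re-inserts all non-deleted public submissions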
Example #5
def print_stats(docket_id):
    print "MongoDB has\t%s in_cluster_db=True, deleted=False;\t%s in_cluster_db=False,deleted=False" % \
        (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=False, type='public_submission').count(),
        Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=False, type='public_submission').count())
    print "\t\t%s in_cluster_db=True, deleted=True;\t%s in_cluster_db=False,deleted=True" % \
        (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=True, type='public_submission').count(),
        Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=True, type='public_submission').count())

    for corpus in get_corpora_by_metadata('docket_id', docket_id):
        print "Corpus %s (%s) has %s documents." % (corpus.id, corpus.metadata,
                                                    corpus.num_docs())
Example #6
    def get(self, request, entity_id, docket_id, document_type, entity_type):
        dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
        ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))
        if not dkt_results or not ent_results:
            raise Http404('Not found.')

        docket = dkt_results[0]
        entity = ent_results[0]

        if document_type == 'mentions':
            docs_q = Doc.objects(Q(attachments__views__entities=entity_id) | Q(views__entities=entity_id), docket_id=docket_id)
        else:
            docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

        docs_q = docs_q.only('type', 'title', 'id', 'views', 'attachments.views', 'details.Date_Posted', 'deleted').hint([("docket_id", 1)])
        docs = filter(lambda d: not d.deleted, sorted(list(docs_q), key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900,1,1)), reverse=True))

        get_views = lambda doc: [{
            'object_id': view.object_id,
            'file_type': view.type,
            'url': view.url.replace('inline', 'attachment')
        } for view in doc.views if entity_id in view.entities]

        out_docs = []
        for doc in docs[:10]:
            out_doc = {
                'title': doc.title,
                'id': doc.id,
                'date_posted': doc.details['Date_Posted'],
                'type': doc.type,
                'url': '/document/' + doc.id
            }
            if document_type == 'mentions':
                out_doc['files'] = get_views(doc) + list(itertools.chain.from_iterable([get_views(attachment) for attachment in doc.attachments]))

            out_docs.append(out_doc)

        return Response({
            'documents': out_docs,
            'has_more': len(docs) > 10,
            'count': len(docs),
            'document_search_url': "/search-document/" + \
                url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
                url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
            'docket': {
                'id': docket.id,
                'title': docket.title,
            },
            'entity': {
                'id': entity.id,
                'name': entity.aliases[0]
            },
            'filter_type': document_type
        })
Example #7
def print_stats(docket_id):
    print "MongoDB has\t%s in_cluster_db=True, deleted=False;\t%s in_cluster_db=False,deleted=False" % \
        (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=False, type='public_submission').count(),
        Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=False, type='public_submission').count())
    print "\t\t%s in_cluster_db=True, deleted=True;\t%s in_cluster_db=False,deleted=True" % \
        (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=True, type='public_submission').count(),
        Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=True, type='public_submission').count())


    for corpus in get_corpora_by_metadata('docket_id', docket_id):
            print "Corpus %s (%s) has %s documents." % (corpus.id, corpus.metadata, corpus.num_docs())
Example #8
    def handle(self, **options):
        if options['parsable']:
            # disable standard output by monkey-patching sys.stdout
            dev_null = open('/dev/null', 'w')
            real_stdout = sys.stdout
            sys.stdout = dev_null

        doc_kwargs = {'type': 'public_submission'}
        if options.get('docket'):
            doc_kwargs['docket_id'] = options['docket']
        elif options.get('agency'):
            doc_kwargs['agency'] = options['agency']

        print "Enumerating dockets..."
        docket_list = list(
            Doc.objects(
                Q(deleted=True, in_cluster_db=True)
                | Q(deleted=False, in_cluster_db=False),
                **doc_kwargs).distinct('docket_id'))
        docket_count = len(docket_list)
        counter = 0

        print "Beginning loading %s dockets at %s..." % (docket_count,
                                                         datetime.now())

        if options['fork']:
            print "Using forking strategy..."
            import multiprocessing
            for docket_id in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                p = multiprocessing.Process(target=process_docket,
                                            args=[docket_id, options])
                p.start()
                p.join()
        else:
            print "Using single-process strategy..."
            for docket_id in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                process_docket(docket_id, options)

        print "Done."

        if options['parsable']:
            # turn stdout back on so we can print output
            sys.stdout = real_stdout

            print json.dumps({'dockets': docket_count})
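The command delegates the per-docket work to process_docket, which is not included among these examples. A minimal sketch of what such a function might look like, assuming it simply chains the repair and ingest helpers shown on this page:

def process_docket(docket_id, options):
    # Hypothetical glue function; the project's real implementation is not shown here.
    # (options is accepted for signature compatibility but unused in this sketch.)
    repair_missing_docket(docket_id)   # reconcile any Mongo/Postgres disagreement first
    ingest_docket(docket_id)           # then load pending insertions and deletions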
Example #9
def ingest_docket(docket_id):
    print "Loading docket %s at %s..." % (docket_id, datetime.now())

    deletions = list(
        Doc.objects(docket_id=docket_id,
                    deleted=True,
                    in_cluster_db=True,
                    type='public_submission').scalar('id'))

    insertions = [
        dict(text=doc_text(d), metadata=doc_metadata(d))
        for d in Doc.objects(docket_id=docket_id,
                             deleted=False,
                             in_cluster_db=False,
                             type='public_submission')
    ]

    print "Found %s documents for deletion, %s documents for insertion or update." % (
        len(deletions), len(insertions))

    if not insertions and not deletions:
        return

    with transaction.commit_on_success():
        ingest_single_parse(docket_id, deletions, insertions, 'sentence')
        ingest_single_parse(docket_id, deletions, insertions, '4-gram')

    print "Marking MongoDB documents as analyzed at %s..." % datetime.now()
    update_count = Doc.objects(id__in=[d['metadata']['document_id'] for d in insertions]) \
                      .update(set__in_cluster_db=True)
    if update_count != len(insertions):
        print "ERROR: %s documents inserted into Postgres, but only %s documents marked as analyzed in MongoDB." % (
            len(insertions), update_count)
    update_count = Doc.objects(id__in=deletions) \
                      .update(set__in_cluster_db=False)
    if update_count != len(deletions):
        print "ERROR: %s documents deleted in Postgres, but only %s documents marked as deleted in MongoDB." % (
            len(deletions), update_count)
Example #10
def repair_missing_docket(docket):
    """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres.

    Note that this is a very limited form of repair, corresponding to the particular
    situation in which some malformed dockets have been deleted from Postgres by
    hand, but not marked as such on the Mongo side. As other particular problems
    arise we may add different repair methods.
    """

    # only repair if MongoDB thinks that something should be in Postgres already
    if Doc.objects(docket_id=docket.id, in_cluster_db=True).count() == 0:
        return

    # does docket exist at all?
    corpora = get_corpora_by_metadata('docket_id', docket.id)
    
    if len(corpora) == 0:
        # neither parse exists, mark as unclustered in Mongo
        update_count = Doc.objects(docket_id=docket.id, in_cluster_db=True).update(safe_update=True, set__in_cluster_db=False)
        print "Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % (docket.id, update_count)
        ingest_docket(docket)
    elif len(corpora) == 1 or len(corpora) > 2:
        # we have one parse or more than two: something unexpected that we can't fix automatically
        raise Exception("Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % (len(corpora), docket.id))
Example #11
    def handle(self, **options):
        if options['parsable']:
            # disable standard output by monkey-patching sys.stdout
            dev_null = open('/dev/null', 'w')
            real_stdout = sys.stdout
            sys.stdout = dev_null

        doc_kwargs = {'type': 'public_submission'}
        if options.get('docket'):
            doc_kwargs['docket_id'] = options['docket']
        elif options.get('agency'):
            doc_kwargs['agency'] = options['agency']

        print "Enumerating dockets..."
        docket_list = list(Doc.objects(Q(deleted=True, in_cluster_db=True) | Q(deleted=False, in_cluster_db=False), **doc_kwargs).distinct('docket_id'))
        docket_count = len(docket_list)
        counter = 0

        print "Beginning loading %s dockets at %s..." % (docket_count, datetime.now())

        if options['fork']:
            print "Using forking strategy..."
            import multiprocessing
            for docket_id in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                p = multiprocessing.Process(target=process_docket, args=[docket_id, options])
                p.start()
                p.join()
        else:
            print "Using single-process strategy..."
            for docket_id in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                process_docket(docket_id, options)

        print "Done."

        if options['parsable']:
            # turn stdout back on so we can print output
            sys.stdout = real_stdout

            print json.dumps({'dockets': docket_count})
Example #12
    def get(self, request, document_id, file_type, object_id):
        docs = list(Doc.objects(id=document_id))
        if not docs:
            raise Http404("Document not found")
        doc = docs[0]

        # figure out which view it is
        all_views = itertools.chain.from_iterable([doc.views, itertools.chain.from_iterable([attachment.views for attachment in doc.attachments])])
        matches = [view for view in all_views if view.type == file_type and view.object_id == object_id]
        if not matches:
            raise Http404("File record not found")
        match = matches[0]

        if not match.downloaded or not match.file_path or not os.path.exists(match.file_path):
            raise Http404("File not found")

        # we're good to go; gather some info about the file
        mimetype = magic.from_file(match.file_path, mime=True)
        extension = mimetypes.guess_extension(mimetype)
        if not extension:
            extension = ".%s" % match.type

        return sendfile(request, match.file_path, attachment=True, attachment_filename="%s%s" % (match.object_id, extension), mimetype=mimetype)
Example #13
    def get(self, request, document_id, file_type, object_id):
        docs = list(Doc.objects(id=document_id))
        if not docs:
            raise Http404("Document not found")
        doc = docs[0]

        # figure out which view it is
        all_views = itertools.chain.from_iterable([
            doc.views,
            itertools.chain.from_iterable(
                [attachment.views for attachment in doc.attachments])
        ])
        matches = [
            view for view in all_views
            if view.type == file_type and view.object_id == object_id
        ]
        if not matches:
            raise Http404("File record not found")
        match = matches[0]

        if not match.downloaded or not match.file_path or not os.path.exists(
                match.file_path):
            raise Http404("File not found")

        # we're good to go; gather some info about the file
        mimetype = magic.from_file(match.file_path, mime=True)
        extension = mimetypes.guess_extension(mimetype)
        if not extension:
            extension = ".%s" % match.type

        return sendfile(request,
                        match.file_path,
                        attachment=True,
                        attachment_filename="%s%s" %
                        (match.object_id, extension),
                        mimetype=mimetype)
Example #14
GEVENT = False

from regs_models import Doc
import json
import itertools

def split_seq(iterable, size):
    it = iter(iterable)
    item = list(itertools.islice(it, size))
    while item:
        yield item
        item = list(itertools.islice(it, size))

all_ids = json.load(open("/tmp/problems.json"))
for ids in split_seq(all_ids, 1000):
    for doc in Doc.objects(id__in=ids):
        for view in doc.views:
            if view.type == "pdf" and view.mode == "html" and view.extracted == "yes":
                view.extracted = "no"
                view.content.delete()
        for attachment in doc.attachments:
            for view in attachment.views:
                if view.type == "pdf" and view.mode == "html" and view.extracted == "yes":
                    view.extracted = "no"
                    view.content.delete()
        doc.in_search_index = False
        doc.in_cluster_db = False
        doc.entities_last_extracted = None
        
        print "Repaired %s" % doc.id
        doc.save()
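split_seq batches an arbitrary iterable into fixed-size chunks so the id__in queries above stay bounded. A quick demonstration of its behavior (not part of the original script):

# Chunks of 3 from a 7-element range.
print list(split_seq(xrange(7), 3))   # [[0, 1, 2], [3, 4, 5], [6]]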
Example #15
File: views.py Project: imclab/sparerib
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}
                        doc['summary'] = fr_doc.get_summary()
                        doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and fr_doc.details['Comment_Due_Date'] > datetime.datetime.now()

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        doc['stats'] = {'count': 0, 'comments_open': False}
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                    stats['similar_dockets'] = [{
                        'id': docket,
                        'title': sd[docket]
                    } for docket in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]
        
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None
        
        if not agency:
            out['agency'] = None

        return Response(out)
Example #16
    def get(self, request, *args, **kwargs):
        "Access aggregate information about entities as they occur in regulations.gov data."
        results = Entity.objects(id=kwargs['entity_id'])
        if not results:
            raise Http404('Entity not found.')

        entity = results[0]

        # basic docket metadata
        out = {
            'name': entity.aliases[0],
            'url': reverse('entity-view', args=args, kwargs=kwargs),
            'id': entity.id,
            'type': entity.td_type,
            'stats': entity.stats
        }

        stats = entity.stats
        if stats:
            # cleanup, plus stitch on some additional data
            now = datetime.datetime.now().date()
            for mention_type in ["text_mentions", "submitter_mentions"]:
                stats[mention_type].update({
                    'months': [month for month in prettify_months(stats[mention_type]['months']) if month['date_range'][0] <= now] if stats[mention_type]['months'] else [],
                })

                # limit ourselves to the top ten of each match type, and grab their extra metadata
                agencies = sorted(stats[mention_type]['agencies'].items(), key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_agencies'] = [{
                    'id': item[0],
                    'count': item[1],
                    'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
                } for item in agencies]
                del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

                docket_list = stats[mention_type]['dockets'].items()
                years = request.GET.get('years', None)
                if years:
                    year_set = set(years.split(","))
                    docket_list = [item for item in docket_list if get_docket_year(item[0]) in year_set]
                dockets = sorted(docket_list, key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_dockets'] = [{
                    'id': item[0],
                    'count': item[1]
                } for item in dockets]

                stats[mention_type]['docket_count'] = len(docket_list)
                del stats[mention_type]['dockets']

                stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(":".join(["mentioned" if mention_type == "text_mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]]))

            # grab additional docket metadata
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_dockets']] + [record['id'] for record in stats['text_mentions']['top_dockets']]))
            dockets_search = Docket.objects(id__in=ids).only('id', 'title', 'year', 'details.dk_type', 'agency', 'stats.date_range')
            dockets = dict([(docket.id, docket) for docket in dockets_search])

            # stitch this back onto the main records
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for docket in stats[mention_type]['top_dockets']:
                    rdocket = dockets[docket['id']]
                    docket.update({
                        'title': rdocket.title,
                        'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                        'year': rdocket.year if rdocket.year else (getattr(rdocket.stats['date_range'][0], 'year', None) if 'date_range' in rdocket.stats else None),
                        'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                        'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                    })

            # repeat for agencies
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_agencies']] + [record['id'] for record in stats['text_mentions']['top_agencies']]))
            agencies_search = Agency.objects(id__in=ids).only('id', 'name')
            agencies = dict([(agency.id, agency) for agency in agencies_search])

            # ...and stitch
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for agency in stats[mention_type]['top_agencies']:
                    ragency = agencies.get(agency['id'], None)
                    agency.update({
                        'name': ragency.name if ragency else agency['id'],
                        'url': '/agency/%s' % agency['id']
                    })

            # and for comments
            recent_comments = []
            if 'recent_comments' in stats['submitter_mentions']:
                recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['submitter_mentions']['recent_comments']]).only('id', 'title', 'details')
                for comment in recent_comments_search:
                    comment_item = {
                        'title': comment.title,
                        'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                        'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                        'organization': comment.details.get('Organization_Name', ''),
                        'url': '/document/' + comment.id
                    }
                    comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                    recent_comments.append(comment_item)

            stats['submitter_mentions']['recent_comments'] = recent_comments

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0}

        return Response(out)
Example #17
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov documents."
        results = list(Doc.objects(id=kwargs['document_id']))
        if not results or results[0].deleted:
            raise Http404('Document not found.')

        document = results[0]

        # basic document metadata
        out = {
            'title': document.title,
            'url': reverse('document-view', kwargs=kwargs),
            'id': document.id,

            'agency': {
                'id': document.agency,
                'url': reverse('agency-view', kwargs={'agency': document.agency}),
                'name': Agency.objects(id=document.agency).only("name")[0].name
            },
            'date': document.details.get('Date_Posted', None),
            'type': document.type,
            'views': [],
            'attachments': [],
            'details': document.details if document.details else {}
        }

        # inter-dataset suppression
        if 'replaced_by' in document.suppression:
            new_kwargs = dict(kwargs)
            new_kwargs['document_id'] = document.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

        # comment-on metadata
        if document.comment_on:
            # if we don't have all the data built in, grab it from its original record
            comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(id=document.comment_on['document_id']).to_mongo()
            out['comment_on'] = {
                "fr_doc": comment_on_doc.get('fr_doc', False),  
                "type": comment_on_doc.get('type', None), 
                "id": document.comment_on['document_id'],
                'url': reverse('document-view', kwargs={'document_id': document.comment_on['document_id']}),
                "title": comment_on_doc['title']
            }
            if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
                out['comment_on']['agency'] = out['agency']
            else:
                out['comment_on']['agency'] = {
                    'id': comment_on_doc['agency'],
                    'url': reverse('agency-view', kwargs={'agency': comment_on_doc['agency']}),
                    'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
                }
        else:
            out['comment_on'] = {}

        # docket metadata
        docket = Docket.objects(id=document.docket_id)[0]
        out['docket'] = {
            'id': document.docket_id,
            'url': reverse('docket-view', kwargs={'docket_id': document.docket_id}),
            'title': docket.title,
            'weeks': [],
            'fr_docs': []
        }
        if docket.stats:
            out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
            out['docket']['fr_docs'] = docket.stats['doc_info'].get('fr_docs', [])

        if out['date']:
            out['date'] = out['date'].isoformat()

        text_entities = set()
        submitter_entities = set(document.submitter_entities if document.submitter_entities else [])
        
        # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
        for view in (document.views[i] for i in xrange(len(document.views))):
            # hack to deal with documents whose scrapes failed but still got extracted
            object_id = document.object_id if document.object_id else view.file_path.split('/')[-1].split('.')[0]
            out['views'].append({
                'object_id': object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'view'}) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)

        for attachment in (document.attachments[i] for i in xrange(len(document.attachments))):
            a = {
                'title': attachment.title,
                'views': []
            }
            for view in (attachment.views[i] for i in xrange(len(attachment.views))):
                a['views'].append({
                    'object_id': attachment.object_id,
                    'file_type': view.type,
                    'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                    'extracted': view.extracted == 'yes',
                    'url': view.download_url,
                    'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'object_id': attachment.object_id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'attachment'}) if view.extracted == 'yes' else None
                })

                for entity in view.entities:
                    text_entities.add(entity)
            out['attachments'].append(a)

        # stats for FR docs
        stats = document.stats if document.stats else {'count': 0}
        # limit ourselves to the top five of each match type, and grab their extra metadata
        for label in ['text_entities', 'submitter_entities']:
            stats['top_' + label] = [{
                'id': i[0],
                'count': i[1]
            } for i in sorted(stats.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:5]]
            if label in stats:
                del stats[label]
        top_entities = set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']])

        entities_search = Entity.objects(id__in=list(submitter_entities.union(text_entities, top_entities))).only('id', 'td_type', 'aliases')
        entities = dict([(entity.id, entity) for entity in entities_search])

        for label, items in [('submitter_entities', sorted(list(submitter_entities))), ('text_entities', sorted(list(text_entities)))]:
            out[label] = [{
                'id': item,
                'type': entities[item].td_type,
                'name': entities[item].aliases[0],
                'url': '/%s/%s/%s' % (entities[item].td_type, slugify(entities[item].aliases[0]), item)
            } for item in items]

        for label in ['top_text_entities', 'top_submitter_entities']:
            for entity in stats[label]:
                if not entities[entity['id']].td_type:
                    continue
                
                entity['type'] = entities[entity['id']].td_type
                entity['name'] = entities[entity['id']].aliases[0]
                entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])

        if 'weeks' in stats:
            stats['weeks'] = prettify_weeks(stats['weeks'])

        recent_comments = []
        if 'recent_comments' in stats:
            recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['recent_comments']]).only('id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)

        stats['recent_comments'] = recent_comments

        out['comment_stats'] = stats

        # links upstream
        out['source'] = document.source
        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            for replaced in document.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        # cleaned-up details
        details = out['details'].copy()
        dp = lambda key, default=None: details.pop(key, default)
        out['clean_details'] = dtls(
            ('Submitter Information', dtls(
                ('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
                ('Organization', dp('Organization_Name')),
                ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
                ('Email Address', dp('Email_Address')),
                ('Phone Number', dp('Phone_Number')),
                ('Fax Number', dp('Fax_Number')),
                ("Submitter's Representative", dp('Submitter_s_Representative'))
            )),

            ('Dates and Times', dtls(
                ('Document Date', dp('Document_Date')), # rarely-used
                ('Date Received', dp('Received_Date')),
                ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
                ('Date Posted', dp('Date_Posted')),
                (None, dp('Date')), # Swallow this one, since it's always the same as Date_Posted,
                ('Comment Period', combine(
                    short_date(force_date(dp('Comment_Start_Date'))),
                    short_date(force_date(dp('Comment_Due_Date'))),
                    sep="&ndash;"
                )),

                # all the other dates -- don't even know what most of these are
                ("File Date", dp("File_Date")),
                ("Answer Date", dp("Answer_Date")),
                ("Author Date", dp("Author_Date")),
                ("Author Document Date", dp("Author_Document_Date")),
                ("Effective Date", dp("Effective_Date")),
                ("Implementation Date", dp("Implementation_Date")),
                ("Implementation Service Date", dp("Implementation_Service_Date"))
            )),
            
            ('Citations and References', dtls(
                ("RIN", document.rin if document.rin else None),
                ("Federal Register No.", dp("Federal_Register_Number")),
                ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "&ndash;")),
                (None, dp("Page_Count")), # who cares?
                (None, dp("Page_Start")), # who cares?
                ("Federal Register Citation", dp("Federal_Register_Citation")),
                ("CFR Section(s)", dp("CFR")),
                ("Related RINs", dp("Related_RIN_s_")),
            )),
            
            ('Additional Details', dtls(*details.items()))
        )

        return Response(out)
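The clean_details block relies on helpers that are not shown, notably dtls, which assembles labeled sections while discarding swallowed (None) labels and empty values, and combine, which joins its non-empty arguments with a separator. A rough sketch consistent with how they are called above, offered as an assumption rather than the project's actual code:

from collections import OrderedDict

def combine(*parts, **kwargs):
    # Join the non-empty parts with a separator (default: a single space).
    sep = kwargs.get('sep', ' ')
    joined = sep.join([p for p in parts if p])
    return joined if joined else None

def dtls(*pairs):
    # Keep only pairs with a real label and a non-empty value, preserving order.
    return OrderedDict([(label, value) for label, value in pairs if label is not None and value])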
Example #18
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        out['source'] = self.item.source
        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            if 'Source_URL' in self.item.details:
                out['upstream_urls'].append({
                    'url': self.item.details['Source_URL'],
                    'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
                })
            for replaced in self.item.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })


        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}
                        
                        if fr_doc.annotations.get('fr_data', None):
                            doc['summary'] = fr_doc.annotations['fr_data'].get('abstract', None)

                        if not doc.get('summary', None):
                            doc['summary'] = fr_doc.get_summary()

                        doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now()

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        doc['stats'] = {'count': 0, 'comments_open': False}
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                    stats['similar_dockets'] = [{
                        'id': docket,
                        'title': sd[docket]
                    } for docket in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]
        
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None
        
        if not agency:
            out['agency'] = None

        return Response(out)
Example #19
    def get(self, request, entity_id, docket_id, document_type, entity_type):
        dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
        ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))
        if not dkt_results or not ent_results:
            raise Http404('Not found.')

        docket = dkt_results[0]
        entity = ent_results[0]

        if document_type == 'mentions':
            docs_q = Doc.objects(Q(attachments__views__entities=entity_id)
                                 | Q(views__entities=entity_id),
                                 docket_id=docket_id)
        else:
            docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

        docs_q = docs_q.only('type', 'title', 'id', 'views',
                             'attachments.views', 'details.Date_Posted',
                             'deleted').hint([("docket_id", 1)])
        docs = filter(
            lambda d: not d.deleted,
            sorted(list(docs_q),
                   key=lambda doc: doc.details.get(
                       'Date_Posted', datetime.datetime(1900, 1, 1)),
                   reverse=True))

        get_views = lambda doc: [
            {
                'object_id': view.object_id,
                'file_type': view.type,
                'url': view.url.replace('inline', 'attachment')
            } for view in doc.views if entity_id in view.entities
        ]

        out_docs = []
        for doc in docs[:10]:
            out_doc = {
                'title': doc.title,
                'id': doc.id,
                'date_posted': doc.details['Date_Posted'],
                'type': doc.type,
                'url': '/document/' + doc.id
            }
            if document_type == 'mentions':
                out_doc['files'] = get_views(doc) + list(
                    itertools.chain.from_iterable([
                        get_views(attachment) for attachment in doc.attachments
                    ]))

            out_docs.append(out_doc)

        return Response({
            'documents': out_docs,
            'has_more': len(docs) > 10,
            'count': len(docs),
            'document_search_url': "/search-document/" + \
                url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
                url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
            'docket': {
                'id': docket.id,
                'title': docket.title,
            },
            'entity': {
                'id': entity.id,
                'name': entity.aliases[0]
            },
            'filter_type': document_type
        })
Example #20
    def get(self, request, *args, **kwargs):
        "Access aggregate information about entities as they occur in regulations.gov data."
        results = Entity.objects(id=kwargs['entity_id'])
        if not results:
            raise Http404('Entity not found.')

        entity = results[0]

        # basic docket metadata
        out = {
            'name': entity.aliases[0],
            'url': reverse('entity-view', args=args, kwargs=kwargs),
            'id': entity.id,
            'type': entity.td_type,
            'stats': entity.stats
        }

        stats = entity.stats
        if stats:
            # cleanup, plus stitch on some additional data
            now = datetime.datetime.now().date()
            for mention_type in ["text_mentions", "submitter_mentions"]:
                stats[mention_type].update({
                    'months': [
                        month for month in prettify_months(stats[mention_type]
                                                           ['months'])
                        if month['date_range'][0] <= now
                    ] if stats[mention_type]['months'] else [],
                })

                # limit ourselves to the top ten of each match type, and grab their extra metadata
                agencies = sorted(stats[mention_type]['agencies'].items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:10]

                stats[mention_type]['top_agencies'] = [{
                    'id':
                    item[0],
                    'count':
                    item[1],
                    'months':
                    prettify_months(
                        stats[mention_type]['agencies_by_month'][item[0]])
                } for item in agencies]
                del stats[mention_type]['agencies'], stats[mention_type][
                    'agencies_by_month']

                docket_list = stats[mention_type]['dockets'].items()
                years = request.GET.get('years', None)
                if years:
                    year_set = set(years.split(","))
                    docket_list = [
                        item for item in docket_list
                        if get_docket_year(item[0]) in year_set
                    ]
                dockets = sorted(docket_list, key=lambda x: x[1],
                                 reverse=True)[:10]

                stats[mention_type]['top_dockets'] = [{
                    'id': item[0],
                    'count': item[1]
                } for item in dockets]

                stats[mention_type]['docket_count'] = len(docket_list)
                del stats[mention_type]['dockets']

                stats[mention_type][
                    'docket_search_url'] = "/search-docket/" + url_quote(
                        ":".join([
                            "mentioned" if mention_type == "text_mentions" else
                            "submitter", entity.id,
                            '"%s"' % entity.aliases[0]
                        ]))

            # grab additional docket metadata
            ids = list(
                set([
                    record['id']
                    for record in stats['submitter_mentions']['top_dockets']
                ] + [
                    record['id']
                    for record in stats['text_mentions']['top_dockets']
                ]))
            dockets_search = Docket.objects(id__in=ids).only(
                'id', 'title', 'year', 'details.dk_type', 'agency',
                'stats.date_range')
            dockets = dict([(docket.id, docket) for docket in dockets_search])

            # stitch this back onto the main records
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for docket in stats[mention_type]['top_dockets']:
                    rdocket = dockets[docket['id']]
                    docket.update({
                        'title':
                        rdocket.title,
                        'url':
                        reverse('docket-view',
                                kwargs={'docket_id': rdocket.id}),
                        'year':
                        rdocket.year if rdocket.year else
                        (getattr(rdocket.stats['date_range'][0], 'year', None)
                         if 'date_range' in rdocket.stats else None),
                        'rulemaking':
                        rdocket.details.get(
                            'Type', 'Nonrulemaking').lower() == 'rulemaking',
                        'agency':
                        rdocket.agency if rdocket.agency else re.split(
                            "[-_]", rdocket.id)[0]
                    })

            # repeat for agencies
            ids = list(
                set([
                    record['id']
                    for record in stats['submitter_mentions']['top_agencies']
                ] + [
                    record['id']
                    for record in stats['text_mentions']['top_agencies']
                ]))
            agencies_search = Agency.objects(id__in=ids).only('id', 'name')
            agencies = dict([(agency.id, agency)
                             for agency in agencies_search])

            # ...and stitch
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for agency in stats[mention_type]['top_agencies']:
                    ragency = agencies.get(agency['id'], None)
                    agency.update({
                        'name':
                        ragency.name if ragency else agency['id'],
                        'url':
                        '/agency/%s' % agency['id']
                    })

            # and for comments
            recent_comments = []
            if 'recent_comments' in stats['submitter_mentions']:
                recent_comments_search = Doc.objects(id__in=[
                    doc['id']
                    for doc in stats['submitter_mentions']['recent_comments']
                ]).only('id', 'title', 'details')
                for comment in recent_comments_search:
                    comment_item = {
                        'title':
                        comment.title,
                        'date':
                        comment.details['Date_Posted'].date().isoformat()
                        if 'Date_Posted' in comment.details else None,
                        'author':
                        " ".join([
                            comment.details.get('First_Name', ''),
                            comment.details.get('Last_Name', '')
                        ]).strip(),
                        'organization':
                        comment.details.get('Organization_Name', ''),
                        'url':
                        '/document/' + comment.id
                    }
                    comment_item['author'] = comment_item[
                        'author'] if comment_item['author'] else None
                    recent_comments.append(comment_item)

            stats['submitter_mentions']['recent_comments'] = recent_comments

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0}

        return Response(out)
Example #21
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov documents."
        results = list(Doc.objects(id=kwargs['document_id']))
        if not results or results[0].deleted:
            raise Http404('Document not found.')

        document = results[0]

        # basic document metadata
        out = {
            'title': document.title,
            'url': reverse('document-view', kwargs=kwargs),
            'id': document.id,
            'agency': {
                'id': document.agency,
                'url': reverse('agency-view',
                               kwargs={'agency': document.agency}),
                'name': Agency.objects(id=document.agency).only("name")[0].name
            },
            'date': document.details.get('Date_Posted', None),
            'type': document.type,
            'views': [],
            'attachments': [],
            'details': document.details if document.details else {}
        }

        # inter-dataset suppression
        if 'replaced_by' in document.suppression:
            new_kwargs = dict(kwargs)
            new_kwargs['document_id'] = document.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

        # comment-on metadata
        if document.comment_on:
            # if we don't have all the data built in, grab it from its original record
            comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(
                id=document.comment_on['document_id']).to_mongo()
            out['comment_on'] = {
                "fr_doc":
                comment_on_doc.get('fr_doc', False),
                "type":
                comment_on_doc.get('type', None),
                "id":
                document.comment_on['document_id'],
                'url':
                reverse(
                    'document-view',
                    kwargs={'document_id':
                            document.comment_on['document_id']}),
                "title":
                comment_on_doc['title']
            }
            if comment_on_doc['agency'] == out['agency'][
                    'id'] or not comment_on_doc['agency']:
                out['comment_on']['agency'] = out['agency']
            else:
                out['comment_on']['agency'] = {
                    'id':
                    comment_on_doc['agency'],
                    'url':
                    reverse('agency-view',
                            kwargs={'agency': comment_on_doc['agency']}),
                    'name':
                    Agency.objects(
                        id=comment_on_doc['agency']).only("name")[0].name
                }
        else:
            out['comment_on'] = {}

        # docket metadata
        docket = Docket.objects(id=document.docket_id)[0]
        out['docket'] = {
            'id': document.docket_id,
            'url': reverse('docket-view',
                           kwargs={'docket_id': document.docket_id}),
            'title': docket.title,
            'weeks': [],
            'fr_docs': []
        }
        if docket.stats:
            out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
            out['docket']['fr_docs'] = docket.stats['doc_info'].get(
                'fr_docs', [])

        if out['date']:
            out['date'] = out['date'].isoformat()

        text_entities = set()
        submitter_entities = set(
            document.submitter_entities if document.submitter_entities else [])

        # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
        for view in (document.views[i] for i in xrange(len(document.views))):
            # hack to deal with documents whose scrapes failed but still got extracted
            object_id = document.object_id if document.object_id else view.file_path.split(
                '/')[-1].split('.')[0]
            out['views'].append({
                'object_id':
                object_id,
                'file_type':
                view.type,
                'file_type_label':
                TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted':
                view.extracted == 'yes',
                'url':
                view.download_url,
                'html':
                reverse('raw-text-view',
                        kwargs={
                            'document_id': document.id,
                            'file_type': view.type,
                            'output_format': 'html',
                            'view_type': 'view'
                        }) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)

        for attachment in (document.attachments[i]
                           for i in xrange(len(document.attachments))):
            a = {'title': attachment.title, 'views': []}
            for view in (attachment.views[i]
                         for i in xrange(len(attachment.views))):
                a['views'].append({
                    'object_id': attachment.object_id,
                    'file_type': view.type,
                    'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                    'extracted': view.extracted == 'yes',
                    'url': view.download_url,
                    'html': reverse('raw-text-view',
                                    kwargs={
                                        'document_id': document.id,
                                        'object_id': attachment.object_id,
                                        'file_type': view.type,
                                        'output_format': 'html',
                                        'view_type': 'attachment'
                                    }) if view.extracted == 'yes' else None
                })

                for entity in view.entities:
                    text_entities.add(entity)
            out['attachments'].append(a)

        # stats for FR docs
        stats = document.stats if document.stats else {'count': 0}
        # limit ourselves to the top five of each match type, and grab their extra metadata
        for label in ['text_entities', 'submitter_entities']:
            stats['top_' + label] = [
                {'id': entity_id, 'count': count}
                for entity_id, count in sorted(stats.get(label, {}).items(),
                                               key=lambda x: x[1],
                                               reverse=True)[:5]
            ]
            if label in stats:
                del stats[label]
        top_entities = set(
            [record['id'] for record in stats['top_text_entities']] +
            [record['id'] for record in stats['top_submitter_entities']])

        entities_search = Entity.objects(id__in=list(
            submitter_entities.union(text_entities, top_entities))).only(
                'id', 'td_type', 'aliases')
        entities = dict([(entity.id, entity) for entity in entities_search])

        for label, items in [('submitter_entities', sorted(submitter_entities)),
                             ('text_entities', sorted(text_entities))]:
            out[label] = [{
                'id': item,
                'type': entities[item].td_type,
                'name': entities[item].aliases[0],
                'url': '/%s/%s/%s' % (entities[item].td_type,
                                      slugify(entities[item].aliases[0]), item)
            } for item in items]

        for label in ['top_text_entities', 'top_submitter_entities']:
            for entity in stats[label]:
                if not entities[entity['id']].td_type:
                    continue

                entity['type'] = entities[entity['id']].td_type
                entity['name'] = entities[entity['id']].aliases[0]
                entity['url'] = '/%s/%s/%s' % (
                    entity['type'], slugify(entity['name']), entity['id'])

        if 'weeks' in stats:
            stats['weeks'] = prettify_weeks(stats['weeks'])

        recent_comments = []
        if 'recent_comments' in stats:
            recent_comments_search = Doc.objects(
                id__in=[doc['id'] for doc in stats['recent_comments']]).only(
                    'id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': (comment.details['Date_Posted'].date().isoformat()
                             if 'Date_Posted' in comment.details else None),
                    'author': " ".join([
                        comment.details.get('First_Name', ''),
                        comment.details.get('Last_Name', '')
                    ]).strip() or None,
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                recent_comments.append(comment_item)

        stats['recent_comments'] = recent_comments

        out['comment_stats'] = stats

        # links upstream
        out['source'] = document.source
        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            for replaced in document.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        # cleaned-up details
        details = out['details'].copy()
        dp = lambda key, default=None: details.pop(key, default)
        out['clean_details'] = dtls(
            ('Submitter Information',
             dtls(('Name',
                   combine(dp('First_Name'), dp('Middle_Name'),
                           dp('Last_Name'))),
                  ('Organization', dp('Organization_Name')),
                  ('Location',
                   combine(dp('Mailing_Address'),
                           dp('Mailing_Address_'),
                           dp('City'),
                           expand_state(dp('State_or_Province')),
                           dp('Postal_Code'),
                           dp('Country'),
                           sep=", ")), ('Email Address', dp('Email_Address')),
                  ('Phone Number', dp('Phone_Number')),
                  ('Fax Number', dp('Fax_Number')),
                  ("Submitter's Representative",
                   dp('Submitter_s_Representative')))),
            (
                'Dates and Times',
                dtls(
                    ('Document Date', dp('Document_Date')),  # rarely-used
                    ('Date Received', dp('Received_Date')),
                    ('Postmark Date', dp('Postmark_Date',
                                         dp('Post_Mark_Date'))),
                    ('Date Posted', dp('Date_Posted')),
                    (None, dp('Date')),  # Swallow this one, since it's always the same as Date_Posted
                    ('Comment Period',
                     combine(short_date(force_date(dp('Comment_Start_Date'))),
                             short_date(force_date(dp('Comment_Due_Date'))),
                             sep="&ndash;")),

                    # all the other dates -- don't even know what most of these are
                    ("File Date", dp("File_Date")),
                    ("Answer Date", dp("Answer_Date")),
                    ("Author Date", dp("Author_Date")),
                    ("Author Document Date", dp("Author_Document_Date")),
                    ("Effective Date", dp("Effective_Date")),
                    ("Implementation Date", dp("Implementation_Date")),
                    ("Implementation Service Date",
                     dp("Implementation_Service_Date")))),
            (
                'Citations and References',
                dtls(
                    ("RIN", document.rin if document.rin else None),
                    ("Federal Register No.", dp("Federal_Register_Number")),
                    ("Federal Register Pages", dp(
                        "Start_End_Page", "").replace(" - ", "&ndash;")),
                    (None, dp("Page_Count")),  # who cares?
                    (None, dp("Page_Start")),  # who cares?
                    ("Federal Register Citation",
                     dp("Federal_Register_Citation")),
                    ("CFR Section(s)", dp("CFR")),
                    ("Related RINs", dp("Related_RIN_s_")),
                )),
            ('Additional Details', dtls(*details.items())))

        return Response(out)
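The clean_details block above leans on two helpers, dtls and combine, that are not defined anywhere in this excerpt. The sketch below is only a guess at their behavior, reconstructed from the call sites; the OrderedDict return type and the empty-value filtering are assumptions, not something the source confirms.

# Hypothetical reconstructions of dtls() and combine(); not the project's actual code.
from collections import OrderedDict

def combine(*parts, **kwargs):
    # Join the truthy parts with sep (default a single space); return None if nothing survives.
    sep = kwargs.get('sep', ' ')
    kept = [p for p in parts if p]
    return sep.join(kept) if kept else None

def dtls(*pairs):
    # Keep only (label, value) pairs with a real label and a non-empty value, preserving order,
    # so (None, dp('Date')) swallows a field and empty sections drop out entirely.
    return OrderedDict((label, value) for label, value in pairs
                       if label is not None and value)

Under these assumptions, a section like 'Submitter Information' disappears from clean_details whenever every field inside it is blank, which matches how the nested dtls calls are composed.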
Example #22
0
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        out['source'] = self.item.source
        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            if 'Source_URL' in self.item.details:
                out['upstream_urls'].append({
                    'url': self.item.details['Source_URL'],
                    'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
                })
            for replaced in self.item.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}

                        if fr_doc.annotations.get('fr_data', None):
                            doc['summary'] = fr_doc.annotations['fr_data'].get(
                                'abstract', None)

                        if not doc.get('summary', None):
                            doc['summary'] = fr_doc.get_summary()

                        doc['comments_open'] = (
                            'Comment_Due_Date' in fr_doc.details and
                            force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now())

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        # keep comments_open at the top level of doc, matching the branch above
                        doc['stats'] = {'count': 0}
                        doc['comments_open'] = False
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(
                    summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title)
                               for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                    stats['similar_dockets'] = [{'id': docket_id, 'title': sd[docket_id]}
                                                for docket_id in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]

        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None

        if not agency:
            out['agency'] = None

        return Response(out)
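force_date(), used above to decide whether a Federal Register document's comment period is still open, is also not defined in this excerpt. Here is a minimal sketch under the assumption that details fields arrive either as datetime objects or as parseable date strings; the dateutil dependency is my assumption, not something the source shows.

# Hypothetical force_date(); a guess at the helper's behavior, not the project's actual code.
import datetime
from dateutil.parser import parse as parse_date  # assumed third-party dependency

def force_date(value):
    # Coerce a details value into a datetime so it can be compared against datetime.datetime.now().
    if value is None or isinstance(value, datetime.datetime):
        return value
    if isinstance(value, datetime.date):
        return datetime.datetime.combine(value, datetime.time())
    return parse_date(value)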
Example #23
0
from regs_models import Doc
import json
import itertools


def split_seq(iterable, size):
    it = iter(iterable)
    item = list(itertools.islice(it, size))
    while item:
        yield item
        item = list(itertools.islice(it, size))


# Re-flag problem documents (ids listed in /tmp/problems.json) so their PDF-derived HTML
# text gets re-extracted, re-indexed, and re-clustered, working in batches of 1,000 ids.
all_ids = json.load(open("/tmp/problems.json"))
for ids in split_seq(all_ids, 1000):
    for doc in Doc.objects(id__in=ids):
        for view in doc.views:
            if view.type == "pdf" and view.mode == "html" and view.extracted == "yes":
                view.extracted = "no"
                view.content.delete()
        for attachment in doc.attachments:
            for view in attachment.views:
                if view.type == "pdf" and view.mode == "html" and view.extracted == "yes":
                    view.extracted = "no"
                    view.content.delete()
        doc.in_search_index = False
        doc.in_cluster_db = False
        doc.entities_last_extracted = None

        print "Repaired %s" % doc.id
        doc.save()
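For reference, split_seq simply walks an arbitrary iterable in fixed-size chunks, so each Doc.objects(id__in=ids) query above touches at most 1,000 ids at a time. A quick interactive check of the chunking:

>>> list(split_seq(range(7), 3))
[[0, 1, 2], [3, 4, 5], [6]]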