def repair_missing_docket(docket_id): """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres. Note that this is a very limited form or repair, corresponding to the particular situation in which some malformed dockets have been deleted from Postgres by hand, but not marked as such on the Mongo side. As other particular problems arise we may add different repair methods. """ # only repair if MongoDB thinks that something should be in Postgres already if Doc.objects(docket_id=docket_id, in_cluster_db=True).count() == 0: return # does docket exist at all? corpora = get_corpora_by_metadata('docket_id', docket_id) if len(corpora) == 0: # neither parse exists, mark as unclustered in Mongo update_count = Doc.objects( docket_id=docket_id, in_cluster_db=True).update(set__in_cluster_db=False) print "Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % ( docket_id, update_count) ingest_docket(docket_id) elif len(corpora) == 1 or len(corpora) > 2: # we have a single or multiple parses...that's something unexpected that we can't fix automatically raise "Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % ( len(corpora), docket_id)
def ingest_docket(docket): print "Loading docket %s at %s..." % (docket.id, datetime.now()) deletions = list(Doc.objects(docket_id=docket.id, deleted=True, in_cluster_db=True, type='public_submission').scalar('id')) insertions = [ dict(text=doc_text(d), metadata=doc_metadata(d)) for d in Doc.objects(docket_id=docket.id, deleted=False, in_cluster_db=False, type='public_submission')] print "Found %s documents for deletion, %s documents for insertion or update." % (len(deletions), len(insertions)) if not insertions and not deletions: return with transaction.commit_on_success(): ingest_single_parse(docket, deletions, insertions, 'sentence') ingest_single_parse(docket, deletions, insertions, '4-gram') print "Marking MongoDB documents as analyzed at %s..." % datetime.now() update_count = Doc.objects(id__in=[d['metadata']['document_id'] for d in insertions]) \ .update(safe_update=True, set__in_cluster_db=True) if update_count != len(insertions): print "ERROR: %s documents inserted into Postgres, but only %s documents marked as analyzed in MongoDB." % (len(insertions), update_count) update_count = Doc.objects(id__in=deletions) \ .update(safe_update=True, set__in_cluster_db=False) if update_count != len(deletions): print "ERROR: %s documents deleted in Postgres, but only %s documents marked as deleted in MongoDB." % (len(deletions), update_count)
def delete_analysis(docket): with transaction.commit_on_success(): c = get_dual_corpora_by_metadata('docket_id', docket.id) if c: c.delete_corpus() print "Deleted docket %s (id=%s)." % (docket.id, c.id) else: print "Attempted deletion of %s. Docket not found." % docket.id Doc.objects(docket_id=docket.id).update(set__in_cluster_db=False)
def delete_analysis(docket_id): with transaction.commit_on_success(): c = get_dual_corpora_by_metadata('docket_id', docket_id) if c: c.delete_corpus() print "Deleted docket %s (id=%s)." % (docket_id, c.id) else: print "Attempted deletion of %s. Docket not found." % docket_id Doc.objects(docket_id=docket_id).update(set__in_cluster_db=False)
def print_stats(docket_id): print "MongoDB has\t%s in_cluster_db=True, deleted=False;\t%s in_cluster_db=False,deleted=False" % \ (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=False, type='public_submission').count(), Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=False, type='public_submission').count()) print "\t\t%s in_cluster_db=True, deleted=True;\t%s in_cluster_db=False,deleted=True" % \ (Doc.objects(docket_id=docket_id,in_cluster_db=True,deleted=True, type='public_submission').count(), Doc.objects(docket_id=docket_id,in_cluster_db=False,deleted=True, type='public_submission').count()) for corpus in get_corpora_by_metadata('docket_id', docket_id): print "Corpus %s (%s) has %s documents." % (corpus.id, corpus.metadata, corpus.num_docs())
def get(self, request, entity_id, docket_id, document_type, entity_type):
    """Return up to ten documents in `docket_id` that mention (or were
    submitted by) `entity_id`, plus counts and search URLs.

    `document_type` of 'mentions' selects text mentions; anything else
    selects submitter matches.  Raises Http404 if the docket or entity
    doesn't exist.
    """
    dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
    ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))

    if not dkt_results or not ent_results:
        raise Http404('Not found.')

    docket = dkt_results[0]
    entity = ent_results[0]

    if document_type == 'mentions':
        docs_q = Doc.objects(Q(attachments__views__entities=entity_id) | Q(views__entities=entity_id),
                             docket_id=docket_id)
    else:
        # FIX: the original ended this assignment with a stray line-continuation
        # backslash that glued it to the next statement (a SyntaxError).
        docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

    docs_q = docs_q.only('type', 'title', 'id', 'views', 'attachments.views',
                         'details.Date_Posted', 'deleted').hint([("docket_id", 1)])

    # newest first, undeleted only
    docs = filter(lambda d: not d.deleted,
                  sorted(list(docs_q),
                         key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900, 1, 1)),
                         reverse=True))

    # views on a document that actually mention the entity, shaped for output
    get_views = lambda doc: [{
        'object_id': view.object_id,
        'file_type': view.type,
        'url': view.url.replace('inline', 'attachment')
    } for view in doc.views if entity_id in view.entities]

    out_docs = []
    for doc in docs[:10]:
        out_doc = {
            'title': doc.title,
            'id': doc.id,
            'date_posted': doc.details['Date_Posted'],
            'type': doc.type,
            'url': '/document/' + doc.id
        }
        if document_type == 'mentions':
            out_doc['files'] = get_views(doc) + list(itertools.chain.from_iterable(
                [get_views(attachment) for attachment in doc.attachments]))
        out_docs.append(out_doc)

    return Response({
        'documents': out_docs,
        'has_more': len(docs) > 10,
        'count': len(docs),
        'document_search_url': "/search-document/" + \
            url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
            url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
        'docket': {
            'id': docket.id,
            'title': docket.title,
        },
        'entity': {
            'id': entity.id,
            'name': entity.aliases[0]
        },
        'filter_type': document_type
    })
def handle(self, **options): if options['parsable']: # disable standard output by monkey-patching sys.stdout dev_null = open('/dev/null', 'w') real_stdout = sys.stdout sys.stdout = dev_null doc_kwargs = {'type': 'public_submission'} if options.get('docket'): doc_kwargs['docket_id'] = options['docket'] elif options.get('agency'): doc_kwargs['agency'] = options['agency'] print "Enumerating dockets..." docket_list = list( Doc.objects( Q(deleted=True, in_cluster_db=True) | Q(deleted=False, in_cluster_db=False), **doc_kwargs).distinct('docket_id')) docket_count = len(docket_list) counter = 0 print "Beginning loading %s dockets at %s..." % (docket_count, datetime.now()) if options['fork']: print "Using forking strategy..." import multiprocessing for docket_id in docket_list: counter += 1 print "Docket #%s / %s" % (counter, docket_count) p = multiprocessing.Process(target=process_docket, args=[docket_id, options]) p.start() p.join() else: print "Using single-process strategy..." for docket_id in docket_list: counter += 1 print "Docket #%s / %s" % (counter, docket_count) process_docket(docket_id, options) print "Done." if options['parsable']: # turn stdout back on so we can print output sys.stdout = real_stdout print json.dumps({'dockets': docket_count})
def ingest_docket(docket_id): print "Loading docket %s at %s..." % (docket_id, datetime.now()) deletions = list( Doc.objects(docket_id=docket_id, deleted=True, in_cluster_db=True, type='public_submission').scalar('id')) insertions = [ dict(text=doc_text(d), metadata=doc_metadata(d)) for d in Doc.objects(docket_id=docket_id, deleted=False, in_cluster_db=False, type='public_submission') ] print "Found %s documents for deletion, %s documents for insertion or update." % ( len(deletions), len(insertions)) if not insertions and not deletions: return with transaction.commit_on_success(): ingest_single_parse(docket_id, deletions, insertions, 'sentence') ingest_single_parse(docket_id, deletions, insertions, '4-gram') print "Marking MongoDB documents as analyzed at %s..." % datetime.now() update_count = Doc.objects(id__in=[d['metadata']['document_id'] for d in insertions]) \ .update(set__in_cluster_db=True) if update_count != len(insertions): print "ERROR: %s documents inserted into Postgres, but only %s documents marked as analyzed in MongoDB." % ( len(insertions), update_count) update_count = Doc.objects(id__in=deletions) \ .update(set__in_cluster_db=False) if update_count != len(deletions): print "ERROR: %s documents deleted in Postgres, but only %s documents marked as deleted in MongoDB." % ( len(deletions), update_count)
def repair_missing_docket(docket): """Recreate any dockets that Mongo thinks are analyzed already but aren't in Postgres. Note that this is a very limited form or repair, corresponding to the particular situation in which some malformed dockets have been deleted from Postgres by hand, but not marked as such on the Mongo side. As other particular problems arise we may add different repair methods. """ # only repair if MongoDB thinks that something should be in Postgres already if Doc.objects(docket_id=docket.id, in_cluster_db=True).count() == 0: return # does docket exist at all? corpora = get_corpora_by_metadata('docket_id', docket.id) if len(corpora) == 0: # neither parse exists, mark as unclustered in Mongo update_count = Doc.objects(docket_id=docket.id, in_cluster_db=True).update(safe_update=True, set__in_cluster_db=False) print "Docket %s missing in Postgres. Marked %s documents with in_cluster_db=False." % (docket.id, update_count) ingest_docket(docket) elif len(corpora) == 1 or len(corpora) > 2: # we have a single or multiple parses...that's something unexpected that we can't fix automatically raise "Found %s corpora for docket %s. Expected either 0 or 2 corpora. Must fix by hand." % (len(corpora), docket.id)
def handle(self, **options): if options['parsable']: # disable standard output by monkey-patching sys.stdout dev_null = open('/dev/null', 'w') real_stdout = sys.stdout sys.stdout = dev_null doc_kwargs = {'type': 'public_submission'} if options.get('docket'): doc_kwargs['docket_id'] = options['docket'] elif options.get('agency'): doc_kwargs['agency'] = options['agency'] print "Enumerating dockets..." docket_list = list(Doc.objects(Q(deleted=True, in_cluster_db=True) | Q(deleted=False, in_cluster_db=False), **doc_kwargs).distinct('docket_id')) docket_count = len(docket_list) counter = 0 print "Beginning loading %s dockets at %s..." % (docket_count, datetime.now()) if options['fork']: print "Using forking strategy..." import multiprocessing for docket_id in docket_list: counter += 1 print "Docket #%s / %s" % (counter, docket_count) p = multiprocessing.Process(target=process_docket, args=[docket_id, options]) p.start() p.join() else: print "Using single-process strategy..." for docket_id in docket_list: counter += 1 print "Docket #%s / %s" % (counter, docket_count) process_docket(docket_id, options) print "Done." if options['parsable']: # turn stdout back on so we can print output sys.stdout = real_stdout print json.dumps({'dockets': docket_count})
def get(self, request, document_id, file_type, object_id):
    """Stream the stored file identified by (file_type, object_id) on the
    given document or one of its attachments."""
    docs = list(Doc.objects(id=document_id))
    if not docs:
        raise Http404("Document not found")
    doc = docs[0]

    # figure out which view it is: the doc's own views, then every attachment's
    attachment_views = itertools.chain.from_iterable(a.views for a in doc.attachments)
    all_views = itertools.chain(doc.views, attachment_views)
    matches = [v for v in all_views
               if v.type == file_type and v.object_id == object_id]
    if not matches:
        raise Http404("File record not found")
    match = matches[0]

    if not (match.downloaded and match.file_path and os.path.exists(match.file_path)):
        raise Http404("File not found")

    # we're good to go; gather some info about the file
    mimetype = magic.from_file(match.file_path, mime=True)
    extension = mimetypes.guess_extension(mimetype) or (".%s" % match.type)

    return sendfile(request, match.file_path, attachment=True,
                    attachment_filename="%s%s" % (match.object_id, extension),
                    mimetype=mimetype)
def get(self, request, document_id, file_type, object_id):
    """Serve the downloaded file matching (file_type, object_id) on the
    requested document or one of its attachments."""
    found = list(Doc.objects(id=document_id))
    if not found:
        raise Http404("Document not found")
    document = found[0]

    # figure out which view it is: gather the document's views followed by
    # every attachment's views, in order
    candidate_views = list(document.views)
    for attachment in document.attachments:
        candidate_views.extend(attachment.views)

    matching = [view for view in candidate_views
                if view.type == file_type and view.object_id == object_id]
    if not matching:
        raise Http404("File record not found")
    view = matching[0]

    if not view.downloaded or not view.file_path or not os.path.exists(view.file_path):
        raise Http404("File not found")

    # we're good to go; gather some info about the file
    mimetype = magic.from_file(view.file_path, mime=True)
    extension = mimetypes.guess_extension(mimetype)
    if not extension:
        extension = ".%s" % view.type

    return sendfile(request, view.file_path, attachment=True,
                    attachment_filename="%s%s" % (view.object_id, extension),
                    mimetype=mimetype)
GEVENT = False from regs_models import Doc import json import itertools def split_seq(iterable, size): it = iter(iterable) item = list(itertools.islice(it, size)) while item: yield item item = list(itertools.islice(it, size)) all_ids = json.load(open("/tmp/problems.json")) for ids in split_seq(all_ids, 1000): for doc in Doc.objects(id__in=ids): for view in doc.views: if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": view.extracted = "no" view.content.delete() for attachment in doc.attachments: for view in attachment.views: if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": view.extracted = "no" view.content.delete() doc.in_search_index = False doc.in_cluster_db = False doc.entities_last_extracted = None print "Repaired %s" % doc.id doc.save()
def get(self, request, *args, **kwargs):
    """Augment the base DocketView payload with FR-document stats/summaries,
    similar dockets derived from those summaries, and agency metadata."""
    out = super(DocketView, self).get(request, *args, **kwargs).data
    stats = out['stats']
    stats['similar_dockets'] = []
    summaries = []
    if stats['count'] > 0:
        # do a similar thing with FR documents
        if stats.get('doc_info', {}).get('fr_docs', None):
            # fetch the real FR Doc records so we can attach stats/summaries
            fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
            fr_search = Doc.objects(id__in=fr_doc_ids)
            fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])
            for doc in stats['doc_info']['fr_docs']:
                if doc['id'] in fr_docs:
                    fr_doc = fr_docs[doc['id']]
                    doc['stats'] = {
                        'date_range': fr_doc.stats['date_range'],
                        'count': fr_doc.stats['count']
                    } if fr_doc.stats else {'count': 0}
                    doc['summary'] = fr_doc.get_summary()
                    # NOTE(review): unlike the sibling view, this compares
                    # details['Comment_Due_Date'] without force_date() —
                    # confirm the stored value is always a datetime here.
                    doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and fr_doc.details['Comment_Due_Date'] > datetime.datetime.now()
                    if doc['summary']:
                        summaries.append(doc['summary'])
                else:
                    # FR doc listed in stats but missing from the collection
                    doc['stats'] = {'count': 0, 'comments_open': False}
                    doc['summary'] = None
            # remove duplicates, if any
            tmp = stats['doc_info']['fr_docs']
            included = set()
            stats['doc_info']['fr_docs'] = []
            for doc in tmp:
                if doc['id'] not in included:
                    stats['doc_info']['fr_docs'].append(doc)
                    included.add(doc['id'])
        # use the concatenated FR summaries to find up to 3 similar dockets
        summary_text = "\n".join(summaries)
        if summary_text:
            similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
            if similar_dockets:
                sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                stats['similar_dockets'] = [{
                    'id': docket,
                    'title': sd[docket]
                } for docket in similar_dockets]
    # agency: prefer the stored value, otherwise derive it from the docket id prefix
    agency = self.item.agency
    if not agency:
        agency = re.split("[-_]", self.item.id)[0]
    if agency:
        agency_meta = list(Agency.objects(id=agency).only("name"))
        if agency_meta:
            out['agency'] = {
                'id': agency,
                'name': agency_meta[0].name,
                'url': '/agency/%s' % agency
            }
        else:
            # derived prefix didn't match a known agency
            agency = None
    if not agency:
        out['agency'] = None
    return Response(out)
def get(self, request, *args, **kwargs):
    "Access aggregate information about entities as they occur in regulations.gov data."
    results = Entity.objects(id=kwargs['entity_id'])
    if not results:
        raise Http404('Docket not found.')
    entity = results[0]

    # basic docket metadata
    out = {
        'name': entity.aliases[0],
        'url': reverse('entity-view', args=args, kwargs=kwargs),
        'id': entity.id,
        'type': entity.td_type,
        'stats': entity.stats
    }

    stats = entity.stats
    if stats:
        # cleanup, plus stitch on some additional data
        now = datetime.datetime.now().date()
        for mention_type in ["text_mentions", "submitter_mentions"]:
            # drop months that start in the future
            stats[mention_type].update({
                'months': [month for month in prettify_months(stats[mention_type]['months']) if month['date_range'][0] <= now] if stats[mention_type]['months'] else [],
            })

            # limit ourselves to the top ten of each match type, and grab their extra metadata
            agencies = sorted(stats[mention_type]['agencies'].items(), key=lambda x: x[1], reverse=True)[:10]
            stats[mention_type]['top_agencies'] = [{
                'id': item[0],
                'count': item[1],
                'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
            } for item in agencies]
            # raw maps replaced by the top-N lists above
            del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

            docket_list = stats[mention_type]['dockets'].items()
            # optional ?years=YYYY,YYYY filter on docket years
            years = request.GET.get('years', None)
            if years:
                year_set = set(years.split(","))
                docket_list = [item for item in docket_list if get_docket_year(item[0]) in year_set]

            dockets = sorted(docket_list, key=lambda x: x[1], reverse=True)[:10]
            stats[mention_type]['top_dockets'] = [{
                'id': item[0],
                'count': item[1]
            } for item in dockets]
            stats[mention_type]['docket_count'] = len(docket_list)
            del stats[mention_type]['dockets']

            stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(":".join(["mentioned" if mention_type == "text_mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]]))

        # grab additional docket metadata
        ids = list(set([record['id'] for record in stats['submitter_mentions']['top_dockets']] + [record['id'] for record in stats['text_mentions']['top_dockets']]))
        dockets_search = Docket.objects(id__in=ids).only('id', 'title', 'year', 'details.dk_type', 'agency', 'stats.date_range')
        dockets = dict([(docket.id, docket) for docket in dockets_search])

        # stitch this back onto the main records
        for mention_type in ['text_mentions', 'submitter_mentions']:
            for docket in stats[mention_type]['top_dockets']:
                rdocket = dockets[docket['id']]
                docket.update({
                    'title': rdocket.title,
                    'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                    # fall back to the first stats date if the year field is unset
                    'year': rdocket.year if rdocket.year else (getattr(rdocket.stats['date_range'][0], 'year', None) if 'date_range' in rdocket.stats else None),
                    'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                    # fall back to the docket-id prefix when agency is unset
                    'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                })

        # repeat for agencies
        ids = list(set([record['id'] for record in stats['submitter_mentions']['top_agencies']] + [record['id'] for record in stats['text_mentions']['top_agencies']]))
        agencies_search = Agency.objects(id__in=ids).only('id', 'name')
        agencies = dict([(agency.id, agency) for agency in agencies_search])

        # ...and stitch
        for mention_type in ['text_mentions', 'submitter_mentions']:
            for agency in stats[mention_type]['top_agencies']:
                ragency = agencies.get(agency['id'], None)
                agency.update({
                    'name': ragency.name if ragency else agency['id'],
                    'url': '/agency/%s' % agency['id']
                })

        # and for comments
        recent_comments = []
        if 'recent_comments' in stats['submitter_mentions']:
            recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['submitter_mentions']['recent_comments']]).only('id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                # empty author string becomes an explicit None
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)
            stats['submitter_mentions']['recent_comments'] = recent_comments
        out['stats'] = stats
    else:
        out['stats'] = {'count': 0}
    return Response(out)
def get(self, request, *args, **kwargs):
    "Access basic metadata about regulations.gov documents."
    results = list(Doc.objects(id=kwargs['document_id']))
    if not results or results[0].deleted:
        raise Http404('Document not found.')
    document = results[0]

    # basic document metadata
    out = {
        'title': document.title,
        'url': reverse('document-view', kwargs=kwargs),
        'id': document.id,
        'agency': {
            'id': document.agency,
            'url': reverse('agency-view', kwargs={'agency': document.agency}),
            'name': Agency.objects(id=document.agency).only("name")[0].name
        },
        'date': document.details.get('Date_Posted', None),
        'type': document.type,
        'views': [],
        'attachments': [],
        'details': document.details if document.details else {}
    }

    # inter-dataset suppression: point clients at the replacement record
    if 'replaced_by' in document.suppression:
        new_kwargs = dict(kwargs)
        new_kwargs['document_id'] = document.suppression['replaced_by'][0]
        out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

    # comment-on metadata
    if document.comment_on:
        # if we don't have all the data built in, grab it from its original record
        comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(id=document.comment_on['document_id']).to_mongo()
        out['comment_on'] = {
            "fr_doc": comment_on_doc.get('fr_doc', False),
            "type": comment_on_doc.get('type', None),
            "id": document.comment_on['document_id'],
            'url': reverse('document-view', kwargs={'document_id': document.comment_on['document_id']}),
            "title": comment_on_doc['title']
        }
        # reuse this document's agency block when the target shares (or lacks) an agency
        if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
            out['comment_on']['agency'] = out['agency']
        else:
            out['comment_on']['agency'] = {
                'id': comment_on_doc['agency'],
                'url': reverse('agency-view', kwargs={'agency': comment_on_doc['agency']}),
                'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
            }
    else:
        out['comment_on'] = {}

    # docket metadata
    docket = Docket.objects(id=document.docket_id)[0]
    out['docket'] = {
        'id': document.docket_id,
        'url': reverse('docket-view', kwargs={'docket_id': document.docket_id}),
        'title': docket.title,
        'weeks': [],
        'fr_docs': []
    }
    if docket.stats:
        out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
        out['docket']['fr_docs'] = docket.stats['doc_info'].get('fr_docs', [])

    if out['date']:
        out['date'] = out['date'].isoformat()

    text_entities = set()
    submitter_entities = set(document.submitter_entities if document.submitter_entities else [])

    # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
    for view in (document.views[i] for i in xrange(len(document.views))):
        # hack to deal with documents whose scrapes failed but still got extracted
        object_id = document.object_id if document.object_id else view.file_path.split('/')[-1].split('.')[0]
        out['views'].append({
            'object_id': object_id,
            'file_type': view.type,
            'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
            'extracted': view.extracted == 'yes',
            'url': view.download_url,
            'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'view'}) if view.extracted == 'yes' else None
        })
        for entity in view.entities:
            text_entities.add(entity)
    # same generator trick for attachments and their views
    for attachment in (document.attachments[i] for i in xrange(len(document.attachments))):
        a = {
            'title': attachment.title,
            'views': []
        }
        for view in (attachment.views[i] for i in xrange(len(attachment.views))):
            a['views'].append({
                'object_id': attachment.object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'object_id': attachment.object_id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'attachment'}) if view.extracted == 'yes' else None
            })
            for entity in view.entities:
                text_entities.add(entity)
        out['attachments'].append(a)

    # stats for FR docs
    stats = document.stats if document.stats else {'count': 0}

    # limit ourselves to the top five of each match type, and grab their extra metadata
    for label in ['text_entities', 'submitter_entities']:
        stats['top_' + label] = [{
            'id': i[0],
            'count': i[1]
        } for i in sorted(stats.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:5]]
        if label in stats:
            del stats[label]

    # one Entity fetch covers submitter, text, and top-N entity ids
    top_entities = set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']])
    entities_search = Entity.objects(id__in=list(submitter_entities.union(text_entities, top_entities))).only('id', 'td_type', 'aliases')
    entities = dict([(entity.id, entity) for entity in entities_search])

    for label, items in [('submitter_entities', sorted(list(submitter_entities))), ('text_entities', sorted(list(text_entities)))]:
        out[label] = [{
            'id': item,
            'type': entities[item].td_type,
            'name': entities[item].aliases[0],
            'url': '/%s/%s/%s' % (entities[item].td_type, slugify(entities[item].aliases[0]), item)
        } for item in items]

    for label in ['top_text_entities', 'top_submitter_entities']:
        for entity in stats[label]:
            # skip entities without a type; they can't be linked
            if not entities[entity['id']].td_type:
                continue
            entity['type'] = entities[entity['id']].td_type
            entity['name'] = entities[entity['id']].aliases[0]
            entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])

    if 'weeks' in stats:
        stats['weeks'] = prettify_weeks(stats['weeks'])

    recent_comments = []
    if 'recent_comments' in stats:
        recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['recent_comments']]).only('id', 'title', 'details')
        for comment in recent_comments_search:
            comment_item = {
                'title': comment.title,
                'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                'organization': comment.details.get('Organization_Name', ''),
                'url': '/document/' + comment.id
            }
            # empty author string becomes an explicit None
            comment_item['author'] = comment_item['author'] if comment_item['author'] else None
            recent_comments.append(comment_item)
        stats['recent_comments'] = recent_comments
    out['comment_stats'] = stats

    # links upstream
    out['source'] = document.source
    out['upstream_urls'] = []
    if out['source'] == 'regulations.gov':
        out['upstream_urls'].append({
            'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
            'label': 'Regulations.gov'
        })
    elif out['source'] == 'sec_cftc':
        # sec_cftc records link back to whatever regulations.gov docs they replace
        for replaced in document.suppression.get('replaces', []):
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                'label': 'Regulations.gov'
            })

    # cleaned-up details: dp() pops keys so leftovers land in 'Additional Details'
    details = out['details'].copy()
    dp = lambda key, default=None: details.pop(key, default)
    out['clean_details'] = dtls(
        ('Submitter Information', dtls(
            ('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
            ('Organization', dp('Organization_Name')),
            ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
            ('Email Address', dp('Email_Address')),
            ('Phone Number', dp('Phone_Number')),
            ('Fax Number', dp('Fax_Number')),
            ("Submitter's Representative", dp('Submitter_s_Representative'))
        )),
        ('Dates and Times', dtls(
            ('Document Date', dp('Document_Date')),  # rarely-used
            ('Date Received', dp('Received_Date')),
            ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
            ('Date Posted', dp('Date_Posted')),
            (None, dp('Date')),  # Swallow this one, since it's always the same as Date_Posted,
            ('Comment Period', combine(
                short_date(force_date(dp('Comment_Start_Date'))),
                short_date(force_date(dp('Comment_Due_Date'))),
                sep="–"
            )),
            # all the other dates -- don't even know what most of these are
            ("File Date", dp("File_Date")),
            ("Answer Date", dp("Answer_Date")),
            ("Author Date", dp("Author_Date")),
            ("Author Document Date", dp("Author_Document_Date")),
            ("Effective Date", dp("Effective_Date")),
            ("Implementation Date", dp("Implementation_Date")),
            ("Implementation Service Date", dp("Implementation_Service_Date"))
        )),
        ('Citations and References', dtls(
            ("RIN", document.rin if document.rin else None),
            ("Federal Register No.", dp("Federal_Register_Number")),
            ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "–")),
            (None, dp("Page_Count")),  # who cares?
            (None, dp("Page_Start")),  # who cares?
            ("Federal Register Citation", dp("Federal_Register_Citation")),
            ("CFR Section(s)", dp("CFR")),
            ("Related RINs", dp("Related_RIN_s_")),
        )),
        ('Additional Details', dtls(*details.items()))
    )

    return Response(out)
def get(self, request, *args, **kwargs):
    """Augment the base DocketView payload with upstream source links,
    FR-document stats/summaries, similar dockets, and agency metadata."""
    out = super(DocketView, self).get(request, *args, **kwargs).data
    out['source'] = self.item.source
    stats = out['stats']
    stats['similar_dockets'] = []
    summaries = []

    # links back to the upstream data source(s)
    out['upstream_urls'] = []
    if out['source'] == 'regulations.gov':
        out['upstream_urls'].append({
            'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
            'label': 'Regulations.gov'
        })
    elif out['source'] == 'sec_cftc':
        if 'Source_URL' in self.item.details:
            out['upstream_urls'].append({
                'url': self.item.details['Source_URL'],
                'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
            })
        # also link any regulations.gov dockets this record replaces
        for replaced in self.item.suppression.get('replaces', []):
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                'label': 'Regulations.gov'
            })

    if stats['count'] > 0:
        # do a similar thing with FR documents
        if stats.get('doc_info', {}).get('fr_docs', None):
            # fetch the real FR Doc records so we can attach stats/summaries
            fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
            fr_search = Doc.objects(id__in=fr_doc_ids)
            fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])
            for doc in stats['doc_info']['fr_docs']:
                if doc['id'] in fr_docs:
                    fr_doc = fr_docs[doc['id']]
                    doc['stats'] = {
                        'date_range': fr_doc.stats['date_range'],
                        'count': fr_doc.stats['count']
                    } if fr_doc.stats else {'count': 0}
                    # prefer the FR abstract; fall back to the computed summary
                    if fr_doc.annotations.get('fr_data', None):
                        doc['summary'] = fr_doc.annotations['fr_data'].get('abstract', None)
                    if not doc.get('summary', None):
                        doc['summary'] = fr_doc.get_summary()
                    doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now()
                    if doc['summary']:
                        summaries.append(doc['summary'])
                else:
                    # FR doc listed in stats but missing from the collection
                    doc['stats'] = {'count': 0, 'comments_open': False}
                    doc['summary'] = None
            # remove duplicates, if any
            tmp = stats['doc_info']['fr_docs']
            included = set()
            stats['doc_info']['fr_docs'] = []
            for doc in tmp:
                if doc['id'] not in included:
                    stats['doc_info']['fr_docs'].append(doc)
                    included.add(doc['id'])
        # use the concatenated FR summaries to find up to 3 similar dockets
        summary_text = "\n".join(summaries)
        if summary_text:
            similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
            if similar_dockets:
                sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                stats['similar_dockets'] = [{
                    'id': docket,
                    'title': sd[docket]
                } for docket in similar_dockets]

    # agency: prefer the stored value, otherwise derive it from the docket id prefix
    agency = self.item.agency
    if not agency:
        agency = re.split("[-_]", self.item.id)[0]
    if agency:
        agency_meta = list(Agency.objects(id=agency).only("name"))
        if agency_meta:
            out['agency'] = {
                'id': agency,
                'name': agency_meta[0].name,
                'url': '/agency/%s' % agency
            }
        else:
            # derived prefix didn't match a known agency
            agency = None
    if not agency:
        out['agency'] = None
    return Response(out)
def get(self, request, entity_id, docket_id, document_type, entity_type):
    """Return up to ten documents in `docket_id` that mention (or were
    submitted by) `entity_id`, plus counts and search URLs.

    `document_type` of 'mentions' selects text mentions; anything else
    selects submitter matches.  Raises Http404 if the docket or entity
    doesn't exist.
    """
    dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
    ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))

    if not dkt_results or not ent_results:
        raise Http404('Not found.')

    docket = dkt_results[0]
    entity = ent_results[0]

    if document_type == 'mentions':
        docs_q = Doc.objects(Q(attachments__views__entities=entity_id) | Q(views__entities=entity_id),
                             docket_id=docket_id)
    else:
        # FIX: the original ended this assignment with a stray line-continuation
        # backslash that glued it to the next statement (a SyntaxError).
        docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

    docs_q = docs_q.only('type', 'title', 'id', 'views', 'attachments.views',
                         'details.Date_Posted', 'deleted').hint([("docket_id", 1)])

    # newest first, undeleted only
    docs = filter(
        lambda d: not d.deleted,
        sorted(list(docs_q),
               key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900, 1, 1)),
               reverse=True))

    # views on a document that actually mention the entity, shaped for output
    get_views = lambda doc: [
        {
            'object_id': view.object_id,
            'file_type': view.type,
            'url': view.url.replace('inline', 'attachment')
        } for view in doc.views if entity_id in view.entities
    ]

    out_docs = []
    for doc in docs[:10]:
        out_doc = {
            'title': doc.title,
            'id': doc.id,
            'date_posted': doc.details['Date_Posted'],
            'type': doc.type,
            'url': '/document/' + doc.id
        }
        if document_type == 'mentions':
            out_doc['files'] = get_views(doc) + list(
                itertools.chain.from_iterable([
                    get_views(attachment) for attachment in doc.attachments
                ]))
        out_docs.append(out_doc)

    return Response({
        'documents': out_docs,
        'has_more': len(docs) > 10,
        'count': len(docs),
        'document_search_url': "/search-document/" + \
            url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
            url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
        'docket': {
            'id': docket.id,
            'title': docket.title,
        },
        'entity': {
            'id': entity.id,
            'name': entity.aliases[0]
        },
        'filter_type': document_type
    })
def get(self, request, *args, **kwargs):
    """Access aggregate information about entities as they occur in regulations.gov data.

    Builds a JSON-ready dict with the entity's name/type plus, when stats
    exist, per-mention-type ("text_mentions"/"submitter_mentions") month
    histograms, top-ten agencies and dockets (stitched with Docket/Agency
    metadata), and recently submitted comments. Supports an optional
    comma-separated `years` query parameter restricting docket counts.

    Raises Http404 if the entity id is unknown.
    """
    results = Entity.objects(id=kwargs['entity_id'])
    if not results:
        # BUG FIX: message previously said 'Docket not found.' -- this
        # endpoint looks up an entity, not a docket.
        raise Http404('Entity not found.')

    entity = results[0]

    # basic entity metadata
    out = {
        'name': entity.aliases[0],
        'url': reverse('entity-view', args=args, kwargs=kwargs),
        'id': entity.id,
        'type': entity.td_type,
        'stats': entity.stats
    }

    stats = entity.stats
    if stats:
        # cleanup, plus stitch on some additional data
        now = datetime.datetime.now().date()
        for mention_type in ["text_mentions", "submitter_mentions"]:
            # prettify month buckets, dropping months that start in the future
            stats[mention_type].update({
                'months': [month for month in prettify_months(stats[mention_type]['months']) if month['date_range'][0] <= now] if stats[mention_type]['months'] else [],
            })

            # limit ourselves to the top ten of each match type, and grab their extra metadata
            agencies = sorted(stats[mention_type]['agencies'].items(), key=lambda x: x[1], reverse=True)[:10]
            stats[mention_type]['top_agencies'] = [{
                'id': item[0],
                'count': item[1],
                'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
            } for item in agencies]
            # the raw maps are bulky; drop them now that top_agencies is built
            del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

            # same for dockets, optionally filtered to the requested years
            docket_list = stats[mention_type]['dockets'].items()
            years = request.GET.get('years', None)
            if years:
                year_set = set(years.split(","))
                docket_list = [item for item in docket_list if get_docket_year(item[0]) in year_set]
            dockets = sorted(docket_list, key=lambda x: x[1], reverse=True)[:10]
            stats[mention_type]['top_dockets'] = [{
                'id': item[0],
                'count': item[1]
            } for item in dockets]
            stats[mention_type]['docket_count'] = len(docket_list)
            del stats[mention_type]['dockets']

            stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(":".join(["mentioned" if mention_type == "text_mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]]))

        # grab additional docket metadata for everything that made a top-ten list
        ids = list(set([record['id'] for record in stats['submitter_mentions']['top_dockets']] + [record['id'] for record in stats['text_mentions']['top_dockets']]))
        dockets_search = Docket.objects(id__in=ids).only('id', 'title', 'year', 'details.dk_type', 'agency', 'stats.date_range')
        dockets = dict([(docket.id, docket) for docket in dockets_search])

        # stitch this back onto the main records
        for mention_type in ['text_mentions', 'submitter_mentions']:
            for docket in stats[mention_type]['top_dockets']:
                # NOTE(review): raises KeyError if a counted docket has no
                # Docket record; the agency stitch below tolerates missing
                # records -- confirm whether the same can happen here.
                rdocket = dockets[docket['id']]
                docket.update({
                    'title': rdocket.title,
                    'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                    # fall back to the year of the docket's first recorded activity
                    'year': rdocket.year if rdocket.year else (getattr(rdocket.stats['date_range'][0], 'year', None) if 'date_range' in rdocket.stats else None),
                    'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                    # agency ids conventionally prefix the docket id, e.g. "EPA-..."
                    'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                })

        # repeat for agencies
        ids = list(set([record['id'] for record in stats['submitter_mentions']['top_agencies']] + [record['id'] for record in stats['text_mentions']['top_agencies']]))
        agencies_search = Agency.objects(id__in=ids).only('id', 'name')
        agencies = dict([(agency.id, agency) for agency in agencies_search])

        # ...and stitch (tolerating agencies we have no record for)
        for mention_type in ['text_mentions', 'submitter_mentions']:
            for agency in stats[mention_type]['top_agencies']:
                ragency = agencies.get(agency['id'], None)
                agency.update({
                    'name': ragency.name if ragency else agency['id'],
                    'url': '/agency/%s' % agency['id']
                })

        # and for comments: expand stored ids into title/date/author snippets
        recent_comments = []
        if 'recent_comments' in stats['submitter_mentions']:
            recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['submitter_mentions']['recent_comments']]).only('id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                # normalize an empty author string to None
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)
        stats['submitter_mentions']['recent_comments'] = recent_comments

        out['stats'] = stats
    else:
        # no precomputed stats at all
        out['stats'] = {'count': 0}

    return Response(out)
def get(self, request, *args, **kwargs):
    "Access basic metadata about regulations.gov documents."
    # Load the document; a soft-deleted document is treated as missing.
    results = list(Doc.objects(id=kwargs['document_id']))
    if not results or results[0].deleted:
        raise Http404('Document not found.')

    document = results[0]

    # basic document metadata
    out = {
        'title': document.title,
        'url': reverse('document-view', kwargs=kwargs),
        'id': document.id,
        'agency': {
            'id': document.agency,
            'url': reverse('agency-view', kwargs={'agency': document.agency}),
            'name': Agency.objects(id=document.agency).only("name")[0].name
        },
        'date': document.details.get('Date_Posted', None),
        'type': document.type,
        'views': [],
        'attachments': [],
        'details': document.details if document.details else {}
    }

    # inter-dataset suppression: point the client at the replacement record
    if 'replaced_by' in document.suppression:
        new_kwargs = dict(kwargs)
        new_kwargs['document_id'] = document.suppression['replaced_by'][0]
        out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

    # comment-on metadata (the document this one is a comment on)
    if document.comment_on:
        # if we don't have all the data built in, grab it from its original record
        comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(id=document.comment_on['document_id']).to_mongo()
        out['comment_on'] = {
            "fr_doc": comment_on_doc.get('fr_doc', False),
            "type": comment_on_doc.get('type', None),
            "id": document.comment_on['document_id'],
            'url': reverse('document-view', kwargs={'document_id': document.comment_on['document_id']}),
            "title": comment_on_doc['title']
        }
        # reuse this document's agency dict when the target shares (or lacks) an agency
        if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
            out['comment_on']['agency'] = out['agency']
        else:
            out['comment_on']['agency'] = {
                'id': comment_on_doc['agency'],
                'url': reverse('agency-view', kwargs={'agency': comment_on_doc['agency']}),
                'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
            }
    else:
        out['comment_on'] = {}

    # docket metadata
    docket = Docket.objects(id=document.docket_id)[0]
    out['docket'] = {
        'id': document.docket_id,
        'url': reverse('docket-view', kwargs={'docket_id': document.docket_id}),
        'title': docket.title,
        'weeks': [],
        'fr_docs': []
    }
    if docket.stats:
        out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
        out['docket']['fr_docs'] = docket.stats['doc_info'].get('fr_docs', [])

    if out['date']:
        out['date'] = out['date'].isoformat()

    # collect entity ids as we walk the views below
    text_entities = set()
    submitter_entities = set(document.submitter_entities if document.submitter_entities else [])

    # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
    for view in (document.views[i] for i in xrange(len(document.views))):
        # hack to deal with documents whose scrapes failed but still got extracted
        object_id = document.object_id if document.object_id else view.file_path.split('/')[-1].split('.')[0]
        out['views'].append({
            'object_id': object_id,
            'file_type': view.type,
            'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
            'extracted': view.extracted == 'yes',
            'url': view.download_url,
            # raw-text link only exists once extraction has succeeded
            'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'view'}) if view.extracted == 'yes' else None
        })

        for entity in view.entities:
            text_entities.add(entity)

    # same treatment for attachments and their views
    for attachment in (document.attachments[i] for i in xrange(len(document.attachments))):
        a = {'title': attachment.title, 'views': []}
        for view in (attachment.views[i] for i in xrange(len(attachment.views))):
            a['views'].append({
                'object_id': attachment.object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'object_id': attachment.object_id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'attachment'}) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)
        out['attachments'].append(a)

    # stats for FR docs
    stats = document.stats if document.stats else {'count': 0}

    # limit ourselves to the top five of each match type, and grab their extra metadata
    for label in ['text_entities', 'submitter_entities']:
        stats['top_' + label] = [{
            'id': i[0],
            'count': i[1]
        } for i in sorted(stats.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:5]]
        # drop the raw count maps once the top-five lists are built
        if label in stats:
            del stats[label]

    top_entities = set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']])

    # one query for every entity we will need metadata for
    entities_search = Entity.objects(id__in=list(submitter_entities.union(text_entities, top_entities))).only('id', 'td_type', 'aliases')
    entities = dict([(entity.id, entity) for entity in entities_search])

    for label, items in [('submitter_entities', sorted(list(submitter_entities))), ('text_entities', sorted(list(text_entities)))]:
        out[label] = [{
            'id': item,
            'type': entities[item].td_type,
            'name': entities[item].aliases[0],
            'url': '/%s/%s/%s' % (entities[item].td_type, slugify(entities[item].aliases[0]), item)
        } for item in items]

    # decorate the top-five stats entries; entries without a td_type stay bare
    for label in ['top_text_entities', 'top_submitter_entities']:
        for entity in stats[label]:
            if not entities[entity['id']].td_type:
                continue
            entity['type'] = entities[entity['id']].td_type
            entity['name'] = entities[entity['id']].aliases[0]
            entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])

    if 'weeks' in stats:
        stats['weeks'] = prettify_weeks(stats['weeks'])

    # expand recent-comment ids into title/date/author snippets
    recent_comments = []
    if 'recent_comments' in stats:
        recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['recent_comments']]).only('id', 'title', 'details')
        for comment in recent_comments_search:
            comment_item = {
                'title': comment.title,
                'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                'organization': comment.details.get('Organization_Name', ''),
                'url': '/document/' + comment.id
            }
            # normalize an empty author string to None
            comment_item['author'] = comment_item['author'] if comment_item['author'] else None
            recent_comments.append(comment_item)
    stats['recent_comments'] = recent_comments

    out['comment_stats'] = stats

    # links upstream
    out['source'] = document.source
    out['upstream_urls'] = []
    if out['source'] == 'regulations.gov':
        out['upstream_urls'].append({
            'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
            'label': 'Regulations.gov'
        })
    elif out['source'] == 'sec_cftc':
        # for SEC/CFTC-sourced documents, link to whatever records they replaced
        for replaced in document.suppression.get('replaces', []):
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                'label': 'Regulations.gov'
            })

    # cleaned-up details: dp() pops keys from a copy of the details dict as
    # they are consumed, so whatever is left over lands in 'Additional Details'
    details = out['details'].copy()
    dp = lambda key, default=None: details.pop(key, default)
    out['clean_details'] = dtls(
        ('Submitter Information',
         dtls(('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
              ('Organization', dp('Organization_Name')),
              ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
              ('Email Address', dp('Email_Address')),
              ('Phone Number', dp('Phone_Number')),
              ('Fax Number', dp('Fax_Number')),
              ("Submitter's Representative", dp('Submitter_s_Representative')))),
        ('Dates and Times',
         dtls(('Document Date', dp('Document_Date')),  # rarely-used
              ('Date Received', dp('Received_Date')),
              # pops both spellings; prefers Postmark_Date when both exist
              ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
              ('Date Posted', dp('Date_Posted')),
              (None, dp('Date')),  # Swallow this one, since it's always the same as Date_Posted,
              ('Comment Period', combine(short_date(force_date(dp('Comment_Start_Date'))), short_date(force_date(dp('Comment_Due_Date'))), sep="–")),
              # all the other dates -- don't even know what most of these are
              ("File Date", dp("File_Date")),
              ("Answer Date", dp("Answer_Date")),
              ("Author Date", dp("Author_Date")),
              ("Author Document Date", dp("Author_Document_Date")),
              ("Effective Date", dp("Effective_Date")),
              ("Implementation Date", dp("Implementation_Date")),
              ("Implementation Service Date", dp("Implementation_Service_Date")))),
        ('Citations and References',
         dtls(("RIN", document.rin if document.rin else None),
              ("Federal Register No.", dp("Federal_Register_Number")),
              ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "–")),
              (None, dp("Page_Count")),  # who cares?
              (None, dp("Page_Start")),  # who cares?
              ("Federal Register Citation", dp("Federal_Register_Citation")),
              ("CFR Section(s)", dp("CFR")),
              ("Related RINs", dp("Related_RIN_s_")),
              )),
        ('Additional Details', dtls(*details.items())))

    return Response(out)
def get(self, request, *args, **kwargs):
    """Extend the base docket response with source links, FR-document
    summaries/stats, similar dockets, and agency metadata.

    Starts from the parent class's serialized data and mutates it in place.
    """
    out = super(DocketView, self).get(request, *args, **kwargs).data
    out['source'] = self.item.source

    stats = out['stats']
    stats['similar_dockets'] = []
    summaries = []

    # links back to the upstream source of this docket
    out['upstream_urls'] = []
    if out['source'] == 'regulations.gov':
        out['upstream_urls'].append({
            'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
            'label': 'Regulations.gov'
        })
    elif out['source'] == 'sec_cftc':
        if 'Source_URL' in self.item.details:
            out['upstream_urls'].append({
                'url': self.item.details['Source_URL'],
                'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
            })
        # also link to any regulations.gov records this docket replaced
        for replaced in self.item.suppression.get('replaces', []):
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                'label': 'Regulations.gov'
            })

    if stats['count'] > 0:
        # do a similar thing with FR documents
        if stats.get('doc_info', {}).get('fr_docs', None):
            fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
            fr_search = Doc.objects(id__in=fr_doc_ids)
            fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])
            for doc in stats['doc_info']['fr_docs']:
                if doc['id'] in fr_docs:
                    fr_doc = fr_docs[doc['id']]
                    doc['stats'] = {
                        'date_range': fr_doc.stats['date_range'],
                        'count': fr_doc.stats['count']
                    } if fr_doc.stats else {
                        'count': 0
                    }
                    # prefer the FR abstract; fall back to the doc's own summary
                    if fr_doc.annotations.get('fr_data', None):
                        doc['summary'] = fr_doc.annotations['fr_data'].get('abstract', None)
                    if not doc.get('summary', None):
                        doc['summary'] = fr_doc.get_summary()
                    doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now()
                    if doc['summary']:
                        summaries.append(doc['summary'])
                else:
                    # FR doc record is missing; emit an empty placeholder
                    doc['stats'] = {'count': 0, 'comments_open': False}
                    doc['summary'] = None

            # remove duplicates, if any (keeping first occurrence, preserving order)
            tmp = stats['doc_info']['fr_docs']
            included = set()
            stats['doc_info']['fr_docs'] = []
            for doc in tmp:
                if doc['id'] not in included:
                    stats['doc_info']['fr_docs'].append(doc)
                    included.add(doc['id'])

        # use the collected FR summaries to find similar dockets
        summary_text = "\n".join(summaries)
        if summary_text:
            similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
            if similar_dockets:
                sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                stats['similar_dockets'] = [{
                    'id': docket,
                    'title': sd[docket]
                } for docket in similar_dockets]

    # agency ids conventionally prefix the docket id, e.g. "EPA-..."
    agency = self.item.agency
    if not agency:
        agency = re.split("[-_]", self.item.id)[0]

    if agency:
        agency_meta = list(Agency.objects(id=agency).only("name"))
        if agency_meta:
            out['agency'] = {
                'id': agency,
                'name': agency_meta[0].name,
                'url': '/agency/%s' % agency
            }
        else:
            # no record for the inferred agency id; treat as no agency
            agency = None

    if not agency:
        out['agency'] = None

    return Response(out)
from regs_models import Doc import json import itertools def split_seq(iterable, size): it = iter(iterable) item = list(itertools.islice(it, size)) while item: yield item item = list(itertools.islice(it, size)) all_ids = json.load(open("/tmp/problems.json")) for ids in split_seq(all_ids, 1000): for doc in Doc.objects(id__in=ids): for view in doc.views: if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": view.extracted = "no" view.content.delete() for attachment in doc.attachments: for view in attachment.views: if view.type == "pdf" and view.mode == "html" and view.extracted == "yes": view.extracted = "no" view.content.delete() doc.in_search_index = False doc.in_cluster_db = False doc.entities_last_extracted = None print "Repaired %s" % doc.id doc.save()