def setUp(self):
    """Create the search fixtures and the three clients used by the tests."""
    # Fixture values: a search phrase and a document id.
    self.test_search = 'Calpers special review'
    self.test_id = '74103-report-of-the-calpers-special-review'

    # Client created without credentials.
    self.public_client = DocumentCloud()

    # Client created from the module-level credentials.
    credentials = (DOCUMENTCLOUD_USERNAME, DOCUMENTCLOUD_PASSWORD)
    self.private_client = DocumentCloud(*credentials)

    # Client created from placeholder credentials.
    self.fake_client = DocumentCloud("John Doe", "TK")
def setUp(self):
    """
    Prepare the document id, the clients and the version shared by the tests.
    """
    self.test_id = '2511322-lafd-recruitment-report'

    # Client created without credentials.
    self.public_client = DocumentCloud()

    # Credentials for the authenticated client come from the environment;
    # a missing variable raises KeyError here.
    username = os.environ['DOCUMENTCLOUD_TEST_USERNAME']
    password = os.environ['DOCUMENTCLOUD_TEST_PASSWORD']
    self.private_client = DocumentCloud(username, password)

    # Client created from placeholder credentials.
    self.fake_client = DocumentCloud("John Doe", "TK")

    self.version = self.get_version()
def setUp(self):
    """
    Set up the fixture id, the three clients and the version for each test.
    """
    self.test_id = '74103-report-of-the-calpers-special-review'

    # Unauthenticated client.
    self.public_client = DocumentCloud()

    # Authenticated client; both environment variables must be set or the
    # lookup raises KeyError.
    env = os.environ
    self.private_client = DocumentCloud(
        env['DOCUMENTCLOUD_TEST_USERNAME'],
        env['DOCUMENTCLOUD_TEST_PASSWORD'],
    )

    # Client with placeholder credentials.
    self.fake_client = DocumentCloud("John Doe", "TK")

    self.version = self.get_version()
def edit():
    """
    Edit a stored document record and push the edited fields to DocumentCloud.

    The record id is the first request argument; when it is missing the
    request is redirected to the document index. Returns ``dict(form=form)``.
    """
    from documentcloud import DocumentCloud

    dc_id = request.args(0) or redirect(URL('document', 'index'))
    # Drop the validators on the file field for this form.
    db.documentCloud.file.requires = None
    record = db.documentCloud(dc_id)
    form = SQLFORM(db.documentCloud, record)

    if form.validate():
        client = DocumentCloud(username=dc_username, password=dc_password)
        doc_cloud = client.documents.get(record.dc_id)
        # Copy the submitted values onto the remote document, in this order.
        # 'data' is intentionally not copied (it was commented out upstream).
        edited_fields = (
            'title',
            'source',
            'description',
            'related_article',
            'published_url',
            'access',
            'project',
            'secure',
        )
        for field_name in edited_fields:
            setattr(doc_cloud, field_name, form.vars[field_name])
        rest = doc_cloud.put()
        # NOTE(review): the success message is only shown when put() returns
        # a non-None value -- confirm the client library does so.
        if rest is not None:
            response.flash = T('Documento actualizado')
    elif form.errors:
        response.flash = T('Hay errores en el formulario')
    else:
        response.flash = T('Por favor llene el formulario')

    return dict(form=form)
def update_to_documentcloud(self, field, value):
    """
    Set ``field`` to ``value`` on this object's DocumentCloud document.

    Does nothing when the source type is not a DocumentCloud one or when the
    remote attribute already equals ``value``. A missing document or a failed
    save is logged as an error and not raised.
    """
    if self.source_type not in AttachmentSourceType.DOCUMENTCLOUD_SOURCE_TYPES:
        return

    dc_client = DocumentCloud(
        settings.DOCUMENTCLOUD_USER,
        settings.DOCUMENTCLOUD_PASSWORD,
    )

    try:
        remote_document = dc_client.documents.get(self.external_id)
    except DoesNotExistError:
        logger.error(
            f'Cannot find document with external id {self.external_id} on DocumentCloud'
        )
        return

    # Skip the round trip when the remote value is already up to date.
    current_value = getattr(remote_document, field, None)
    if current_value == value:
        return

    setattr(remote_document, field, value)
    try:
        remote_document.save()
    except HTTPError:
        logger.error(
            f'Cannot save document with external id {self.external_id} on DocumentCloud'
        )
def get_project(args):
    """Retrieve project metadata and print it to stdout as JSON.

    Reads ``args.username``, ``args.password`` and ``args.id_or_title``.

    * When ``args.id_or_title`` is ``None`` every project is printed.
    * Otherwise the value is looked up both as an id and as a title; the
      first lookup that succeeds is printed.

    Exits with status 1 (after writing to stderr) when neither lookup finds
    a project.
    """
    client = DocumentCloud(args.username, args.password)

    if args.id_or_title is None:
        # Get all projects
        projects = client.projects.all()
        print(json.dumps({
            'projects': [serialize_project(p) for p in projects],
        }))
        # Nothing to look up; without this return the code below would call
        # re.match() on None and raise TypeError.
        return

    # A purely numeric value is tried as an id first, anything else as a
    # title first; the other lookup is the fallback.
    if re.match(r'^\d+$', args.id_or_title):
        match_order = ['id', 'title']
    else:
        match_order = ['title', 'id']

    for match_term in match_order:
        try:
            project = client.projects.get(**{match_term: args.id_or_title})
        except DoesNotExistError:
            continue
        print(json.dumps({
            'projects': [serialize_project(project)],
        }))
        break
    else:
        # Neither lookup matched.
        sys.stderr.write("Project with id or title '{}' does not exist\n".format(
            args.id_or_title))
        sys.exit(1)
def upload_to_documentcloud(self):
    """
    Upload this importer's pending document attachments to DocumentCloud.

    Selects document attachments of ``self.source_type`` that have no pending
    DocumentCloud id and whose failed-attempt count is within
    ``UPLOAD_FAIL_MAX_ATTEMPTS``, uploads each one publicly with forced OCR,
    and stores the parsed id of the created document on the attachment.
    """
    dc_client = DocumentCloud(
        settings.DOCUMENTCLOUD_USER,
        settings.DOCUMENTCLOUD_PASSWORD,
    )
    pending = AttachmentFile.objects.filter(
        source_type=self.source_type,
        file_type=MEDIA_TYPE_DOCUMENT,
        pending_documentcloud_id__isnull=True,
        upload_fail_attempts__lte=UPLOAD_FAIL_MAX_ATTEMPTS,
    )

    self.log_info(f'Uploading {len(pending)} documents to DocumentCloud')

    for attachment in tqdm(pending):
        # The mapped source type is used as the document description.
        description = AttachmentSourceType.SOURCE_TYPE_MAPPINGS[attachment.source_type]
        title = format_copa_documentcloud_title(
            attachment.allegation.crid,
            attachment.title,
        )
        uploaded = dc_client.documents.upload(
            attachment.original_url,
            title=title,
            description=description,
            access='public',
            force_ocr=True,
        )
        attachment.pending_documentcloud_id = parse_id(uploaded.id)
        attachment.save()

    self.log_info(f'Done uploading!')
def download_ocr_text_per_page(documentcloud_id, page_num, credentials): page_text = None try: dc_client = DocumentCloud(**credentials) obj = dc_client.documents.get(documentcloud_id) ''' workaround for private docs is, set public, wait, then private again. https://github.com/documentcloud/documentcloud/issues/220 ''' apply_access_workaround = (obj.access == DC_PRIVATE) if apply_access_workaround: obj.access = DC_PUBLIC obj.put() while obj.access in [DC_PRIVATE, DC_PENDING]: sleep(WORKAROUND_SLEEP) obj = dc_client.documets.get(documentcloud_id) page_text = obj.get_page_text(page_num) if apply_access_workaround: obj.access = DC_PRIVATE obj.put() except Exception as e: print "download_ocr_text_per_page ERROR" print e, type(e) return page_text
def __init__(self, logger, force_update=False, custom_search_syntaxes=None):
    """
    Store the importer options and create the DocumentCloud client.

    ``logger`` is forwarded to the parent initializer; ``force_update`` and
    ``custom_search_syntaxes`` are kept as attributes unchanged.
    """
    super(DocumentCloudAttachmentImporter, self).__init__(logger)

    # Caller-supplied options.
    self.force_update = force_update
    self.custom_search_syntaxes = custom_search_syntaxes

    # Bookkeeping lists, empty until the importer runs.
    self.kept_attachments, self.updated_attachments = [], []

    self.client = DocumentCloud(
        settings.DOCUMENTCLOUD_USER,
        settings.DOCUMENTCLOUD_PASSWORD,
    )
def upload_file(project_name, file_to_upload):
    """
    Upload one file into the project titled ``project_name``.

    The project is fetched, or created when it does not exist yet. Nothing is
    returned.
    """
    dc_client = DocumentCloud(
        USERNAME,
        PASSWORD,
        loglevel=logging.INFO,
        timeout=30,
    )
    # Only the project itself is needed; the "created" flag is ignored.
    target_project, _was_created = dc_client.projects.get_or_create_by_title(
        project_name)
    dc_client.documents.upload(
        file_to_upload,
        handle_errors=True,
        project=target_project.id,
    )
def connect_client():
    '''Return a DocumentCloud client built from the configured credentials.

    Prints a message and exits when USERNAME or PASSWORD is empty.
    '''
    if not (PASSWORD and USERNAME):
        print("You must add your credentials to local_settings.py")
        exit()
    else:
        print('Getting DocumentCloud credentials from local_settings.py')
        return DocumentCloud(USERNAME, PASSWORD)
def upload_pdf_to_documentcloud(pdf, credentials): try: dc_client = DocumentCloud(**credentials) return dc_client.documents.upload(pdf, secure=True, force_ocr=True) except Exception as e: print "upload_pdf_to_documentcloud ERROR" print e, type(e) return None
def add_document():
    """
    Dialog controller: upload a document to DocumentCloud and record it.

    Builds a form for ``db.documentCloud``. When the form validates, the
    uploaded file is sent to DocumentCloud; on success a row is inserted
    locally and ``response.js`` is filled with script that closes the dialog
    named after ``request.args[0]`` and adds/selects a new option in the
    element with that id. The form is returned in every case.
    """
    import os
    import json
    from documentcloud import DocumentCloud
    dc_id = None
    respuesta = None
    # This is the controller function that will appear in our dialog.
    form = SQLFORM(db.documentCloud)
    if form.validate():
        dc_cloud = DocumentCloud(username=dc_username, password=dc_password)
        # Upload the file stored under <app>/uploads together with the
        # metadata entered in the form.
        dc_id = dc_cloud.documents.upload(
            os.path.join(request.folder, 'uploads', form.vars['file']),
            title=form.vars['title'],
            source=form.vars['source'],
            description=form.vars['description'],
            related_article=form.vars['related_article'],
            published_url=form.vars['published_url'],
            access=form.vars['access'],
            project=form.vars['project'],
            #data=json.dumps(form.vars['data']),
            secure=form.vars['secure'])
        if dc_id is not None:
            # Keep the remote document id with the local row.
            form.vars.dc_id = dc_id.id
            id = db.documentCloud.insert(
                **db.documentCloud._filter_fields(form.vars))
            respuesta = request.post_vars
            # Successfully added the new item.
            # Do whatever else you may want here,
            # then let the user know adding via our widget worked.
            response.flash = T("Added")
            target = request.args[0]
            # Close the widget's dialog box.
            response.js = '$( "#%s_dialog-form" ).dialog( "close" ); ' % (
                target)
            # Add the new item to the options of the main form's select.
            # NOTE(review): the new row id is held in the local `id`, while
            # the script below reads `form.vars.id` and `form.vars.name` --
            # confirm both are populated at this point.
            # NOTE(review): values are interpolated into JavaScript without
            # escaping -- confirm they cannot contain quotes or markup.
            response.js += """$("#%s").append("<option value='%s'>%s</option>");""" \
                % (target, form.vars.id, form.vars.name)
            # Select the option that was just added.
            response.js += """var selected=$("#%s").val();""" % (target)
            response.js += """if (selected==null) { selected = [] }"""
            response.js += """selected.push("%s");""" % (form.vars.id)
            response.js += """$("#%s").val(selected);""" % (target)
            # Finally, return the form in case they want to add another option.
            return form
        else:
            # The upload returned nothing.
            response.flash = T('Error en subir Documento a DocumentCloud')
            return form
    elif form.errors:
        # Validation failed: send the form back so it is shown in the dialog
        # together with its error messages.
        return form
    else:
        # Not submitted yet: return the fresh blank form.
        return form
def update_all():
    """
    Synchronise the local ``documentCloud`` table with DocumentCloud.

    For every remote project, each of its documents is looked up among the
    active local rows by ``dc_id``. Unknown documents are inserted (id,
    title, project, active flag); known ones have their metadata refreshed.
    Projects whose details cannot be fetched are skipped.

    Returns ``dict(projects=<remote projects>, docs=<insert results>)``.
    """
    from documentcloud import DocumentCloud

    client = DocumentCloud(username=dc_username, password=dc_password)
    insert = []
    projects_list = client.projects.all()

    for project in projects_list:
        try:
            documents = client.projects.get(project.id).document_list
        except Exception:
            # The project could not be loaded. Skip it instead of failing on
            # a missing entry afterwards (previously a KeyError).
            continue

        for doc in documents:
            # `== True` is deliberate: this builds a DAL query expression.
            doc_cloud = db((db.documentCloud.dc_id == doc.id) & (
                db.documentCloud.is_active == True)).select().first()
            if doc_cloud is None:
                # First time this document is seen: store the basic fields.
                docs = db.documentCloud.validate_and_insert(
                    dc_id=doc.id,
                    title=doc.title,
                    project=project.id,
                    is_active=True)
                insert.append(docs)
            else:
                # Already known: refresh the stored metadata.
                doc_cloud.dc_id = doc.id
                doc_cloud.title = doc.title
                doc_cloud.source = doc.source
                doc_cloud.description = doc.description
                doc_cloud.related_article = doc.related_article
                doc_cloud.published_url = doc.published_url
                doc_cloud.access = doc.access
                doc_cloud.project = project.id
                doc_cloud.update_record()

    return dict(projects=projects_list, docs=insert)
def search():
    """
    Show a one-field search form and, once submitted, the search results.

    Returns ``dict(form=..., obj_list=...)``; ``obj_list`` is an empty dict
    until the form validates.
    """
    from documentcloud import DocumentCloud

    client = DocumentCloud()
    form = FORM(T('Búsqueda:'), INPUT(_name='q'), INPUT(_type='submit'))
    # Query DocumentCloud only for a validated submission.
    obj_list = client.documents.search(form.vars.q) if form.validate() else {}
    return dict(form=form, obj_list=obj_list)
def get_document(args):
    """Print the metadata of document ``args.id`` to stdout as JSON.

    Writes a message to stderr and exits with status 1 when the document
    does not exist.
    """
    client = DocumentCloud(args.username, args.password)
    try:
        document = client.documents.get(args.id)
        serialized = json.dumps(serialize_document(document))
        print(serialized)
    except DoesNotExistError:
        message = "Document with id '{}' does not exist\n".format(args.id)
        sys.stderr.write(message)
        sys.exit(1)
def handle(self, *args, **options):
    """
    Run every configured search and process the cleaned results.

    Afterwards a ``DocumentCrawler`` row is created holding the number of
    documents whose ``documentcloud_id`` is greater than zero.
    """
    client = DocumentCloud()

    for document_type, syntax in self.search_syntaxes:
        found = client.documents.search(syntax)
        if not found:
            # Nothing matched this search; move on to the next one.
            continue
        for result in self.clean_documentcloud_results(found):
            self.process_documentcloud_result(result, document_type)

    crawled_count = Document.objects.filter(documentcloud_id__gt=0).count()
    DocumentCrawler.objects.create(num_documents=crawled_count)
def foia_file_delete_dc(sender, **kwargs):
    """Delete the file's document from DocumentCloud, if it has one.

    The file is taken from ``kwargs["instance"]``; nothing happens when its
    ``doc_id`` is empty.
    """
    # pylint: disable=unused-argument
    foia_file = kwargs["instance"]
    if not foia_file.doc_id:
        return

    client_options = dict(
        username=settings.DOCUMENTCLOUD_BETA_USERNAME,
        password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
        base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
        auth_uri=f"{settings.SQUARELET_URL}/api/",
    )
    DocumentCloud(**client_options).documents.delete(foia_file.doc_id)
def datum_per_page(crowdsource_pk, doc_id, metadata):
    """Add one data item per page of document ``doc_id`` to a crowdsource.

    Each item's URL is the document's canonical URL followed by
    ``/pages/<n>``, with pages numbered from 1; ``metadata`` is attached to
    every item unchanged.
    """
    crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)
    dc_client = DocumentCloud(
        username=settings.DOCUMENTCLOUD_BETA_USERNAME,
        password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
        base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
        auth_uri=f"{settings.SQUARELET_URL}/api/",
    )
    document = dc_client.documents.get(doc_id)

    page_number = 1
    last_page = document.pages
    while page_number <= last_page:
        page_url = f"{document.canonical_url}/pages/{page_number}"
        crowdsource.data.create(url=page_url, metadata=metadata)
        page_number += 1
def search_all(logger=_logger, custom_search_syntaxes=None):
    """
    Run every search query against DocumentCloud and collect the documents.

    ``custom_search_syntaxes`` is an iterable of ``(types, query)`` pairs;
    when it is empty or ``None`` the stored ``DocumentCloudSearchQuery`` rows
    are used. Pairs with an empty query are skipped. Each result set gets
    its attributes added, invalid documents removed and duplicates removed
    before being appended to the returned list.
    """
    client = DocumentCloud(
        settings.DOCUMENTCLOUD_USER,
        settings.DOCUMENTCLOUD_PASSWORD,
    )
    search_syntaxes = (
        custom_search_syntaxes
        or DocumentCloudSearchQuery.objects.all().values_list('types', 'query')
    )

    all_documents = []
    for document_types, syntax in search_syntaxes:
        if not syntax:
            continue
        logger.info(f'Searching Documentcloud for {syntax}')
        found = client.documents.search(syntax)
        with_attributes = _add_attributes(found, document_types)
        valid_documents = _remove_invalid_documents(with_attributes)
        all_documents.extend(_remove_duplicated(valid_documents))
    return all_documents
def batch_upload_files(project_name, files_to_batch):
    """
    Upload a directory of files into the project titled ``project_name``.

    Prints a notice and does nothing when ``files_to_batch`` is empty. The
    project is fetched, or created when it does not exist yet. Nothing is
    returned.
    """
    if files_to_batch:
        # Connect to DocumentCloud, logging at INFO level.
        dc_client = DocumentCloud(
            USERNAME,
            PASSWORD,
            loglevel=logging.INFO,
            timeout=30,
        )
        # Fetch or create the target project; the "created" flag is unused.
        target_project, _was_created = dc_client.projects.get_or_create_by_title(
            project_name)
        # Upload everything found under the given directory.
        dc_client.documents.upload_directory(
            files_to_batch,
            handle_errors=True,
            project=target_project.id,
        )
    else:
        print('No files available to upload')
def documentcloud_handler(request):
    """
    Search DocumentCloud for ``request`` and return one result's canonical URL.

    The results are scanned for a document contributed by "Laurent Bastien";
    the scan stops at the first such document and its URL is returned. Any
    exception is caught and printed, in which case the function returns
    ``None``.
    """
    # Function name, used only in the error message below.
    methodname = documentcloud_handler.__name__
    try:
        # NOTE(review): 'email' / 'password' look like placeholders for real
        # credentials -- confirm.
        client = DocumentCloud('email', 'password')
        obj_list = client.documents.search(request, data=True)
        # Stop at the first document from the wanted contributor; `i` keeps
        # the index where the loop stopped.
        for i in range(len(obj_list)):
            contrib = obj_list[i].contributor
            if contrib == "Laurent Bastien":
                break
        # NOTE(review): when no contributor matches, `i` is the last index,
        # so the last result is returned; with no results `i` is undefined
        # and the resulting error is reported by the handler below.
        link = obj_list[i].canonical_url
        return link
    except Exception as error:
        errormsg = "Error in {}. Error is {}".format(methodname, error)
        print(errormsg)
def get_document_entities(args):
    """Print the entities of document ``args.id`` to stdout as JSON.

    The output is an object with a single ``entities`` list. Writes a
    message to stderr and exits with status 1 when the document does not
    exist.
    """
    client = DocumentCloud(args.username, args.password)
    try:
        document = client.documents.get(args.id)
        entities = [serialize_entity(e) for e in document.entities]
        print(json.dumps({
            'entities': entities,
        }))
    except DoesNotExistError:
        message = "Document with id '{}' does not exist\n".format(args.id)
        sys.stderr.write(message)
        sys.exit(1)
def upload(args):
    """Upload every entry of ``args.file_or_url`` to DocumentCloud.

    When ``args.project`` is set, the project with that title is looked up
    once and each document is uploaded into it; otherwise the documents are
    uploaded without a project.
    """
    client = DocumentCloud(args.username, args.password)

    project_id = None
    if args.project:
        # TODO: Handle error if this project doesn't exist
        project = client.projects.get(title=args.project)
        # HACK: The API docs seem to indicate that I should just be able to
        # pass the project ID here. I think things break with newer versions
        # of python, so the id is passed as a string.
        project_id = str(project.id)

    for file_or_url in args.file_or_url:
        # project_id stays None when no project was requested; previously
        # that case failed on `None.id`.
        client.documents.upload(file_or_url, project=project_id)
def create():
    """
    Upload a new document to DocumentCloud and store it locally.

    When the form validates, the uploaded file and its metadata are sent to
    DocumentCloud; if that returns a document, its id is added to the form
    values and a row is inserted. A flash message reports the outcome.
    Returns ``dict(form=form)``.
    """
    import os
    import json
    from documentcloud import DocumentCloud

    form = SQLFORM(db.documentCloud)

    if form.validate():
        dc_cloud = DocumentCloud(username=dc_username, password=dc_password)
        # The file is read from the application's uploads folder.
        local_path = os.path.join(request.folder, 'uploads', form.vars['file'])
        # 'data' is intentionally not sent (it was commented out upstream).
        uploaded = dc_cloud.documents.upload(
            local_path,
            title=form.vars['title'],
            source=form.vars['source'],
            description=form.vars['description'],
            related_article=form.vars['related_article'],
            published_url=form.vars['published_url'],
            access=form.vars['access'],
            project=form.vars['project'],
            secure=form.vars['secure'],
        )
        if uploaded is None:
            response.flash = T('Error en subir Documento a DocumentCloud')
        else:
            # Keep the remote id with the local row.
            form.vars.dc_id = uploaded.id
            db.documentCloud.insert(
                **db.documentCloud._filter_fields(form.vars))
            response.flash = T('Formulario aceptado')
    elif form.errors:
        response.flash = T('Hay errores en el formulario')
    else:
        response.flash = T('Por favor llene el formulario')

    return dict(form=form)
def import_doccloud_proj(crowdsource_pk, proj_id, metadata, doccloud_each_page):
    """Add the documents of DocumentCloud project ``proj_id`` to a crowdsource.

    With ``doccloud_each_page`` set, a ``datum_per_page`` task is queued for
    each document; otherwise one data item per document is created from its
    canonical URL. ``metadata`` is passed along unchanged.
    """
    crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)
    dc_client = DocumentCloud(
        username=settings.DOCUMENTCLOUD_BETA_USERNAME,
        password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
        base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
        auth_uri=f"{settings.SQUARELET_URL}/api/",
    )
    project = dc_client.projects.get(proj_id)

    for document in project.documents:
        if not doccloud_each_page:
            # One item for the whole document.
            crowdsource.data.create(
                url=document.canonical_url,
                metadata=metadata,
            )
            continue
        # Per-page items are created by a separate task.
        datum_per_page.delay(crowdsource.pk, document.id, metadata)
def process_link(self, link, document_type):
    """
    Resolve a link to a DocumentCloud document and its allegation CRID.

    Returns a dict with ``documentcloud_id``, ``normalized_title``,
    ``allegation_crid`` and ``title``, or ``False`` when the link cannot be
    parsed, the id search finds nothing, or no CRID is found in the title.
    """
    matched_link = self.parse_link(link)
    if not matched_link:
        return False

    client = DocumentCloud()
    id_query = self.ID_SEARCH_SYNTAX.format(id=matched_link['documentcloud_id'])
    results = client.documents.search(id_query)
    if not results:
        return False

    # Only the first search hit is considered.
    title = results[0].title
    crid = self.parse_crid_from_title(title, document_type)
    if not crid:
        return False

    return {
        'documentcloud_id': matched_link['documentcloud_id'],
        'normalized_title': matched_link['normalized_title'],
        'allegation_crid': crid,
        'title': title
    }
# Interactive script: asks for credentials, builds a DocumentCloud client
# and prints every project the client can list.
from __future__ import print_function
import os
import sys
from documentcloud import DocumentCloud

print("Enter your Document Cloud Credentials")
# NOTE(review): the next statement is corrupted in this copy of the file (the
# runs of asterisks replaced part of the original code). The names
# `username`, `password` and `base_uri` used below are presumably assigned in
# the missing part -- restore it from version control before running.
sys.stdout.write("Username: "******"Password: "******"https://sourceafrica.net/api/"
# The third positional argument is the API base URI.
client = DocumentCloud(username, password, base_uri)
# NOTE(review): this drops into the debugger on every run -- looks like a
# leftover breakpoint; confirm whether it is still wanted.
import pdb
pdb.set_trace()
print(client.projects.all())
""" file: proj_get_doc_urls.py what: script to get an url for each document in a given project """ # import the modules for this script from documentcloud import DocumentCloud from config_file import config_settings # varible to hold the project we're targeting PROJECT_ID = 16900 # authenticate with document cloud with user_name & password in docConfig.py client = DocumentCloud( config_settings["user_name"], config_settings["password"] ) def proj_get_doc_urls(project_id): """ begin function to return document ids """ # creates an object that contains the documents in the project obj = client.projects.get(id = project_id) # list to hold all of the documents ids list_of_documents = obj.document_ids # begin looping through each document in our list for document in list_of_documents:
class caseShare:
    """Tweet new filings from the tracked "big cases".

    Creating an instance immediately processes the most recent untweeted
    filings (see ``listNew``). The Twitter, database and DocumentCloud
    clients are class attributes, so they are created once, when the class
    body is executed.
    """

    VERBOSE = True

    # List of filing types that should not be tweeted (because they're
    # routine and seldom interesting)
    DONOTTWEET = ['Notice of Appearance', 'Pro Hac Vice', 'Appear Pro Hac Vice',
                  'Appearance',
                  'LCvR 7.1 Certificate of Disclosure - Corporate Affiliations/Financial Interests']

    tw = Twython(
        settings.twitter_app_key,
        settings.twitter_app_secret,
        settings.twitter_oauth_key,
        settings.twitter_oauth_secret
    )
    db = dbconnect_db_add.db(host = settings.db_host, user=settings.db_user,
                             pwd=settings.db_pass, port=settings.db_port,
                             database=settings.db_database)
    dc = DocumentCloud(settings.dc_user, settings.dc_pass)

    # Re-arrange list of big cases into a dict with a unique ID
    bigcases = dict((item['court'] + item['case_number'], item)
                    for item in bigcases_list.cases)

    # "[type] ... <a href="url"": captures the bracketed text and the URL.
    _TYPE_AND_LINK = re.compile(r'\[(.*?)\].*?<a href="(.*?)"', re.IGNORECASE)
    # "[type]": captures only the bracketed text.
    _TYPE_ONLY = re.compile(r'\[(.*?)\]', re.IGNORECASE)

    def __init__(self):
        if self.VERBOSE:
            self.bigCasesMessage()
        self.listNew()

    def listNew(self):
        """Share, then mark as handled, up to 100 filings with bigcase = 1."""
        # List new filings in selected cases that haven't been tweeted yet
        cases = self.db.getDict("""	SELECT *
                        FROM pacer_raw
                        WHERE bigcase = 1
                        ORDER BY pid DESC
                        LIMIT 100 """)
        for case in cases:
            self.share(case)
            self.update(case)

    def update(self, case):
        """Set bigcase = 2 on the filing's row so it is not picked up again."""
        # Update a case after it's tweeted
        self.db.run("""	UPDATE pacer_raw
                    SET bigcase = 2
                    WHERE pid = %s """,
                    (case['pid'], ))

    def twitter_upload(self, image_list):
        """Upload the images at the given URLs to Twitter.

        Returns the media ids of the uploads that succeeded. A failed
        download or upload is reported and skipped, so one bad image does
        not prevent the tweet.
        """
        # Upload images of first four pages
        media_ids = []
        for image in image_list:
            try:
                res = requests.get(image)
                res.raise_for_status()
                response = self.tw.upload_media(media=BytesIO(res.content))
                media_ids.append(response['media_id'])
            except Exception as e:
                # Best effort: report the problem instead of hiding it.
                print('##' + str(e))
        return media_ids

    def share(self, case):
        """Post a new filing to Twitter.

        The tweet text depends on what is known about the filing: the
        stored document link (with page images attached) when the filing
        has a ``dcid``, else the PACER document link from the description,
        else the case's PACER docket link. Nothing is posted when the
        description has no ``[type]`` marker or the type is listed in
        ``DONOTTWEET``.
        """
        uid = case['court'] + case['case_number']
        d = case['description']

        type_match = self._TYPE_ONLY.search(d)
        if type_match is None:
            # No "[type]" marker: there is nothing to build a tweet from.
            # (Previously this case crashed on `None.group`.)
            return
        typ = type_match.group(1)

        # Decide early, so no images are fetched or uploaded for filings
        # that will not be tweeted anyway.
        if typ in self.DONOTTWEET:
            return

        media_ids = []
        if case['dcid'] is not None:
            # Best case: We have the document on DC, and probably attached
            # images
            link = case['document_location']
            nd = typ + '\n\n' + link
            doc = self.dc.documents.get(case['dcid'])
            images = doc.normal_image_url_list[:4]
            if images:
                media_ids = self.twitter_upload(images)
        else:
            link_match = self._TYPE_AND_LINK.search(d)
            if link_match:
                # If the document hasn't made it to DC, send the PACER link
                nd = (link_match.group(1) + '\n\nDoc. on PACER: '
                      + link_match.group(2))
            else:
                # If there is no document, send a link to the PACER docket
                # for the case
                nd = typ + '\n\nPACER Docket: ' + case['link']

        msg = 'New filing in %s: %s' % (self.bigcases[uid]['name'], nd)
        try:
            if media_ids:
                self.tw.update_status(status=msg, media_ids=media_ids)
            else:
                self.tw.update_status(status=msg)
        except Exception as e:
            # A failed tweet is reported but must not stop the run.
            print('##' + str(e))