def archive_document_locally(document, directory="blacklisted_documents"):
    """Download a document's PDF from the Internet Archive to a local directory.

    Fetches the PDF at the URL built by IACommon.get_pdf_url() into
    *directory* via wget.  On download failure, prints an error and exits
    the process (the document is assumed to already be marked unavailable
    by the caller; it is NOT deleted from the Internet Archive).

    :param document: a Document model instance (court, casenum, docnum,
                     subdocnum attributes are read).
    :param directory: local directory wget saves into.
    """
    import subprocess
    import sys

    doc_url = IACommon.get_pdf_url(document.court, document.casenum,
                                   document.docnum, document.subdocnum)

    # Use subprocess with an argument list instead of os.system() with
    # string interpolation so that an unusual directory name or URL cannot
    # be interpreted by the shell (command-injection hardening).
    status = subprocess.call(["wget", "--quiet",
                              "--directory-prefix=%s" % directory,
                              doc_url])
    if status != 0:
        # Typo fixed: "unavailble" -> "unavailable".
        print("There was an error archiving document (%s.%s.%s.%s), it has "
              "been marked as unavailable, but has not been deleted from "
              "the Internet Archive" % (document.court, document.casenum,
                                        document.docnum, document.subdocnum))
        sys.exit(1)

    print(" saved document %s.%s for analysis in %s directory"
          % (document.docnum, document.subdocnum, directory))
def archive_document_locally(document, directory="blacklisted_documents"):
    """Save a local copy of a document's PDF for offline analysis.

    Builds the public Internet Archive URL for *document* and downloads it
    with wget into *directory*.  If the download fails, an error message is
    printed and the process exits; the document is not removed from the
    Internet Archive.

    :param document: Document instance; court/casenum/docnum/subdocnum
                     identify the PDF.
    :param directory: destination directory for the downloaded file.
    """
    import subprocess
    import sys

    doc_url = IACommon.get_pdf_url(document.court, document.casenum,
                                   document.docnum, document.subdocnum)

    # Argument-list subprocess call replaces os.system("wget ... %s" % url):
    # the interpolated URL/directory never passes through a shell, closing a
    # command-injection hole.
    rc = subprocess.call(["wget", "--quiet",
                          "--directory-prefix=%s" % directory,
                          doc_url])
    if rc != 0:
        # Spelling fixed ("unavailble" -> "unavailable").
        print("There was an error archiving document (%s.%s.%s.%s), it has "
              "been marked as unavailable, but has not been deleted from "
              "the Internet Archive" % (document.court, document.casenum,
                                        document.docnum, document.subdocnum))
        sys.exit(1)

    print(" saved document %s.%s for analysis in %s directory"
          % (document.docnum, document.subdocnum, directory))
def _get_documents_dict(court, casenum):
    """Create a dict containing the info for the docs specified.

    :param court: court identifier to filter on.
    :param casenum: case number to filter on.
    :returns: dict mapping docid -> metadata dict with "casenum", "docnum",
              "subdocnum"; documents marked available additionally get
              "filename" (public IA URL) and "timestamp" (mm/dd/yy of
              lastdate).
    """
    documents = {}
    # Iterate the queryset directly: a `for` loop over an empty queryset is
    # a no-op, so the original `if query:` truthiness guard was redundant.
    for document in Document.objects.filter(court=court, casenum=casenum):
        if not document.docid:
            # Skip rows without a docid key.
            continue
        docmeta = {"casenum": document.casenum,
                   "docnum": document.docnum,
                   "subdocnum": document.subdocnum}
        if document.available:
            docmeta["filename"] = IACommon.get_pdf_url(document.court,
                                                       document.casenum,
                                                       document.docnum,
                                                       document.subdocnum)
            docmeta["timestamp"] = document.lastdate.strftime("%m/%d/%y")
        documents[document.docid] = docmeta
    return documents
def _get_documents_dict(court, casenum):
    """Create a dict containing the info for the docs specified.

    :param court: court identifier to filter on.
    :param casenum: case number to filter on.
    :returns: dict keyed by docid.  Every entry carries "casenum",
              "docnum" and "subdocnum"; available documents also carry
              "filename" (public IA PDF URL) and "timestamp"
              (lastdate formatted mm/dd/yy).
    """
    query = Document.objects.filter(court=court, casenum=casenum)

    documents = {}
    # No truthiness pre-check needed: looping over an empty queryset simply
    # does nothing, so the former `if query:` wrapper was dead weight.
    for document in query:
        if not document.docid:
            continue  # rows without a docid cannot be keyed
        docmeta = {"casenum": document.casenum,
                   "docnum": document.docnum,
                   "subdocnum": document.subdocnum}
        if document.available:
            docmeta.update({
                "filename": IACommon.get_pdf_url(document.court,
                                                 document.casenum,
                                                 document.docnum,
                                                 document.subdocnum),
                "timestamp": document.lastdate.strftime("%m/%d/%y"),
            })
        documents[document.docid] = docmeta
    return documents
def query(request):
    """Query the database to check which PDF documents we have.

    The json input is {"court": <court>, "urls": <list of PACER doc1 urls>}

    The json output is a set of mappings:
       {<pacer url>: {"filename": <public url>,
                      "timestamp": <last time seen>},
        <pacer url>: ...}

    Top-level (subdocnum == 0) matches additionally carry a
    "subDocuments" mapping of subdocnum -> {filename, timestamp}.

    On any input error a plain-text diagnostic message is returned
    instead of JSON.
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think. Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # Detect show_doc style document links.  Raw string fixes the
        # invalid "\." / "\?" escape sequences of the original pattern.
        sdre = re.search(r"show_doc\.pl\?(.*)", url)
        if sdre:
            # Parse the query string into a dict of key=value pairs.
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}
            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)
        else:
            # Otherwise, assume it's a normal doc1 style url.
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            # Use the first match only.
            query = query[0]
            real_casenum = query.casenum
            response[url] = {
                "filename": IACommon.get_pdf_url(court, real_casenum,
                                                 query.docnum,
                                                 query.subdocnum),
                "timestamp": query.lastdate.strftime("%m/%d/%y")}

            if query.subdocnum == 0:
                # Top-level document: attach any available attachments.
                subquery = Document.objects.filter(
                    court=court, casenum=query.casenum,
                    docnum=query.docnum,
                    available=1).exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}
                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename": IACommon.get_pdf_url(
                                court, real_sub_casenum,
                                subDoc.docnum, subDoc.subdocnum),
                            "timestamp": subDoc.lastdate.strftime("%m/%d/%y")}

    jsonout = simplejson.dumps(response)
    return HttpResponse(jsonout, mimetype="application/json")
def query(request):
    """Query the database to check which PDF documents we have.

    The json input is {"court": <court>, "urls": <list of PACER doc1 urls>}

    The json output is a set of mappings:
       {<pacer url>: {"filename": <public url>,
                      "timestamp": <last time seen>},
        <pacer url>: ...}

    A matched top-level document (subdocnum == 0) also gets a
    "subDocuments" dict of subdocnum -> {filename, timestamp}.

    Malformed requests return a plain-text error message rather than JSON.
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think. Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # Detect show_doc style document links.  The pattern is now a raw
        # string; the original's "\." and "\?" were invalid string escapes.
        sdre = re.search(r"show_doc\.pl\?(.*)", url)
        if sdre:
            # Split the captured query string into key/value pairs.
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}
            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)
        else:
            # Otherwise, assume it's a normal doc1 style url.
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            # Only the first hit is reported.
            query = query[0]
            real_casenum = query.casenum
            response[url] = {
                "filename": IACommon.get_pdf_url(court, real_casenum,
                                                 query.docnum,
                                                 query.subdocnum),
                "timestamp": query.lastdate.strftime("%m/%d/%y")
            }

            if query.subdocnum == 0:
                # Top-level document: also list its available attachments.
                subquery = Document.objects.filter(
                    court=court, casenum=query.casenum,
                    docnum=query.docnum,
                    available=1).exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}
                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename": IACommon.get_pdf_url(
                                court, real_sub_casenum,
                                subDoc.docnum, subDoc.subdocnum),
                            "timestamp": subDoc.lastdate.strftime("%m/%d/%y")
                        }

    jsonout = simplejson.dumps(response)
    return HttpResponse(jsonout, mimetype="application/json")