def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Returns a Document object (unsaved) for saving in the database.

    :param case_path: filesystem path to the raw lawbox HTML file.
    """
    # `with` ensures the file handle is closed even if read() raises
    # (the original leaked the handle via open(...).read()).
    with open(case_path) as f:
        raw_text = f.read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))
    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket
    return doc
def import_mayer(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Returns a Document object (unsaved) for saving in the database.

    :param case_path: filesystem path to the raw HTML file.

    NOTE(review): the original body commented out the call to
    get_html_from_raw_text() and bound an unused `tree = html.parse(...)`
    instead, leaving clean_html_tree / complete_html_tree / clean_html_str
    undefined -- every call below would raise NameError. Restored the
    extraction call so the function is runnable; confirm whether a
    mayer-specific parse was intended here.
    NOTE(review): source='L' matches import_law_box_case -- confirm Mayer
    imports shouldn't use a distinct source code.
    """
    with open(case_path) as f:
        raw_text = f.read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation()
    docket = Docket(
        docket_number=get_docket_number(
            clean_html_tree,
            case_path=case_path,
            court=court
        ),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket
    return doc
def create_stub(citations):
    """Creates a stub document with the bare minimum of meta data."""
    cite = Citation()
    # Copy each mapped citation field onto the Citation as an attribute.
    for field_name, value in map_citations_to_models(citations).iteritems():
        setattr(cite, field_name, value)

    # TODO: We can use the court information in the citation here. Failure to
    # do so will mean that our URLs will later change -- something we wish to
    # avoid.
    stub_doc = Document(
        is_stub_document=True,
        sha1='!',
        court=None,
        citation=cite,
    )
    stub_doc.save(index=False)
    return stub_doc
def citation_redirector(request, reporter, volume, page):
    """Take a citation URL and use it to redirect the user to the canonical
    page for that citation.

    This uses the same infrastructure as the thing that identifies citations
    in the text of opinions.
    """
    citation_str = " ".join([volume, reporter, page])
    try:
        citation = get_citations(citation_str)[0]
        citation_str = citation.base_citation()  # Corrects typos/variations.
        lookup_fields = [map_citations_to_models([citation]).keys()[0]]
    except IndexError:
        # Unable to disambiguate the citation. Try looking in *all* citation
        # fields.
        lookup_fields = [
            "neutral_cite", "federal_cite_one", "federal_cite_two",
            "federal_cite_three", "specialty_cite_one", "state_cite_regional",
            "state_cite_one", "state_cite_two", "state_cite_three",
            "westlaw_cite", "lexis_cite",
        ]
    # We were able to get a match, expand it if it's a federal/state match.
    if len(lookup_fields) == 1 and lookup_fields[0] == "federal_cite_one":
        lookup_fields = ["federal_cite_one", "federal_cite_two",
                         "federal_cite_three"]
    elif len(lookup_fields) == 1 and lookup_fields[0] == "state_cite_one":
        lookup_fields = ["state_cite_one", "state_cite_two",
                         "state_cite_three"]
    q = Q()
    for lookup_field in lookup_fields:
        q |= Q(**{"citation__" + lookup_field: citation_str})
    documents = Document.objects.filter(q)

    # Evaluate the count once: each .count() call issues its own COUNT(*)
    # query, so the original could hit the database three times.
    doc_count = documents.count()

    # Show the correct page....
    if doc_count == 0:
        # No results for an otherwise valid citation.
        response = render_to_response(
            "casepage/citation_redirect_info_page.html",
            {"none_found": True,
             "citation_str": citation_str,
             "private": True},
            RequestContext(request),
            # status=404,
        )
        response.status_code = 404
        return response
    elif doc_count == 1:
        # Total success. Redirect to correct location.
        return HttpResponsePermanentRedirect(documents[0].get_absolute_url())
    else:
        # Multiple results. Show them.
        response = render_to_response(
            "casepage/citation_redirect_info_page.html",
            {"too_many": True,
             "citation_str": citation_str,
             "documents": documents,
             "private": True},
            RequestContext(request),
            # status=300,
        )
        response.status_code = 300
        return response
def citation_redirector(request, reporter, volume, page):
    """Take a citation URL and use it to redirect the user to the canonical
    page for that citation.

    This uses the same infrastructure as the thing that identifies citations
    in the text of opinions.
    """
    citation_str = " ".join([volume, reporter, page])
    try:
        citation = get_citations(citation_str)[0]
        citation_str = citation.base_citation()  # Corrects typos/variations.
        lookup_fields = [map_citations_to_models([citation]).keys()[0]]
    except IndexError:
        # Unable to disambiguate the citation. Try looking in *all* citation
        # fields.
        lookup_fields = [
            'neutral_cite', 'federal_cite_one', 'federal_cite_two',
            'federal_cite_three', 'specialty_cite_one', 'state_cite_regional',
            'state_cite_one', 'state_cite_two', 'state_cite_three',
            'westlaw_cite', 'lexis_cite'
        ]
    # We were able to get a match, expand it if it's a federal/state match.
    if len(lookup_fields) == 1 and lookup_fields[0] == 'federal_cite_one':
        lookup_fields = [
            'federal_cite_one', 'federal_cite_two', 'federal_cite_three'
        ]
    elif len(lookup_fields) == 1 and lookup_fields[0] == 'state_cite_one':
        lookup_fields = [
            'state_cite_one', 'state_cite_two', 'state_cite_three'
        ]
    q = Q()
    for lookup_field in lookup_fields:
        q |= Q(**{'citation__' + lookup_field: citation_str})
    documents = Document.objects.filter(q)

    # Evaluate the count once: each .count() call issues its own COUNT(*)
    # query, so the original could hit the database three times.
    doc_count = documents.count()

    # Show the correct page....
    if doc_count == 0:
        # No results for an otherwise valid citation.
        response = render_to_response(
            'casepage/citation_redirect_info_page.html',
            {
                'none_found': True,
                'citation_str': citation_str,
                'private': True,
            },
            RequestContext(request),
            #status=404,
        )
        response.status_code = 404
        return response
    elif doc_count == 1:
        # Total success. Redirect to correct location.
        return HttpResponsePermanentRedirect(documents[0].get_absolute_url())
    else:
        # Multiple results. Show them.
        response = render_to_response(
            'casepage/citation_redirect_info_page.html',
            {
                'too_many': True,
                'citation_str': citation_str,
                'documents': documents,
                'private': True,
            },
            RequestContext(request),
            #status=300,
        )
        response.status_code = 300
        return response