Example #1
def cleaner(simulate=False, verbose=False):
    """Fixes the titles of cases where the name is untitle disposition.

    Basically, the algorithm here is to find all cases with the error, then
    open each in Firefox one by one. After each case is opened, a prompt will
    allow the case name to be typed in, and it will be corrected on the site.

    These corrections will go live immediately, but will require a reindex to
    be live in the search system.
    """
    queryset = Document.search.query('@casename "unpublished disposition"')
    docs = queryset.set_options(
        mode="SPH_MATCH_EXTENDED2").order_by("-date_filed")
    if verbose:
        print "%s results found." % (docs.count())

    # Must slice here, or else only get top 20 results
    for doc in docs[0:docs.count()]:
        if doc.citation.caseNameFull.lower() == "unpublished disposition":
            # Only do each case once, since the index isn't updated until
            # later, and I may run this script many times.
            print doc.download_url
            casename = raw_input("Case name: ")
            doc.citation.caseNameFull = casename
            doc.citation.caseNameShort = trunc(casename, 100)
            doc.citation.slug = trunc(slugify(casename), 50)
            doc.precedential_status = "Unpublished"
            if not simulate:
                doc.citation.save()
                doc.save()
            print ""
Example #2
def view_audio_file(request, pk, _):
    """Using the ID, return the oral argument page.

    We also test if the item is a favorite and send data as such.
    """
    af = get_object_or_404(Audio, pk=pk)
    title = "Oral Argument for " + trunc(af.case_name, 100)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(audio_id=af.pk, users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user
        favorite_form = FavoriteForm(
            initial={
                'audio_id': af.pk,
                'name': trunc(af.docket.case_name, 100, ellipsis='...'),
            })

    return render_to_response(
        'audio/oral_argument.html', {
            'title': title,
            'af': af,
            'favorite_form': favorite_form,
            'get_string': get_string,
            'private': af.blocked,
        }, RequestContext(request))
Example #3
def view_audio_file(request, pk, _):
    """Using the ID, return the oral argument page.

    We also test if the item is a favorite and send data as such.
    """
    af = get_object_or_404(Audio, pk=pk)
    title = "Oral Argument for " + trunc(af.case_name, 100)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(audio_id=af.pk, users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user
        favorite_form = FavoriteForm(
            initial={
                'audio_id': af.pk,
                'name': trunc(af.docket.case_name, 100, ellipsis='...'),
            }
        )

    return render_to_response(
        'audio/oral_argument.html',
        {'title': title,
         'af': af,
         'favorite_form': favorite_form,
         'get_string': get_string,
         'private': af.blocked,
         },
        RequestContext(request)
    )
Example #4
def cleaner(simulate=False, verbose=False):
    """Fixes the titles of cases where the name is untitle disposition.

    Basically, the algorithm here is to find all cases with the error, then
    open each in Firefox one by one. After each case is opened, a prompt will
    allow the case name to be typed in, and it will be corrected on the site.

    These corrections will go live immediately, but will require a reindex to
    be live in the search system.
    """
    queryset = Document.search.query('@casename "unpublished disposition"')
    docs = queryset.set_options(mode="SPH_MATCH_EXTENDED2").order_by('-date_filed')
    if verbose:
        print "%s results found." % (docs.count())

    # Must slice here, or else only get top 20 results
    for doc in docs[0:docs.count()]:
        if doc.citation.caseNameFull.lower() == "unpublished disposition":
            # Only do each case once, since the index isn't updated until
            # later, and I may run this script many times.
            print doc.download_url
            casename = raw_input("Case name: ")
            doc.citation.caseNameFull = casename
            doc.citation.caseNameShort = trunc(casename, 100)
            doc.citation.slug = trunc(slugify(casename), 50)
            doc.precedential_status = "Unpublished"
            if not simulate:
                doc.citation.save()
                doc.save()
            print ""
Example #5
def view_opinion_citations(request, pk, _):
    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (
        trunc(doc.citation.case_name, 100), make_citation_string(doc))

    # Get list of cases we cite, ordered by citation count
    citing_opinions = doc.citation.citing_opinions.select_related(
        'citation', 'docket__court').order_by('-citation_count', '-date_filed')

    paginator = Paginator(citing_opinions, 20, orphans=2)
    page = request.GET.get('page')
    try:
        citing_opinions = paginator.page(page)
    except (TypeError, PageNotAnInteger):
        # TypeError can be removed in Django 1.4, where it properly will be
        # caught upstream.
        citing_opinions = paginator.page(1)
    except EmptyPage:
        citing_opinions = paginator.page(paginator.num_pages)

    private = False
    if doc.blocked:
        private = True
    else:
        for case in citing_opinions.object_list:
            if case.blocked:
                private = True
                break

    return render_to_response('casepage/view_opinion_citations.html',
                              {'title': title,
                               'doc': doc,
                               'private': private,
                               'citing_opinions': citing_opinions},
                              RequestContext(request))
Example #6
def view_authorities(request, pk, case_name):
    pk = ascii_to_num(pk)

    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (trunc(doc.citation.case_name, 100), make_citation_string(doc))

    # Ordering by case name is the norm.
    authorities = doc.cases_cited.all().select_related(
        'document').order_by('case_name')

    private = False
    if doc.blocked:
        private = True
    else:
        for case in authorities:
            if case.parent_documents.all()[0].blocked:
                private = True
                break

    return render_to_response('view_case_authorities.html',
                              {'title': title,
                               'doc': doc,
                               'private': private,
                               'authorities': authorities},
                              RequestContext(request))
Example #7
 def test_solr_ingestion_and_deletion(self):
     """Do items get added to the Solr index when they are ingested?"""
     site = test_opinion_scraper.Site().parse()
     path = os.path.join(settings.INSTALL_ROOT, 'alert',
                         site.download_urls[0])  # a simple PDF
     with open(path) as f:
         content = f.read()
         cf = ContentFile(content)
         extension = get_extension(content)
     cite = Citation()
     cite.save(index=False)
     docket = Docket(
         court=self.court,
         case_name=site.case_names[0],
     )
     docket.save()
     doc = Document(
         date_filed=site.case_dates[0],
         docket=docket,
         citation=cite,
     )
     file_name = trunc(site.case_names[0].lower(), 75) + extension
     doc.local_path.save(file_name, cf, save=False)
     doc.save(index=False)
     extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
     response = self.si.raw_query(**{
         'q': 'supreme',
         'caller': 'scraper_test',
     }).execute()
     count = response.result.numFound
     self.assertEqual(
         count, 1,
         "There were %s items found when there should have been 1" % count)
Example #8
 def test_trunc(self):
     """Does trunc give us the results we expect?"""
     s = "Henry wants apple."
     tests = (
         # Simple case
         {"length": 13, "result": "Henry wants"},
         # Off by one cases
         {"length": 4, "result": "Henr"},
         {"length": 5, "result": "Henry"},
         {"length": 6, "result": "Henry"},
         # Do we include the length of the ellipsis when measuring?
         {"length": 12, "ellipsis": "...", "result": "Henry..."},
         # What happens when an alternate ellipsis is used instead?
         {"length": 15, "ellipsis": "....", "result": "Henry wants...."},
         # Do we cut properly when no spaces are found?
         {"length": 2, "result": "He"},
         # Do we cut properly when ellipsizing if no spaces found?
         {"length": 6, "ellipsis": "...", "result": "Hen..."},
         # Do we return the whole s when length >= s?
         {"length": 50, "result": s},
     )
     for test_dict in tests:
         result = trunc(s=s, length=test_dict["length"], ellipsis=test_dict.get("ellipsis", None))
         self.assertEqual(
             result,
             test_dict["result"],
             msg="Failed with dict: %s.\n" "%s != %s" % (test_dict, result, test_dict["result"]),
         )
         self.assertTrue(
             len(result) <= test_dict["length"],
             msg="Failed with dict: %s.\n" "%s is longer than %s" % (test_dict, result, test_dict["length"]),
         )
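The assertions above pin down trunc's contract: the result never exceeds length, cuts prefer a word boundary, and any ellipsis counts toward the limit. Below is a minimal sketch reconstructed from those tests alone; the helper actually used by these snippets may differ in its details.

def trunc(s, length, ellipsis=None):
    """Cut s down to at most `length` characters, preferring a word boundary.

    Sketch inferred from test_trunc above, not the project's real code.
    """
    if len(s) <= length:
        return s
    ell = ellipsis or ''
    budget = length - len(ell)  # the ellipsis counts against the limit
    candidate = s[:budget]
    # If the cut landed mid-word and an earlier space exists, back up to it.
    if s[budget:budget + 1] not in ('', ' ') and ' ' in candidate:
        candidate = candidate[:candidate.rindex(' ')]
    return candidate.rstrip() + ell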
Example #9
def view_opinion_citations(request, pk, _):
    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (
        trunc(doc.citation.case_name, 100), make_citation_string(doc))

    # Get list of cases we cite, ordered by citation count
    citing_opinions = doc.citation.citing_opinions.select_related(
        'citation', 'docket__court').order_by('-citation_count', '-date_filed')

    paginator = Paginator(citing_opinions, 20, orphans=2)
    page = request.GET.get('page')
    try:
        citing_opinions = paginator.page(page)
    except (TypeError, PageNotAnInteger):
        # TypeError can be removed in Django 1.4, where it properly will be
        # caught upstream.
        citing_opinions = paginator.page(1)
    except EmptyPage:
        citing_opinions = paginator.page(paginator.num_pages)

    private = False
    if doc.blocked:
        private = True
    else:
        for case in citing_opinions.object_list:
            if case.blocked:
                private = True
                break

    return render_to_response('casepage/view_opinion_citations.html',
                              {'title': title,
                               'doc': doc,
                               'private': private,
                               'citing_opinions': citing_opinions},
                              RequestContext(request))
Example #10
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text successfully, including OCR?"""
        site = test_scraper.Site().parse()

        test_strings = ['supreme',
                        'intelligence',
                        'indiana',
                        'reagan',
                        'indiana',
                        'fidelity']
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation(case_name=site.case_names[i])
            cite.save(index=False)
            doc = Document(date_filed=site.case_dates[i],
                           court=self.court,
                           citation=cite)
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #11
 def test_solr_ingestion_and_deletion(self):
     """Do items get added to the Solr index when they are ingested?"""
     site = test_opinion_scraper.Site().parse()
     path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[0])  # a simple PDF
     with open(path) as f:
         content = f.read()
         cf = ContentFile(content)
         extension = get_extension(content)
     cite = Citation()
     cite.save(index=False)
     docket = Docket(
         court=self.court,
         case_name=site.case_names[0],
     )
     docket.save()
     doc = Document(
         date_filed=site.case_dates[0],
         docket=docket,
         citation=cite,
     )
     file_name = trunc(site.case_names[0].lower(), 75) + extension
     doc.local_path.save(file_name, cf, save=False)
     doc.save(index=False)
     extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
     response = self.si.raw_query(**{'q': 'supreme', 'caller': 'scraper_test',}).execute()
     count = response.result.numFound
     self.assertEqual(count, 1, "There were %s items found when there should have been 1" % count)
Example #12
def view_opinion(request, pk, _):
    """Using the ID, return the document.

    We also test if the document ID is a favorite for the user, and send data
    as such. If it's a favorite, we send the bound form for the favorite so
    it can populate the form on the page. If it is not a favorite, we send the
    unbound form.
    """
    # Look up the court, document, title and favorite information
    doc = get_object_or_404(Document, pk=pk)
    citation_string = make_citation_string(doc)
    title = '%s, %s' % (trunc(doc.citation.case_name, 100), citation_string)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(doc_id=doc.pk, users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user
        favorite_form = FavoriteForm(
            initial={
                'doc_id': doc.pk,
                'name': trunc(doc.citation.case_name, 100, ellipsis='...'),
            }
        )

    # get most influential opinions that cite this opinion
    cited_by_trunc = doc.citation.citing_opinions.select_related(
        'citation').order_by('-citation_count', '-date_filed')[:5]

    authorities_trunc = doc.cases_cited.all().select_related(
        'document').order_by('case_name')[:5]
    authorities_count = doc.cases_cited.all().count()

    return render_to_response(
        'casepage/view_opinion.html',
        {'title': title,
         'citation_string': citation_string,
         'doc': doc,
         'favorite_form': favorite_form,
         'get_string': get_string,
         'private': doc.blocked,
         'cited_by_trunc': cited_by_trunc,
         'authorities_trunc': authorities_trunc,
         'authorities_count': authorities_count},
        RequestContext(request)
    )
Example #13
def view_opinion(request, pk, _):
    """Using the ID, return the document.

    We also test if the document ID is a favorite for the user, and send data
    as such. If it's a favorite, we send the bound form for the favorite so
    it can populate the form on the page. If it is not a favorite, we send the
    unbound form.
    """
    # Look up the court, document, title and favorite information
    doc = get_object_or_404(Document, pk=pk)
    citation_string = make_citation_string(doc)
    title = "%s, %s" % (trunc(doc.citation.case_name, 100), citation_string)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(doc_id=doc.pk, users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user
        favorite_form = FavoriteForm(
            initial={"doc_id": doc.pk, "name": trunc(doc.citation.case_name, 100, ellipsis="...")}
        )

    # get most influential opinions that cite this opinion
    cited_by_trunc = doc.citation.citing_opinions.select_related("citation").order_by("-citation_count", "-date_filed")[
        :5
    ]

    authorities_trunc = doc.cases_cited.all().select_related("document").order_by("case_name")[:5]
    authorities_count = doc.cases_cited.all().count()

    return render_to_response(
        "casepage/view_opinion.html",
        {
            "title": title,
            "citation_string": citation_string,
            "doc": doc,
            "favorite_form": favorite_form,
            "get_string": get_string,
            "private": doc.blocked,
            "cited_by_trunc": cited_by_trunc,
            "authorities_trunc": authorities_trunc,
            "authorities_count": authorities_count,
        },
        RequestContext(request),
    )
Example #14
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

    Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print "        %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
Example #15
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

    Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print "        %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.

    save_doc_and_cite(target, index=False)
Example #16
def fixer(simulate=False, verbose=False):
    """If a Citation lacks a slug, we make one for it."""
    citations = Citation.objects.filter(slug=None)

    for citation in citations:
        if verbose:
            print "Fixing %s" % citation
        citation.slug = trunc(slugify(citation.case_name), 50)
        if not simulate:
            citation.save()
Example #17
def fixer(simulate=False, verbose=False):
    """If a Citation lacks a slug, we make one for it."""
    citations = Citation.objects.filter(slug=None)

    for citation in citations:
        if verbose:
            print "Fixing %s" % citation
        citation.slug = trunc(slugify(citation.case_name), 50)
        if not simulate:
            citation.save()
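Several examples above build slugs the same way: slugify the case name, then cap the result at 50 characters with trunc. A short illustration with an invented case name; the slugify import path is an assumption (Django also ships it as django.utils.text.slugify):

from django.template.defaultfilters import slugify  # assumed import path

case_name = "Lebron v. National Railroad Passenger Corp. (Amtrak)"
slug = trunc(slugify(case_name), 50)
# 'lebron-v-national-railroad-passenger-corp-amtrak' -- 48 characters, so no
# cut is needed; a longer name would simply be capped at 50.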
Example #18
    def save(self, index=True, *args, **kwargs):
        """
        Note that there is a pre_save receiver below.
        """
        created = self.pk is None
        self.slug = trunc(slugify(self.case_name), 50)
        super(Citation, self).save(*args, **kwargs)

        # We only do this on update, not creation
        if index and not created:
            # Import is here to avoid looped import problem
            from search.tasks import update_cite
            update_cite.delay(self.pk)
Example #19
    def save(self, index=True, force_commit=False, *args, **kwargs):
        """
        Note that there is a pre_save receiver below.
        """
        created = self.pk is None
        self.slug = trunc(slugify(self.case_name), 50)
        super(Citation, self).save(*args, **kwargs)

        # We only do this on update, not creation
        if index and not created:
            # Import is here to avoid looped import problem
            from search.tasks import update_cite
            update_cite.delay(self.pk, force_commit)
Example #20
    def save(self, index=True, *args, **kwargs):
        """
        Create the slug (used in the URL) from the case name, but only if
        this is the first time it has been saved.
        """
        created = self.pk is None
        if created:
            # it's the first time it has been saved; generate the slug stuff
            self.slug = trunc(slugify(self.case_name), 50)
        super(Citation, self).save(*args, **kwargs)

        # We only do this on update, not creation
        if index and not created:
            # Import is here to avoid looped import problem
            from search.tasks import update_cite
            update_cite.delay(self.pk)
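The difference from Examples #18 and #19 is that this variant only generates the slug on the first save. An illustrative session, assuming a configured Django project (the case name is invented):

cite = Citation(case_name="Roe v. Wade")
cite.save(index=False)        # slug becomes 'roe-v-wade'

cite.case_name = "Roe v. Wade (On Rehearing)"
cite.save(index=False)        # slug stays 'roe-v-wade' here; the variants in
                              # Examples #18 and #19 would regenerate it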
Example #21
def view_authorities(request, pk, _):
    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (trunc(doc.citation.case_name,
                              100), make_citation_string(doc))

    # Ordering by case name is the norm.
    authorities = doc.cases_cited.all().select_related('document').order_by(
        'case_name')

    private = False
    if doc.blocked:
        private = True
    else:
        for case in authorities:
            if case.parent_documents.all()[0].blocked:
                private = True
                break
Example #22
def view_authorities(request, pk, _):
    doc = get_object_or_404(Document, pk=pk)
    title = "%s, %s" % (trunc(doc.citation.case_name, 100), make_citation_string(doc))

    # Ordering by case name is the norm.
    authorities = doc.cases_cited.all().select_related("document").order_by("case_name")

    private = False
    if doc.blocked:
        private = True
    else:
        for case in authorities:
            if case.parent_documents.all()[0].blocked:
                private = True
                break

    return render_to_response(
        "casepage/view_opinion_authorities.html",
        {"title": title, "doc": doc, "private": private, "authorities": authorities},
        RequestContext(request),
    )
Example #23
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the related
    meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = [
        'avconv', '-i', path_to_original, '-ac', '1', '-ar', '22050', '-ab',
        '48k', path_to_tmp_location
    ]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'r') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL',
                     court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
Example #24
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the related
    meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = ['avconv', '-i', path_to_original,
                      '-ac', '1',
                      '-ar', '22050',
                      '-ab', '48k',
                      path_to_tmp_location]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'r') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
Example #25
 def test_trunc(self):
     """Does trunc give us the results we expect?"""
     s = 'Henry wants apple.'
     tests = (
         # Simple case
         {'length': 13, 'result': 'Henry wants'},
         # Off by one cases
         {'length': 4, 'result': 'Henr'},
         {'length': 5, 'result': 'Henry'},
         {'length': 6, 'result': 'Henry'},
         # Do we include the length of the ellipsis when measuring?
         {'length': 12, 'ellipsis': '...', 'result': 'Henry...'},
         # What happens when an alternate ellipsis is used instead?
         {'length': 15, 'ellipsis': '....', 'result': 'Henry wants....'},
         # Do we cut properly when no spaces are found?
         {'length': 2, 'result': 'He'},
         # Do we cut properly when ellipsizing if no spaces found?
         {'length': 6, 'ellipsis': '...', 'result': 'Hen...'},
         # Do we return the whole s when length >= s?
         {'length': 50, 'result': s}
     )
     for test_dict in tests:
         result = trunc(
             s=s,
             length=test_dict['length'],
             ellipsis=test_dict.get('ellipsis', None),
         )
         self.assertEqual(
             result,
             test_dict['result'],
             msg='Failed with dict: %s.\n'
                 '%s != %s' % (test_dict, result, test_dict['result'])
         )
         self.assertTrue(
             len(result) <= test_dict['length'],
             msg="Failed with dict: %s.\n"
                 "%s is longer than %s" %
                 (test_dict, result, test_dict['length'])
         )
Example #26
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #27
def update_dockets_if_citation_case_name_changed(sender, instance, **kwargs):
    """Updates the docket.case_name field for all associated Dockets when the
    Citation.case_name field changes.

     - From http://stackoverflow.com/a/7934958/64911.

    There are a few alternative ways to implement this that don't hit the database
    an extra time (as this one does). However, those solutions are longer and more
    controversial, so I chose this one based on the fact that we rarely change
    objects once they are saved and the performance penalty is probably acceptable.
    """
    try:
        cite = Citation.objects.get(pk=instance.pk)
    except Citation.DoesNotExist:
        # Object is new
        pass
    else:
        if not cite.case_name == instance.case_name:
            # Update the associated dockets
            for d in cite.parent_documents.all():
                d.docket.case_name = instance.case_name
                d.docket.slug = trunc(slugify(instance.case_name), 50)
                d.docket.save()
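The docstring above points at the pre_save pattern from the linked Stack Overflow answer. For context, a receiver like this is typically wired up roughly as follows; this is a sketch only, and the codebase may instead use the @receiver decorator or connect it elsewhere:

from django.db.models.signals import pre_save

# Assumed wiring: Citation is the sender whose old row the receiver re-fetches.
pre_save.connect(update_dockets_if_citation_case_name_changed, sender=Citation)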
Example #28
def update_dockets_if_citation_case_name_changed(sender, instance, **kwargs):
    """Updates the docket.case_name field for all associated Dockets when the
    Citation.case_name field changes.

     - From http://stackoverflow.com/a/7934958/64911.

    There are a few alternative ways to implement this that don't hit the database
    an extra time (as this one does). However, those solutions are longer and more
    controversial, so I chose this one based on the fact that we rarely change
    objects once they are saved and the performance penalty is probably acceptable.
    """
    try:
        cite = Citation.objects.get(pk=instance.pk)
    except Citation.DoesNotExist:
        # Object is new
        pass
    else:
        if not cite.case_name == instance.case_name:
            # Update the associated dockets
            for d in cite.parent_documents.all():
                d.docket.case_name = instance.case_name
                d.docket.slug = trunc(slugify(instance.case_name), 50)
                d.docket.save()
Example #29
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                sha1_hash = hashlib.sha1(r.content).hexdigest()
                if court_str == 'nev' and site.precedential_statuses[
                        i] == 'Unpublished':
                    # Nevada's non-precedential cases have different SHA1 sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=site.download_urls[i],
                        lookup_by='download_url')
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=sha1_hash,
                        lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk,
                                              callback=subtask(extract_by_ocr),
                                              citation_countdown=random_delay)

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=doc.pk, name=site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #30
 def title(self, obj):
     return "Cases citing %s, ordered by filing date" % trunc(str(obj.citation), 50)
Example #31
 def title(self, obj):
     return "Cases Citing %s, Ordered by Filing Date" % \
            trunc(obj.citation.case_name, 50)
Example #32
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # skip to the next queue item

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue  # skip to the next queue item

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue  # skip to the next queue item

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
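The docstring says download_and_save() is meant to run in many threads against a shared queue. A hedged sketch of the kind of driver that implies; the thread count and the queue setup are assumptions, not taken from the snippet:

import threading
from Queue import Queue  # Python 2 stdlib; it is 'queue' on Python 3

queue = Queue()          # the module-level queue that download_and_save() reads
num_threads = 4          # illustrative setting

for _ in range(num_threads):
    t = threading.Thread(target=download_and_save)
    t.daemon = True      # let the process exit once the work is drained
    t.start()

# ... enqueue {'url': ..., 'case_name': ..., 'date_argued': ...} dicts here ...

queue.join()             # blocks until every item has been marked task_done()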
Example #33
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i], site.cookies, site._get_adapter_instance(), method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level="WARNING", court=court, message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                if isinstance(content, unicode):
                    sha1_hash = hashlib.sha1(content.encode("utf-8")).hexdigest()
                else:
                    sha1_hash = hashlib.sha1(content).hexdigest()
                if court_str == "nev" and site.precedential_statuses[i] == "Unpublished":
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=site.download_urls[i], lookup_by="download_url"
                    )
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=sha1_hash, lookup_by="sha1"
                    )

                if onwards == "CONTINUE":
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == "BREAK":
                    # It's a duplicate, and we hit a date or dup_count
                    # threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == "CARRY_ON":
                    # Not a duplicate, carry on
                    logger.info("Adding new document found at: %s" % site.download_urls[i].encode("utf-8"))
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = "Unable to save binary to disk. Deleted " "document: % s.\n % s" % (
                            site.case_names[i],
                            traceback.format_exc(),
                        )
                        logger.critical(msg.encode("utf-8"))
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk, callback=subtask(extract_by_ocr), citation_countdown=random_delay)

                    logger.info(
                        "Successfully added doc {pk}: {name}".format(pk=doc.pk, name=site.case_names[i].encode("utf-8"))
                    )

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #34
def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i], site._get_cookies())
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue
            # Only clean the content once the download is known to have succeeded.
            clean_content = site._cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1 sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' % site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" % (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
Example #35
 def test_trunc(self):
     """Does trunc give us the results we expect?"""
     s = 'Henry wants apple.'
     tests = (
         # Simple case
         {
             'length': 13,
             'result': 'Henry wants'
         },
         # Off by one cases
         {
             'length': 4,
             'result': 'Henr'
         },
         {
             'length': 5,
             'result': 'Henry'
         },
         {
             'length': 6,
             'result': 'Henry'
         },
         # Do we include the length of the ellipsis when measuring?
         {
             'length': 12,
             'ellipsis': '...',
             'result': 'Henry...'
         },
         # What happens when an alternate ellipsis is used instead?
         {
             'length': 15,
             'ellipsis': '....',
             'result': 'Henry wants....'
         },
         # Do we cut properly when no spaces are found?
         {
             'length': 2,
             'result': 'He'
         },
         # Do we cut properly when ellipsizing if no spaces found?
         {
             'length': 6,
             'ellipsis': '...',
             'result': 'Hen...'
         },
         # Do we return the whole s when length >= s?
         {
             'length': 50,
             'result': s
         })
     for test_dict in tests:
         result = trunc(
             s=s,
             length=test_dict['length'],
             ellipsis=test_dict.get('ellipsis', None),
         )
         self.assertEqual(result,
                          test_dict['result'],
                          msg='Failed with dict: %s.\n'
                          '%s != %s' %
                          (test_dict, result, test_dict['result']))
         self.assertTrue(len(result) <= test_dict['length'],
                         msg="Failed with dict: %s.\n"
                         "%s is longer than %s" %
                         (test_dict, result, test_dict['length']))
Example #36
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # skip to the next queue item

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue  # skip to the next queue item

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))
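The docstring above describes a plain producer/consumer setup: one shared queue of scraped items and several worker threads all looping in download_and_save. A minimal sketch of how those workers could be wired up (the thread count, the start_workers helper, and the scraped_items iterable are assumptions for illustration, not taken from this codebase):

import threading
from Queue import Queue  # Python 2 standard library, matching the code above

queue = Queue()

def start_workers(num_threads=4):
    """Spawn daemon threads that each run the download_and_save loop."""
    for _ in range(num_threads):
        t = threading.Thread(target=download_and_save)
        t.daemon = True  # let the process exit once the main thread finishes
        t.start()

# Producer side (hypothetical): fill the queue, start the workers, then block
# until every item has been marked done with queue.task_done().
# for item in scraped_items:
#     queue.put(item)
# start_workers()
# queue.join()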
Example #37
0
        '48k', path_to_tmp_location
    ]
    try:
        _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
              (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    set_mp3_meta_data(af, path_to_tmp_location)

    af.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
            af.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (af.pk, traceback.format_exc())
            ErrorLog(log_level='CRITICAL', court=af.docket.court,
                     message=msg).save()

    af.processing_complete = True
    af.save()
    os.remove(path_to_tmp_location)
Example #38
0
            af.judges = item['judges']
        if item['docket_number']:
            af.docket_number = item['docket_number']

        court = Court.objects.get(pk=item['court_code'])
        docket.court = court

        # Fix the files. First save the location of the old files.
        original_local_path = af.local_path_original_file.path
        original_mp3_path = af.local_path_mp3.path

        # Create a new file with the contents of the old and a corrected
        # name. This is only in memory for the moment.
        cf = ContentFile(af.local_path_original_file.read())
        extension = '.' + af.local_path_original_file.path.rsplit('.', 1)[1]
        file_name = trunc(item['case_name'].lower(), 75) + extension
        af.local_path_original_file.save(file_name, cf, save=False)

        # Create a new mp3 file with the new contents
        cf = ContentFile(af.local_path_mp3.read())
        file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
        af.local_path_mp3.save(file_name, cf, save=False)

        # Save things so they can be referenced in a sec.
        docket.save()
        af.save(index=False)

        # Update the ID3 information and duration data.
        new_mp3_path = af.local_path_mp3.path
        logger.info("Updating mpr at: %s" % new_mp3_path)
        set_mp3_meta_data(af, new_mp3_path)
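A hypothetical walk-through of the renaming above (the case name is invented for illustration, and the name finally stored on disk may be further normalised by the storage backend); the mp3 gets the same treatment with a 72-character budget and a '_cl.mp3' suffix:

# e.g., for item['case_name'] == 'Smith v. Jones' and a '.wma' original:
# extension  -> '.wma'
# file_name  -> 'smith v. jones.wma'   # lower-cased, kept within 75 chars by trunc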
Example #39
0
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
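        # A worked reading of the example in the comment above:
        #   'opinions.united_states.federal.ca9_u'.split('.')[-1]  ->  'ca9_u'
        #   'ca9_u'.split('_')[0]                                   ->  'ca9'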
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit(
                                '.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        audio_file.local_path_original_file.save(file_name,
                                                                 cf,
                                                                 save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async((audio_file.pk, ),
                                                   countdown=random_delay)

                    logger.info("Successfully added audio file %s: %s" %
                                (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #40
0
def save(self, *args, **kwargs):
    self.slug = trunc(slugify(self.case_name), 50)
    super(Docket, self).save(*args, **kwargs)
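A hypothetical illustration of the slug this override produces (case name invented for the example; assumes Django's slugify and the trunc behaviour exercised in the tests above):

# d = Docket(case_name='Lee v. Oregon State Board of Higher Education')
# d.save()
# d.slug  ->  'lee-v-oregon-state-board-of-higher-education'  # already under 50 chars, so trunc leaves it whole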
Example #41
0
        _ = subprocess.check_output(
            avconv_command,
            stderr=subprocess.STDOUT
        )
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
              (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    set_mp3_meta_data(af, path_to_tmp_location)

    af.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
            af.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (af.pk, traceback.format_exc())
            ErrorLog(log_level='CRITICAL', court=af.docket.court,
                     message=msg).save()

    af.processing_complete = True
    af.save()
    os.remove(path_to_tmp_location)
Example #42
0
def title(self, obj):
    return "Cases Citing %s, Ordered by Filing Date" % \
           trunc(str(obj.citation.case_name), 50)
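For a short case name the trunc call is a no-op and the feed title reads, hypothetically:

# title(obj)  ->  "Cases Citing Roe v. Wade, Ordered by Filing Date"
# (a longer case name would be cut to at most 50 characters before interpolation)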
Example #43
0
def save(self, *args, **kwargs):
    self.slug = trunc(slugify(self.case_name), 50)
    super(Docket, self).save(*args, **kwargs)
Example #44
0
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i],
                    site.cookies,
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(r.content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' % site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    audio_file.docket = docket

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        audio_file.local_path_original_file.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async(
                        (audio_file.pk,),
                        countdown=random_delay
                    )

                    logger.info("Successfully added audio file %s: %s" % (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #45
0
    #  - constant bit rate of 48kbps (-ab 48k)
    avconv_command = ['avconv', '-i', path_to_original,
                      '-ac', '1',
                      '-ar', '22050',
                      '-ab', '48k',
                      path_to_tmp_location]
    try:
        output = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
              (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            ErrorLog(log_level='CRITICAL', court=audio_file.docket.court,
                     message=msg).save()
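Wrapped up as a helper, the conversion step above looks roughly like this; this is a sketch only, assuming avconv is on the PATH and reusing exactly the flags shown in the command above (one channel, 22,050 Hz sample rate, 48 kbps constant bit rate):

import subprocess
import traceback

def convert_to_mp3(path_to_original, path_to_tmp_location):
    """Downsample an argument recording to a small mono MP3 (sketch only)."""
    avconv_command = [
        'avconv', '-i', path_to_original,
        '-ac', '1',        # one audio channel (mono)
        '-ar', '22050',    # 22,050 Hz sample rate
        '-ab', '48k',      # 48 kbps constant bit rate
        path_to_tmp_location,
    ]
    try:
        subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
              (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise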