def fixer(simulate=False, verbose=False):
    """Strip the leading parenthetical from case names that start with "(".

    Selects every Document whose Citation.case_name begins with an opening
    parenthesis, removes the first parenthesized group from the name and runs
    the remainder through clean_string, harmonize and titlecase.

    :param simulate: When True, nothing is written to the database.
    :param verbose: When True, print each document and its new case name.
    """
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '(%%';''')

    for doc in docs:
        # The query filters on Citation.case_name and only the citation is
        # saved below, so the name is read from and written to the citation.
        citation = doc.citation
        case_name = citation.case_name

        # Special cases that are left untouched.
        if 'Klein' in case_name or 'in re' in case_name.lower():
            continue

        if case_name == "(White) v. Gray":
            # Here the parenthetical is the party name itself, so dropping it
            # would leave " v. Gray"; only the parentheses are removed.
            new_case_name = "White v. Gray"
        else:
            # Otherwise, we nuke the leading parens.
            stripped = re.sub(r'\(.*?\)', '', case_name, count=1)
            new_case_name = titlecase(harmonize(clean_string(stripped)))

        if verbose:
            print("Fixing document %s: %s" % (doc.pk, doc))
            print("        New for %s: %s\n" % (doc.pk, new_case_name))

        if not simulate:
            citation.case_name = new_case_name
            citation.save()
def fixer(simulate=False, verbose=False):
    """Re-run harmonize/clean_string on case names that begin with a slash.

    :param simulate: When True, no citation is modified or saved.
    :param verbose: When True, print every document that is visited.
    """
    rows = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '/%%';''')
    for document in rows:
        if verbose:
            print("Fixing document %s: %s" % (document.pk, document))

        if simulate:
            continue

        cleaned_name = harmonize(clean_string(document.citation.case_name))
        document.citation.case_name = cleaned_name
        document.citation.save()
Пример #3
0
 def _clean_attributes(self):
     """Replace every non-None attribute in _all_attrs with a cleaned list.

     Download URLs are only stripped of surrounding whitespace. For every
     other attribute, string values go through clean_string, and values of
     case_names / docket_numbers are additionally passed to harmonize.
     """
     def scrub(attr, value):
         # URLs must not be altered beyond trimming whitespace.
         if attr == "download_urls":
             return value.strip()
         if isinstance(value, basestring):
             value = clean_string(value)
         if attr in ["case_names", "docket_numbers"]:
             value = harmonize(value)
         return value

     for attr in self._all_attrs:
         values = getattr(self, attr)
         if values is None:
             continue
         setattr(self, attr, [scrub(attr, value) for value in values])
Пример #4
0
 def _clean_attributes(self):
     """Clean the values of each attribute named in _all_attrs, in place.

     Attributes whose value is None are skipped. Download URLs are stripped
     of surrounding whitespace; elsewhere strings are passed to clean_string
     and datetime values are reduced to their date. Values of case_names and
     docket_numbers are then passed to harmonize.
     """
     for attr in self._all_attrs:
         raw_values = getattr(self, attr)
         if raw_values is None:
             continue

         is_url_attr = attr == 'download_urls'
         needs_harmonizing = attr in ['case_names', 'docket_numbers']

         tidy_values = []
         for value in raw_values:
             if is_url_attr:
                 value = value.strip()
             else:
                 if isinstance(value, six.string_types):
                     value = clean_string(value)
                 elif isinstance(value, datetime):
                     value = value.date()
                 if needs_harmonizing:
                     value = harmonize(value)
             tidy_values.append(value)

         setattr(self, attr, tidy_values)
Пример #5
0
def get_case_name(complete_html_tree, case_path):
    """Extract the case name from the text of the page's <title>.

    The title is cut at its last '-' and then at the last ',' of what
    remains, and the result is title-cased, cleaned and harmonized. When no
    name can be extracted, the name is looked up in `fixes` under `case_path`;
    failing that, and if 'input_case_names' is in DEBUG, the user is prompted
    for a name, which is recorded with add_fix.

    :param complete_html_tree: Tree object queried with .xpath().
    :param case_path: Path of the case file; used as the key into `fixes` and
        opened in Firefox when 'firefox' is in DEBUG.
    :return: The case name (empty-derived if no fallback supplied one).
    """
    path = '//head/title/text()'
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist. Court, ED Pennsylvania 1979'
    titles = complete_html_tree.xpath(path)
    # A page without any title text yields no results; treat that the same as
    # an empty title so the fallbacks below run instead of an IndexError.
    s = titles[0].rsplit('-', 1)[0].rsplit(',', 1)[0] if titles else ''
    # yields 'In re 221A Holding Corp., Inc'
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        try:
            case_name = fixes[case_path]['case_name']
        except KeyError:
            if 'input_case_names' in DEBUG:
                if 'firefox' in DEBUG:
                    subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                input_case_name = raw_input('  No case name found. What should be here? ')
                input_case_name = unicode(input_case_name)
                add_fix(case_path, {'case_name': input_case_name})
                case_name = input_case_name

    if 'case_name' in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Пример #6
0
def get_case_name(complete_html_tree, case_path):
    """Return the case name taken from the page's <title> text.

    Everything after the last "-" of the title, and then everything after the
    last "," of the remainder, is dropped; what is left is title-cased,
    cleaned and harmonized. If that leaves nothing, `fixes[case_path]` is
    consulted, and if it has no entry and "input_case_names" is in DEBUG the
    user is asked for the name, which is stored through add_fix.

    :param complete_html_tree: Tree object queried with .xpath().
    :param case_path: Path of the case file; key into `fixes`, and the file
        shown in Firefox when "firefox" is in DEBUG.
    :return: The resulting case name.
    """
    path = "//head/title/text()"
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist. Court, ED Pennsylvania 1979'
    try:
        title = complete_html_tree.xpath(path)[0]
    except IndexError:
        # No title text at all: fall through to the "no case name" handling
        # below rather than crashing.
        title = ""
    s = title.rsplit("-", 1)[0].rsplit(",", 1)[0]
    # yields 'In re 221A Holding Corp., Inc'
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        try:
            case_name = fixes[case_path]["case_name"]
        except KeyError:
            if "input_case_names" in DEBUG:
                if "firefox" in DEBUG:
                    subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate()
                input_case_name = raw_input("  No case name found. What should be here? ")
                input_case_name = unicode(input_case_name)
                add_fix(case_path, {"case_name": input_case_name})
                case_name = input_case_name

    if "case_name" in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Пример #7
0
    def __init__(self, path):
        """Populate the parser's attributes for the file at `path`.

        The XML contents and case details are loaded first because the
        docket-level attributes are read from the case details node.

        :param path: Path handed to the loaders and used to build
            filepath_local and filepath_ia.
        """
        logger.info("Initializing parser for %s" % path)

        def text_of(tag):
            # String value of `tag` within the case details node.
            return self.get_str_from_node(self.case_details, tag)

        def date_of(tag):
            # Value of `tag` within the case details node, cast to a date.
            return self.get_datetime_from_node(
                self.case_details, tag, cast_to_date=True)

        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.document_count = self.get_document_count()

        # Docket attributes
        self.court = self.get_court()
        self.docket_number = text_of('docket_num')
        self.pacer_case_id = text_of('pacer_case_num')
        self.date_filed = date_of('date_case_filed')
        self.date_terminated = date_of('date_case_terminated')
        self.date_last_filing = date_of('date_last_filing')
        self.case_name = harmonize(text_of('case_name'))
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)
        self.cause = text_of('case_cause')
        self.nature_of_suit = text_of('nature_of_suit')
        self.jury_demand = text_of('jury_demand')
        self.jurisdiction_type = text_of('jurisdiction')
        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = self.set_blocked_fields()

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)
Пример #8
0
    def __init__(self, path):
        """Load the file at `path` and fill in the parser's attributes.

        :param path: Path handed to the loaders and used to build
            filepath_local and filepath_ia.
        """
        print("Doing %s" % path)

        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.document_count = self.get_document_count()

        # Docket attributes. The (attribute, node name) tables below are
        # processed in order, so attributes are set in a fixed sequence.
        self.court = self.get_court()

        for attr, node_name in (('docket_number', 'docket_num'),
                                ('pacer_case_id', 'pacer_case_num')):
            setattr(self, attr,
                    self.get_str_from_node(self.case_details, node_name))

        for attr, node_name in (('date_filed', 'date_case_filed'),
                                ('date_terminated', 'date_case_terminated'),
                                ('date_last_filing', 'date_last_filing')):
            setattr(self, attr,
                    self.get_datetime_from_node(
                        self.case_details, node_name, cast_to_date=True))

        raw_case_name = self.get_str_from_node(self.case_details, 'case_name')
        self.case_name = harmonize(raw_case_name)
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)

        for attr, node_name in (('cause', 'case_cause'),
                                ('nature_of_suit', 'nature_of_suit'),
                                ('jury_demand', 'jury_demand'),
                                ('jurisdiction_type', 'jurisdiction')):
            setattr(self, attr,
                    self.get_str_from_node(self.case_details, node_name))

        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = self.set_blocked_fields()

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)
Пример #9
0
def make_line_to_dict(row):
    """Convert one tab-separated line into a dict of cleaned fields.

    The URL is only stripped of surrounding whitespace. Every other string
    field is passed through clean_string, and the case name and docket number
    are additionally harmonized. The argument date is parsed into a date.

    :param row: A line with at least ten tab-separated columns, the ninth
        being a date formatted as YYYY-MM-DD.
    :return: A dict keyed by field name.
    :raises IndexError: If the row has fewer than ten columns.
    :raises ValueError: If the date column does not match '%Y-%m-%d'.
    """
    columns = row.split('\t')
    item = {
        'court_code':    columns[0],
        'docket_number': columns[1],
        'case_name':     columns[2],
        'url':           columns[3],
        'size':          columns[4],
        'counsel':       columns[5],
        'issues':        columns[6],
        'judges':        columns[7],
        'date_argued':   datetime.strptime(columns[8], '%Y-%m-%d').date(),
        'orig_url':      columns[9],
    }

    # Iterate over a snapshot because values are reassigned inside the loop.
    for key, value in list(item.items()):
        if key == 'url':
            item[key] = value.strip()
        elif isinstance(value, basestring):
            cleaned = clean_string(value)
            if key in ('case_name', 'docket_number'):
                # Harmonize the cleaned text; harmonizing the raw value would
                # throw away the clean_string result.
                cleaned = harmonize(cleaned)
            item[key] = cleaned
    return item
Пример #10
0
def make_line_to_dict(row):
    """Build a dict of cleaned fields from one tab-separated line.

    String fields other than the URL go through clean_string; the case name
    and docket number are then harmonized as well. The URL is only stripped
    of surrounding whitespace, and the argument date is parsed into a date.

    :param row: A line with at least ten tab-separated columns, the ninth
        being a date formatted as YYYY-MM-DD.
    :return: A dict keyed by field name.
    :raises IndexError: If the row has fewer than ten columns.
    :raises ValueError: If the date column does not match '%Y-%m-%d'.
    """
    columns = row.split('\t')
    item = {
        'court_code': columns[0],
        'docket_number': columns[1],
        'case_name': columns[2],
        'url': columns[3],
        'size': columns[4],
        'counsel': columns[5],
        'issues': columns[6],
        'judges': columns[7],
        'date_argued': datetime.strptime(columns[8], '%Y-%m-%d').date(),
        'orig_url': columns[9],
    }

    # Walk a copy of the keys since the values are replaced along the way.
    for key in list(item):
        value = item[key]
        if key == 'url':
            item[key] = value.strip()
            continue
        if not isinstance(value, basestring):
            # Non-string values (the parsed date) are left as they are.
            continue
        value = clean_string(value)
        if key in ('case_name', 'docket_number'):
            # Harmonize the already-cleaned text so that the cleaning step
            # is not discarded.
            value = harmonize(value)
        item[key] = value
    return item
Пример #11
0
    def _get_case_name_and_status(self):
        """Derive a cleaned case name and a status from the link's title.

        The title of self.url_element is lowercased and, if it mentions one
        of the circuits listed below, the matching non-precedential notice is
        stripped and the status becomes 'Unpublished'; otherwise the status
        is 'Published'. The remaining text is cleaned, harmonized and
        title-cased. If nothing usable remains, a saved fix is looked up by
        self.sha1_hash; failing that, the user is prompted for a name, which
        is appended to self.case_name_fix_file.

        :return: A (case_name, status) tuple.
        """
        case_name = self.url_element.get('title').lower()

        # (keyword, notice patterns) pairs, checked in order. Only the first
        # keyword found in the title is acted upon.
        notices = (
            ('first circuit', (
                r'(unpublished disposition )?notice: first circuit local rule 36.2\(b\)6 states unpublished opinions may be cited only in related cases.?',
            )),
            ('second circuit', (
                r'(unpublished disposition )?notice: second circuit local rule 0.23 states unreported opinions shall not be cited or otherwise used in unrelated cases.?',
                r'(unpublished disposition )?notice: this summary order may not be cited as precedential authority, but may be called to the attention of the court in a subsequent stage of this case, in a related case, or in any case for purposes of collateral estoppel or res judicata. see second circuit rule 0.23.?',
            )),
            ('third circuit', (
                r'(unpublished disposition )?notice: third circuit rule 21\(i\) states citations to federal decisions which have not been formally reported should identify the court, docket number and date.?',
            )),
            ('fourth circuit', (
                r'(unpublished disposition )?notice: fourth circuit (local rule 36\(c\)|i.o.p. 36.6) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the fourth circuit.?',
            )),
            ('fifth circuit', (
                r'(unpublished disposition )?notice: fifth circuit local rule 47.5.3 states that unpublished opinions should normally be cited only when they establish the law of the case, are relied upon as a basis for res judicata or collateral estoppel, or involve related facts. if an unpublished opinion is cited, a copy shall be attached to each copy of the brief.?',
            )),
            ('sixth circuit', (
                r'(unpublished disposition )?notice: sixth circuit rule 24\(c\) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the sixth circuit.?',
            )),
            ('seventh circuit', (
                r'(unpublished disposition )?notice: seventh circuit rule 53\(b\)\(2\) states unpublished orders shall not be cited or used as precedent except to support a claim of res judicata, collateral estoppel or law of the case in any federal court within the circuit.?',
            )),
            ('eighth circuit', (
                r'(unpublished disposition )?notice: eighth circuit rule 28a\(k\) governs citation of unpublished opinions and provides that (no party may cite an opinion not intended for publication unless the cases are related by identity between the parties or the causes of action|they are not precedent and generally should not be cited unless relevant to establishing the doctrines of res judicata, collateral estoppel, the law of the case, or if the opinion has persuasive value on a material issue and no published opinion would serve as well).?',
            )),
            ('ninth circuit', (
                r'(unpublished disposition )?notice: ninth circuit rule 36-3 provides that dispositions other than opinions or orders designated for publication are not precedential and should not be cited except when relevant under the doctrines of law of the case, res judicata, or collateral estoppel.?',
            )),
            ('tenth circuit', (
                r'(unpublished disposition )?notice: tenth circuit rule 36.3 states that unpublished opinions and orders and judgments have no precedential value and shall not be cited except for purposes of establishing the doctrines of the law of the case, res judicata, or collateral estoppel.?',
            )),
            ('d.c. circuit', (
                r'(unpublished disposition )?notice: d.c. circuit local rule 11\(c\) states that unpublished orders, judgments, and explanatory memoranda may not be cited as precedents, but counsel may refer to unpublished dispositions when the binding or preclusive effect of the disposition, rather than its quality as precedent, is relevant.?',
            )),
            ('federal circuit', (
                r'(unpublished disposition )?notice: federal circuit local rule 47.(6|8)\(b\) states that opinions and orders which are designated as not citable as precedent shall not be employed or cited as precedent. this does not preclude assertion of issues of claim preclusion, issue preclusion, judicial estoppel, law of the case or the like based on a decision of the court rendered in a nonprecedential opinion or order.?',
            )),
        )

        # Clean off special cases
        status = 'Published'
        for keyword, patterns in notices:
            if keyword in case_name:
                for pattern in patterns:
                    case_name = re.sub(pattern, '', case_name)
                # Mentioning the circuit is enough to mark the item as
                # unpublished, whether or not a notice pattern matched.
                status = 'Unpublished'
                break

        case_name = titlecase(harmonize(clean_string(case_name)))

        # The text has just been title-cased, so the leftover marker has to
        # be compared case-insensitively or it would never be detected.
        if not case_name or case_name.lower() == 'unpublished disposition':
            # No luck getting the case name
            saved_case_name = self._check_fix_list(self.sha1_hash, self.case_name_dict)
            if saved_case_name:
                case_name = saved_case_name
            else:
                print(self.url)
                if BROWSER:
                    subprocess.Popen([BROWSER, self.url], shell=False).communicate()
                case_name = raw_input("Short case name: ")
                self.case_name_fix_file.write("%s|%s\n" % (self.sha1_hash, case_name))

        return case_name, status
Пример #12
0
def get_clean_case_name_and_sniff_status(s):
    """Strip warnings re non-precedential status that occur in case names.

    The input is lowercased. For each known notice whose circuit keyword
    appears in the text and whose pattern matches at the start of the text,
    the notice is removed and the status is set to 'Unpublished'. If no
    notice is found the status stays 'Published'. The remaining text is then
    cleaned, harmonized and title-cased.

    :param s: The raw case name.
    :return: A (case_name, status) tuple, both strings.
    """
    s = s.lower()
    # (keyword, pattern) pairs. Patterns are raw strings so that the escaped
    # parentheses are not treated as (invalid) string escape sequences.
    regexes = (
        ('first circuit',
         r'(unpublished disposition )?notice: first circuit local rule 36.2'
         r'\(b\)6 states unpublished opinions may be cited only in related '
         r'cases.?'),
        ('second circuit',
         r'(unpublished disposition )?notice: second circuit local rule '
         r'0.23 states unreported opinions shall not be cited or otherwise '
         r'used in unrelated cases.?'),
        ('second circuit',
         r'(unpublished disposition )?notice: this summary order may not '
         r'be cited as precedential authority, but may be called to the '
         r'attention of the court in a subsequent stage of this case, in a '
         r'related case, or in any case for purposes of collateral '
         r'estoppel or res judicata. see second circuit rule 0.23.?'),
        ('third circuit',
         r'(unpublished disposition )?notice: third circuit rule 21\(i\) '
         r'states citations to federal decisions which have not been '
         r'formally reported should identify the court, docket number and '
         r'date.?'),
        ('fourth circuit',
         r'(unpublished disposition )?notice: fourth circuit (local rule '
         r'36\(c\)|i.o.p. 36.6) states that citation of unpublished '
         r'dispositions is disfavored except for establishing res '
         r'judicata, estoppel, or the law of the case and requires service '
         r'of copies of cited unpublished dispositions of the fourth '
         r'circuit.?'),
        ('fifth circuit',
         r'(unpublished disposition )?notice: fifth circuit local rule '
         r'47.5.3 states that unpublished opinions should normally be '
         r'cited only when they establish the law of the case, are relied '
         r'upon as a basis for res judicata or collateral estoppel, or '
         r'involve related facts. if an unpublished opinion is cited, a '
         r'copy shall be attached to each copy of the brief.?'),
        ('sixth circuit',
         r'(unpublished disposition )?notice: sixth circuit rule 24\(c\) '
         r'states that citation of unpublished dispositions is disfavored '
         r'except for establishing res judicata, estoppel, or the law of '
         r'the case and requires service of copies of cited unpublished '
         r'dispositions of the sixth circuit.?'),
        ('seventh circuit',
         r'(unpublished disposition )?notice: seventh circuit rule '
         r'53\(b\)\(2\) states unpublished orders shall not be cited or '
         r'used as precedent except to support a claim of res judicata, '
         r'collateral estoppel or law of the case in any federal court '
         r'within the circuit.?'),
        ('eighth circuit',
         r'(unpublished disposition )?notice: eighth circuit rule 28a\(k\) '
         r'governs citation of unpublished opinions and provides that (no '
         r'party may cite an opinion not intended for publication unless '
         r'the cases are related by identity between the parties or the '
         r'causes of action|they are not precedent and generally should not '
         r'be cited unless relevant to establishing the doctrines of res '
         r'judicata, collateral estoppel, the law of the case, or if the '
         r'opinion has persuasive value on a material issue and no '
         r'published opinion would serve as well).?'),
        ('ninth circuit',
         r'(unpublished disposition )?notice: ninth circuit rule 36-3 '
         r'provides that dispositions other than opinions or orders '
         r'designated for publication are not precedential and should not '
         r'be cited except when relevant under the doctrines of law of the '
         r'case, res judicata, or collateral estoppel.?'),
        ('tenth circuit',
         r'(unpublished disposition )?notice: tenth circuit rule 36.3 '
         r'states that unpublished opinions and orders and judgments have '
         r'no precedential value and shall not be cited except for '
         r'purposes of establishing the doctrines of the law of the case, '
         r'res judicata, or collateral estoppel.?'),
        ('d.c. circuit',
         r'(unpublished disposition )?notice: d.c. circuit local rule '
         r'11\(c\) states that unpublished orders, judgments, and '
         r'explanatory memoranda may not be cited as precedents, but '
         r'counsel may refer to unpublished dispositions when the binding '
         r'or preclusive effect of the disposition, rather than its '
         r'quality as precedent, is relevant.?'),
        ('federal circuit',
         r'(unpublished disposition )?notice: federal circuit local rule '
         r'47.(6|8)\(b\) states that opinions and orders which are '
         r'designated as not citable as precedent shall not be employed or '
         r'cited as precedent. this does not preclude assertion of issues '
         r'of claim preclusion, issue preclusion, judicial estoppel, law '
         r'of the case or the like based on a decision of the court '
         r'rendered in a nonprecedential opinion or order.?'),
    )
    status = 'Published'
    for test, regex in regexes:
        # The cheap substring test comes first; the notice only counts when
        # the pattern matches at the beginning of the text.
        if test in s and re.match(regex, s):
            s = re.sub(regex, '', s)
            status = 'Unpublished'

    s = titlecase(harmonize(clean_string(s)))
    return s, status
Пример #13
0
def process_free_opinion_result(self, row_pk, cnt):
    """Convert one free opinion report row into docket, entry and document.

    :param self: The bound task; ``self.request`` and ``self.retry`` are used
    (presumably a Celery task -- confirm against the decorator).
    :param row_pk: Primary key of the PACERFreeDocumentRow to process.
    :param cnt: Object providing ``make_case_name_short``.
    :return: None when processing stops here, otherwise a dict with the keys
    ``result``, ``rd_pk`` and ``pacer_court_id`` for the next step.
    """
    result = PACERFreeDocumentRow.objects.get(pk=row_pk)
    result.court = Court.objects.get(pk=map_pacer_to_cl_id(result.court_id))
    result.case_name = harmonize(result.case_name)
    result.case_name_short = cnt.make_case_name_short(result.case_name)

    def store_error(message):
        # Persist the failure reason on the report row itself.
        result.error_msg = message
        result.save()

    # The docket lookup works on a trimmed copy of the row. Each attribute
    # below would otherwise leak into the docket:
    #  - date_filed: the doc's date_filed would become the docket's. Bad.
    #  - court_id: it holds the PACER court id and makes the lookup crash.
    #  - id: the row's id would try to smash that of the docket.
    docket_source = copy.copy(result)
    for unwanted in ('date_filed', 'court_id', 'id'):
        delattr(docket_source, unwanted)

    try:
        with transaction.atomic():
            docket = lookup_and_save(docket_source)
            if not docket:
                msg = "Unable to create docket for %s" % result
                logger.error(msg)
                store_error(msg)
                self.request.callbacks = None
                return
            docket.blocked, docket.date_blocked = get_blocked_status(docket)
            docket.save()

            entry_defaults = {
                'date_filed': result.date_filed,
                'description': result.description,
            }
            docket_entry, _ = DocketEntry.objects.update_or_create(
                docket=docket,
                entry_number=result.document_number,
                defaults=entry_defaults,
            )
            document_defaults = {
                'pacer_doc_id': result.pacer_doc_id,
                'document_type': RECAPDocument.PACER_DOCUMENT,
                'is_free_on_pacer': True,
            }
            recap_doc, doc_is_new = RECAPDocument.objects.update_or_create(
                docket_entry=docket_entry,
                document_number=result.document_number,
                attachment_number=None,
                defaults=document_defaults,
            )
    except IntegrityError as e:
        msg = "Raised IntegrityError: %s" % e
        logger.error(msg)
        if self.request.retries != self.max_retries:
            raise self.retry(exc=e)
        # Out of retries: record the failure and give up on this row.
        store_error(msg)
        return
    except DatabaseError as e:
        msg = "Unable to complete database transaction:\n%s" % e
        logger.error(msg)
        store_error(msg)
        self.request.callbacks = None
        return

    if recap_doc.is_available and not doc_is_new:
        # We already had this document and its content is available, so all
        # that is left is to flag it as free and drop the report row.
        recap_doc.is_free_on_pacer = True
        recap_doc.save()
        result.delete()
        self.request.callbacks = None
        return

    return {
        'result': result,
        'rd_pk': recap_doc.pk,
        'pacer_court_id': result.court_id
    }
Пример #14
0
    def _get_case_name_and_status(self):
        """Extract the case name and publication status from the link title.

        The lower-cased ``title`` attribute of ``self.url_element`` is
        searched for a circuit name. The first circuit found (in the order
        of the table below) has its boilerplate "notice" text stripped and
        the status is set to 'Unpublished'; otherwise the status is
        'Published'. The remaining text is cleaned, harmonized and
        titlecased.

        If nothing useful is left, the name is looked up in the fix list
        and, failing that, requested interactively and appended to
        ``self.case_name_fix_file``.

        :return: A ``(case_name, status)`` tuple.
        """
        case_name = self.url_element.get('title').lower()

        # (circuit marker, notice patterns), checked in order; the first
        # marker found in the title wins. Raw strings keep the escaped
        # parentheses intact.
        notices = (
            ('first circuit', (
                r'(unpublished disposition )?notice: first circuit local rule 36.2\(b\)6 states unpublished opinions may be cited only in related cases.?',
            )),
            ('second circuit', (
                r'(unpublished disposition )?notice: second circuit local rule 0.23 states unreported opinions shall not be cited or otherwise used in unrelated cases.?',
                r'(unpublished disposition )?notice: this summary order may not be cited as precedential authority, but may be called to the attention of the court in a subsequent stage of this case, in a related case, or in any case for purposes of collateral estoppel or res judicata. see second circuit rule 0.23.?',
            )),
            ('third circuit', (
                r'(unpublished disposition )?notice: third circuit rule 21\(i\) states citations to federal decisions which have not been formally reported should identify the court, docket number and date.?',
            )),
            ('fourth circuit', (
                r'(unpublished disposition )?notice: fourth circuit (local rule 36\(c\)|i.o.p. 36.6) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the fourth circuit.?',
            )),
            ('fifth circuit', (
                r'(unpublished disposition )?notice: fifth circuit local rule 47.5.3 states that unpublished opinions should normally be cited only when they establish the law of the case, are relied upon as a basis for res judicata or collateral estoppel, or involve related facts. if an unpublished opinion is cited, a copy shall be attached to each copy of the brief.?',
            )),
            ('sixth circuit', (
                r'(unpublished disposition )?notice: sixth circuit rule 24\(c\) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the sixth circuit.?',
            )),
            ('seventh circuit', (
                r'(unpublished disposition )?notice: seventh circuit rule 53\(b\)\(2\) states unpublished orders shall not be cited or used as precedent except to support a claim of res judicata, collateral estoppel or law of the case in any federal court within the circuit.?',
            )),
            ('eighth circuit', (
                r'(unpublished disposition )?notice: eighth circuit rule 28a\(k\) governs citation of unpublished opinions and provides that (no party may cite an opinion not intended for publication unless the cases are related by identity between the parties or the causes of action|they are not precedent and generally should not be cited unless relevant to establishing the doctrines of res judicata, collateral estoppel, the law of the case, or if the opinion has persuasive value on a material issue and no published opinion would serve as well).?',
            )),
            ('ninth circuit', (
                r'(unpublished disposition )?notice: ninth circuit rule 36-3 provides that dispositions other than opinions or orders designated for publication are not precedential and should not be cited except when relevant under the doctrines of law of the case, res judicata, or collateral estoppel.?',
            )),
            ('tenth circuit', (
                r'(unpublished disposition )?notice: tenth circuit rule 36.3 states that unpublished opinions and orders and judgments have no precedential value and shall not be cited except for purposes of establishing the doctrines of the law of the case, res judicata, or collateral estoppel.?',
            )),
            ('d.c. circuit', (
                r'(unpublished disposition )?notice: d.c. circuit local rule 11\(c\) states that unpublished orders, judgments, and explanatory memoranda may not be cited as precedents, but counsel may refer to unpublished dispositions when the binding or preclusive effect of the disposition, rather than its quality as precedent, is relevant.?',
            )),
            ('federal circuit', (
                r'(unpublished disposition )?notice: federal circuit local rule 47.(6|8)\(b\) states that opinions and orders which are designated as not citable as precedent shall not be employed or cited as precedent. this does not preclude assertion of issues of claim preclusion, issue preclusion, judicial estoppel, law of the case or the like based on a decision of the court rendered in a nonprecedential opinion or order.?',
            )),
        )

        # Clean off special cases
        status = 'Published'
        for marker, patterns in notices:
            if marker in case_name:
                for pattern in patterns:
                    case_name = re.sub(pattern, '', case_name)
                # Mentioning the circuit is enough to count as unpublished,
                # whether or not the notice text itself was matched.
                status = 'Unpublished'
                break

        case_name = titlecase(harmonize(clean_string(case_name)))

        # The name has been titlecased by now, so the leftover marker must be
        # compared case-insensitively; a lowercase literal is not expected to
        # equal the titlecased text.
        if case_name == '' or case_name.lower() == 'unpublished disposition':
            # No luck getting the case name
            saved_case_name = self._check_fix_list(self.sha1_hash,
                                                   self.case_name_dict)
            if saved_case_name:
                case_name = saved_case_name
            else:
                # Show the operator where the document lives, then ask for
                # the name and remember the answer for future runs.
                print(self.url)
                if BROWSER:
                    subprocess.Popen([BROWSER, self.url],
                                     shell=False).communicate()
                case_name = raw_input("Short case name: ")
                self.case_name_fix_file.write("%s|%s\n" %
                                              (self.sha1_hash, case_name))

        return case_name, status
    def do_second_pass(options):
        """Retry the IDB rows that are still not linked to a docket.

        Rows left over from the first pass are matched again using stricter
        docket filters. When several dockets still qualify, the one whose
        case name is closest to the IDB row's is merged; when none is close
        enough, a new docket is created.

        :param options: Mapping providing 'offset' (rows to skip) and 'limit'
        (index at which to stop; only applied when greater than zero).
        """
        def comparable_name(docket):
            # Build the string used for fuzzy matching. Two-party names are
            # lower-cased and each side is cut to 30 characters; any other
            # shape is used as harmonized.
            name = harmonize(docket.case_name)
            sides = name.lower().split(' v. ')
            if len(sides) != 2:
                return name
            return '%s v. %s' % (sides[0][0:30], sides[1][0:30])

        unmatched_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')
        for i, idb_row in enumerate(queryset_generator(unmatched_rows)):
            # Walk every remaining IDB item and look for it in the Docket
            # table, creating a new item when it cannot be found.
            if i < options['offset']:
                continue
            if options['limit'] > 0 and i >= options['limit']:
                break

            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
                docket_number__startswith='%s:' % idb_row.office
            )
            # Each exclusion is applied on its own, one after the other.
            for unwanted in ({'docket_number__icontains': 'cr'},
                             {'case_name__icontains': "sealed"},
                             {'case_name__icontains': 'suppressed'},
                             {'case_name__icontains': 'search warrant'}):
                ds = ds.exclude(**unwanted)
            count = ds.count()

            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue

            if count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            i, d, idb_row)
                merge_docket_with_idb(d.pk, idb_row.pk)
                continue

            logger.info("%s: Still have %s results after office and civil "
                        "docket number filtering. Filtering further.",
                        i, count)

            case_names = [comparable_name(d) for d in ds]
            idb_case_name = harmonize('%s v. %s' % (idb_row.plaintiff,
                                                    idb_row.defendant))
            results = find_best_match(case_names, idb_case_name,
                                      case_sensitive=False)

            if not results['ratio'] > 0.65:
                logger.info("%s No good match after office and case name "
                            "filtering. Creating new item: %s", i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue

            logger.info("%s Found good match by case name for %s: %s",
                        i, idb_case_name, results['match_str'])
            d = ds[results['match_index']]
            merge_docket_with_idb(d.pk, idb_row.pk)
 def test_harmonize_and_clean_string_tests(self):
     """Check harmonize(clean_string(x)) against a table of known results.

     Each entry is an (input, expected output) pair.
     """
     expectations = (
         # "et al" in all of its spellings is dropped
         ("Lissner, et. al.", u"Lissner"),
         ("Lissner, et. al", u"Lissner"),
         ("Lissner, et al.", u"Lissner"),
         ("Lissner, et al", u"Lissner"),
         ("Lissner et. al.", u"Lissner"),
         ("Lissner et. al", u"Lissner"),
         ("Lissner et al.", u"Lissner"),
         ("Lissner et al", u"Lissner"),
         # US --> United States, party roles removed
         ("US v. Lissner, Plaintiff", u"United States v. Lissner"),
         (
             "US v. Lissner, Petitioner-appellant",
             u"United States v. Lissner",
         ),
         (
             "United States, Petitioner, v. Lissner",
             u"United States v. Lissner",
         ),
         (
             "United States of America, Plaintiff-Appellee, v. Orlando B. "
             "Pino, Defendant-Appellant, Joseph",
             u"United States v. Orlando B. Pino, Joseph",
         ),
         ("Herring v. U.S. **", u"Herring v. United States"),
         ("Test v. U.S", u"Test v. United States"),
         ("The United States v. Lissner", u"United States v. Lissner"),
         # A titlecased word containing US must still get harmonized.
         ("Carver v. US", u"Carver v. United States"),
         # US Steel stays US Steel
         ("US Steel v.  US", u"US Steel v. United States"),
         ("US v. V.Vivack", u"United States v. V.Vivack"),
         ("US vs. Lissner", u"United States v. Lissner"),
         (
             "[email protected] vs. USA",
             u"[email protected] v. United States",
         ),
         ("US v. US", u"United States v. United States"),
         ("US  Steel v.  US", u"US Steel v. United States"),
         ("U.S.A. v. Mr. v.", u"United States v. Mr. v."),
         ("U.S.S. v. Lissner", u"U.S.S. v. Lissner"),
         ("USC v. Lissner", u"USC v. Lissner"),
         ("U.S.C. v. Lissner", u"U.S.C. v. Lissner"),
         ("U.S. Steel v. Colgate", u"U.S. Steel v. Colgate"),
         ("U.S.A. v. Lissner", u"United States v. Lissner"),
         ("U.S. v. Lissner", u"United States v. Lissner"),
         ("U. S. v. Lissner", u"United States v. Lissner"),
         ("United States v. Lissner", u"United States v. Lissner"),
         ("Usa v. Lissner", u"United States v. Lissner"),
         ("USA v. Lissner", u"United States v. Lissner"),
         (
             "United States of America v. Lissner",
             u"United States v. Lissner",
         ),
         (
             "Lissner v. United States of America",
             u"Lissner v. United States",
         ),
         # "v" without its period
         ("USA v White", u"United States v. White"),
         # "vs" without its period
         ("USA vs White", u"United States v. White"),
         (
             "V.Vivack and Associates v. US",
             u"V.Vivack and Associates v. United States",
         ),
         (
             "v.v. Hendricks & Sons v. James v. Smith",
             u"v.v. Hendricks & Sons v. James v. Smith",
         ),
         # "The State" loses its article
         ("Aimee v. The State", u"Aimee v. State"),
         # Pet (short for petitioners) is removed
         ("Commonwealth v. Mickle, V., Pet.", u"Commonwealth v. Mickle v."),
         # ... but an ordinary word "Pet" is left alone
         ("Pet Doctors inc. v. Spoon", u"Pet Doctors inc. v. Spoon"),
         # "No." and "Nos." prefixes are removed, a plain "No" is kept
         ("No. 23423", u"23423"),
         ("Nos. 23 and 232", u"23 and 232"),
         ("No Expletives Inc.", u"No Expletives Inc."),
         # "Nothing" must survive the "No." removal.
         ("No. 232 Nothing 232", "232 Nothing 232"),
         # Garbage: a leading slash
         ("/USA vs White", u"United States v. White"),
         # Unicode input
         ("12–1438-cr", u"12–1438-cr"),
         # Assorted strings that must pass through untouched
         ("clarinet alibi", u"clarinet alibi"),
         ("papusa", u"papusa"),
         ("CUSANO", u"CUSANO"),
         # Filter out invalid XML characters
         (
             u"Special Counsel ex rel. Karla Saunders",
             u"Special Counsel ex rel. Karla Saunders",
         ),
     )
     for raw_name, expected_name in expectations:
         self.assertEqual(harmonize(clean_string(raw_name)), expected_name)
Пример #17
0
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    For every file that is not yet in the system, a Docket, an OpinionCluster,
    a Citation and one Opinion per <opinion> tag are created inside a single
    transaction.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    :raises OperationalError: If saving the docket fails for any reason other
    than the docket number exceeding the maximum field size.
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s", ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s", ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s", e, ia_download_url)
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s.", data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Collect the judge names found in <judges> and <author> tags.
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # The nested atomic block lets a failed save be rolled back
                # on its own, so the outer transaction stays usable for the
                # retry below.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" not in str(e):
                    # Not the oversized docket number case. Swallowing it
                    # would leave the docket unsaved while its id is used
                    # below, so surface the real error instead.
                    raise
                # Docket number too long: store a truncated version and keep
                # the full value in the correction field.
                docket.docket_number = (
                    "%s, See Corrections for full Docket Number"
                    % trunc(docket_string, length=5000, ellipsis="...")
                )
                docket.save()
                long_data["correction"] = "%s <br> %s" % (
                    data["docket_number"],
                    long_data["correction"],
                )
            # validate_dt returns the filing date plus a flag telling whether
            # it is approximate (e.g. a partial YYYY-MM date).
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                auth = op.find("author")
                if auth is not None:
                    # Strip page-number tags out of the author tag before
                    # reading it. It is particularly useful for identifying
                    # Per Curiam.
                    for page_number in auth.find_all("page-number"):
                        page_number.extract()
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                opinion = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                opinion.save(index=False)
                new_op_pks.append(opinion.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Пример #18
0
 def test_harmonize_and_clean_string_tests(self):
     """Check harmonize(clean_string(x)) against a table of known results.

     Each entry is an (input, expected output) pair.
     """
     expectations = (
         ('U.S.A. v. Lissner', u'United States v. Lissner'),
         ('U.S. v. Lissner', u'United States v. Lissner'),
         ('U. S. v. Lissner', u'United States v. Lissner'),
         ('United States v. Lissner', u'United States v. Lissner'),
         ('Usa v. Lissner', u'United States v. Lissner'),
         ('USA v. Lissner', u'United States v. Lissner'),
         (
             'United States of America v. Lissner',
             u'United States v. Lissner',
         ),
         (
             'Lissner v. United States of America',
             u'Lissner v. United States',
         ),
         (
             'V.Vivack and Associates v. US',
             u'V.Vivack and Associates v. United States',
         ),
         (
             'v.v. Hendricks & Sons v. James v. Smith',
             u'v.v. Hendricks & Sons v. James v. Smith',
         ),
         ('U.S.A. v. Mr. v.', u'United States v. Mr. v.'),
         ('U.S.S. v. Lissner', u'U.S.S. v. Lissner'),
         ('USC v. Lissner', u'USC v. Lissner'),
         ('U.S.C. v. Lissner', u'U.S.C. v. Lissner'),
         ('U.S. Steel v. Colgate', u'U.S. Steel v. Colgate'),
         ('papusa', u'papusa'),
         ('CUSANO', u'CUSANO'),
         ('US Steel v.  US', u'US Steel v. United States'),
         ('US v. V.Vivack', u'United States v. V.Vivack'),
         ('US vs. Lissner', u'United States v. Lissner'),
         (
             '[email protected] vs. USA',
             u'[email protected] v. United States',
         ),
         ('US v. US', u'United States v. United States'),
         ('US  Steel v.  US', u'US Steel v. United States'),
         ('Lissner, et. al.', u'Lissner'),
         ('Lissner, et. al', u'Lissner'),
         ('Lissner, et al.', u'Lissner'),
         ('Lissner, et al', u'Lissner'),
         ('Lissner et. al.', u'Lissner'),
         ('Lissner et. al', u'Lissner'),
         ('Lissner et al.', u'Lissner'),
         ('Lissner et al', u'Lissner'),
         ('clarinet alibi', u'clarinet alibi'),
         ('US v. Lissner, Plaintiff', u'United States v. Lissner'),
         (
             'US v. Lissner, Petitioner-appellant',
             u'United States v. Lissner',
         ),
         (
             'United States, Petitioner, v. Lissner',
             u'United States v. Lissner',
         ),
         (
             'United States of America, Plaintiff-Appellee, v. Orlando B. '
             'Pino, Defendant-Appellant, Joseph',
             u'United States v. Orlando B. Pino, Joseph',
         ),
         ('Herring v. U.S. **', u'Herring v. United States'),
         ('Test v. U.S', u'Test v. United States'),
         ('The United States v. Lissner', u'United States v. Lissner'),
         # "v" without its period
         ('USA v White', u'United States v. White'),
         # "vs" without its period
         ('USA vs White', u'United States v. White'),
         # A leading slash
         ('/USA vs White', u'United States v. White'),
         # Unicode input
         ('12–1438-cr', u'12–1438-cr'),
         # A titlecased word containing US must still get harmonized.
         ('Carver v. US', u'Carver v. United States'),
         # "The State" loses its article
         ('Aimee v. The State', u'Aimee v. State'),
         # Pet (short for petitioners) is removed
         ('Commonwealth v. Mickle, V., Pet.', u'Commonwealth v. Mickle v.'),
         # ... but an ordinary word "Pet" is left alone
         ('Pet Doctors inc. v. Spoon', u'Pet Doctors inc. v. Spoon'),
     )
     for raw_name, expected_name in expectations:
         self.assertEqual(harmonize(clean_string(raw_name)), expected_name)
Пример #19
0
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    For every JSON file returned by ``filepath_list`` that is not already
    referenced by an ``OpinionCluster.filepath_json_harvard``, one Docket, one
    OpinionCluster, one Citation and one Opinion per ``<opinion>`` element are
    created inside a single transaction.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    # Tags in the Harvard casebody whose text is stored verbatim (joined with
    # "|") on the cluster for further processing at a later date.
    casebody_fields = (
        "attorneys",
        "disposition",
        "syllabus",
        "summary",
        "history",
        "otherdate",
        "seealso",
        "headnotes",
        "correction",
    )

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s", ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s", ia_download_url)
            continue
        except Exception as e:
            # Best effort: log and move on to the next file.
            logger.warning("Unknown error %s for: %s", e, ia_download_url)
            continue

        first_cite = data["citations"][0]["cite"]
        cites = get_citations(first_cite, html=False)
        if not cites:
            logger.info("No citation found for %s.", first_cite)
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # NOTE(review): an earlier comment here said images in the HTML are
        # flagged with an '[[Image]]' placeholder; nothing in this function
        # does that.
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)

        # These strings are identical for every opinion in the file, so build
        # them once instead of once per <opinion> element.
        joined_by_str = titlecase(
            " ".join(list(set(itertools.chain.from_iterable(judge_list))))
        )
        author_str = titlecase(
            " ".join(list(set(itertools.chain.from_iterable(author_list))))
        )

        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )

            # Text of every matching element, "|"-separated, keyed by tag.
            data_set = {
                key: "|".join([x.text for x in soup.find_all(key)])
                for key in casebody_fields
            }

            # validate_dt yields the filing date plus a flag saying whether it
            # is approximate (presumably set for partial YYYY-MM dates that
            # need a day filled in -- confirm against validate_dt).
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
Пример #20
0
 def test_harmonize_and_clean_string_tests(self):
     """Check harmonize(clean_string(raw)) against the expected case name.

     Each entry is a (raw input, expected output) tuple.
     """
     test_pairs = (
         # Variants of "United States" that must be normalized.
         ('U.S.A. v. Lissner', u'United States v. Lissner'),
         ('U.S. v. Lissner', u'United States v. Lissner'),
         ('U. S. v. Lissner', u'United States v. Lissner'),
         ('United States v. Lissner', u'United States v. Lissner'),
         ('Usa v. Lissner', u'United States v. Lissner'),
         ('USA v. Lissner', u'United States v. Lissner'),
         ('United States of America v. Lissner', u'United States v. Lissner'),
         ('Lissner v. United States of America', u'Lissner v. United States'),
         ('V.Vivack and Associates v. US',
          u'V.Vivack and Associates v. United States'),
         ('v.v. Hendricks & Sons v. James v. Smith',
          u'v.v. Hendricks & Sons v. James v. Smith'),
         ('U.S.A. v. Mr. v.', u'United States v. Mr. v.'),
         # Look-alikes that must be left untouched.
         ('U.S.S. v. Lissner', u'U.S.S. v. Lissner'),
         ('USC v. Lissner', u'USC v. Lissner'),
         ('U.S.C. v. Lissner', u'U.S.C. v. Lissner'),
         ('U.S. Steel v. Colgate', u'U.S. Steel v. Colgate'),
         ('papusa', u'papusa'),
         ('CUSANO', u'CUSANO'),
         ('US Steel v.  US', u'US Steel v. United States'),
         ('US v. V.Vivack', u'United States v. V.Vivack'),
         ('US vs. Lissner', u'United States v. Lissner'),
         ('[email protected] vs. USA', u'[email protected] v. United States'),
         ('US v. US', u'United States v. United States'),
         ('US  Steel v.  US', u'US Steel v. United States'),
         # "et al" in its various spellings.
         ('Lissner, et. al.', u'Lissner'),
         ('Lissner, et. al', u'Lissner'),
         ('Lissner, et al.', u'Lissner'),
         ('Lissner, et al', u'Lissner'),
         ('Lissner et. al.', u'Lissner'),
         ('Lissner et. al', u'Lissner'),
         ('Lissner et al.', u'Lissner'),
         ('Lissner et al', u'Lissner'),
         ('clarinet alibi', u'clarinet alibi'),
         # Party designations.
         ('US v. Lissner, Plaintiff', u'United States v. Lissner'),
         ('US v. Lissner, Petitioner-appellant', u'United States v. Lissner'),
         ('United States, Petitioner, v. Lissner',
          u'United States v. Lissner'),
         ('United States of America, Plaintiff-Appellee, v. Orlando B. Pino, Defendant-Appellant, Joseph',
          u'United States v. Orlando B. Pino, Joseph'),
         ('Herring v. U.S. **', u'Herring v. United States'),
         ('Test v. U.S', u'Test v. United States'),
         ('The United States v. Lissner', u'United States v. Lissner'),
         # No period in v.
         ('USA v White', u'United States v. White'),
         # No period in vs.
         ('USA vs White', u'United States v. White'),
         # Leading slash.
         ('/USA vs White', u'United States v. White'),
         # Unicode input.
         ('12–1438-cr', u'12–1438-cr'),
         # Tests the output from a titlecased word containing US to ensure
         # it gets harmonized.
         ('Carver v. US', u'Carver v. United States'),
         # Normalize "The State"
         ('Aimee v. The State', u'Aimee v. State'),
         # Nuke Pet (short for petitioners)
         ('Commonwealth v. Mickle, V., Pet.', u'Commonwealth v. Mickle v.'),
         # Unchanged, despite having the word Pet
         ('Pet Doctors inc. v. Spoon', u'Pet Doctors inc. v. Spoon'),
     )
     for raw, expected in test_pairs:
         self.assertEqual(harmonize(clean_string(raw)), expected)
Пример #21
0
def format_case_name(n):
    """Lowercase ``n``, harmonize it, and return the titlecased result."""
    lowered = n.lower()
    harmonized = harmonize(lowered)
    return titlecase(harmonized)
Пример #22
0
    def do_second_pass(options):
        """Retry the IDB rows that still have no docket attached.

        The first pass skipped rows that matched several dockets. Here each
        such row is matched against a narrower set of dockets and, when several
        remain, against their case names. A row is either merged into the best
        candidate or gets a brand new docket.

        ``options['offset']`` rows are skipped from the start, and a positive
        ``options['limit']`` stops the loop once that index is reached.
        """
        unlinked_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')
        for i, idb_row in enumerate(queryset_generator(unlinked_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            limit = options['limit']
            if limit > 0 and i >= limit:
                break

            # Same core docket number, court and office prefix, minus dockets
            # whose number contains 'cr' or whose name marks them as sealed,
            # suppressed or a search warrant.
            candidates = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
                docket_number__startswith='%s:' % idb_row.office,
            ).exclude(docket_number__icontains='cr')
            for unwanted in ("sealed", 'suppressed', 'search warrant'):
                candidates = candidates.exclude(case_name__icontains=unwanted)
            count = candidates.count()

            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue
            if count == 1:
                docket = candidates[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i,
                            docket, idb_row)
                merge_docket_with_idb(docket.pk, idb_row.pk)
                continue

            logger.info(
                "%s: Still have %s results after office and civil "
                "docket number filtering. Filtering further.", i, count)

            case_names = []
            for docket in candidates:
                name = harmonize(docket.case_name)
                sides = name.lower().split(' v. ')
                if len(sides) == 2:
                    # Exactly one " v. ": compare on the lowercased parties,
                    # each cut to 30 characters.
                    name = '%s v. %s' % (sides[0][0:30], sides[1][0:30])
                case_names.append(name)

            idb_case_name = harmonize(
                '%s v. %s' % (idb_row.plaintiff, idb_row.defendant))
            results = find_best_match(case_names,
                                      idb_case_name,
                                      case_sensitive=False)

            if results['ratio'] > 0.65:
                logger.info("%s Found good match by case name for %s: %s", i,
                            idb_case_name, results['match_str'])
                best = candidates[results['match_index']]
                merge_docket_with_idb(best.pk, idb_row.pk)
            else:
                logger.info(
                    "%s No good match after office and case name "
                    "filtering. Creating new item: %s", i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
Пример #23
0
 def test_harmonize_and_clean_string_tests(self):
     """Run clean_string then harmonize over a table of raw case names and
     compare each result with the expected normalized name."""
     test_pairs = (
         # Et al
         ("Lissner, et. al.", u"Lissner"),
         ("Lissner, et. al", u"Lissner"),
         ("Lissner, et al.", u"Lissner"),
         ("Lissner, et al", u"Lissner"),
         ("Lissner et. al.", u"Lissner"),
         ("Lissner et. al", u"Lissner"),
         ("Lissner et al.", u"Lissner"),
         ("Lissner et al", u"Lissner"),
         # US --> United States
         ("US v. Lissner, Plaintiff", u"United States v. Lissner"),
         ("US v. Lissner, Petitioner-appellant", u"United States v. Lissner"),
         ("United States, Petitioner, v. Lissner", u"United States v. Lissner"),
         (
             "United States of America, Plaintiff-Appellee, v. Orlando B. "
             "Pino, Defendant-Appellant, Joseph",
             u"United States v. Orlando B. Pino, Joseph",
         ),
         ("Herring v. U.S. **", u"Herring v. United States"),
         ("Test v. U.S", u"Test v. United States"),
         ("The United States v. Lissner", u"United States v. Lissner"),
         # Tests the output from a titlecased word containing
         # US to ensure it gets harmonized.
         ("Carver v. US", u"Carver v. United States"),
         # US Steel --> US Steel
         ("US Steel v.  US", u"US Steel v. United States"),
         ("US v. V.Vivack", u"United States v. V.Vivack"),
         ("US vs. Lissner", u"United States v. Lissner"),
         ("[email protected] vs. USA", u"[email protected] v. United States"),
         ("US v. US", u"United States v. United States"),
         ("US  Steel v.  US", u"US Steel v. United States"),
         ("U.S.A. v. Mr. v.", u"United States v. Mr. v."),
         ("U.S.S. v. Lissner", u"U.S.S. v. Lissner"),
         ("USC v. Lissner", u"USC v. Lissner"),
         ("U.S.C. v. Lissner", u"U.S.C. v. Lissner"),
         ("U.S. Steel v. Colgate", u"U.S. Steel v. Colgate"),
         ("U.S.A. v. Lissner", u"United States v. Lissner"),
         ("U.S. v. Lissner", u"United States v. Lissner"),
         ("U. S. v. Lissner", u"United States v. Lissner"),
         ("United States v. Lissner", u"United States v. Lissner"),
         ("Usa v. Lissner", u"United States v. Lissner"),
         ("USA v. Lissner", u"United States v. Lissner"),
         ("United States of America v. Lissner", u"United States v. Lissner"),
         ("Lissner v. United States of America", u"Lissner v. United States"),
         # No period in v.
         ("USA v White", u"United States v. White"),
         # No period in vs.
         ("USA vs White", u"United States v. White"),
         (
             "V.Vivack and Associates v. US",
             u"V.Vivack and Associates v. United States",
         ),
         (
             "v.v. Hendricks & Sons v. James v. Smith",
             u"v.v. Hendricks & Sons v. James v. Smith",
         ),
         # Normalize "The State"
         ("Aimee v. The State", u"Aimee v. State"),
         # Nuke Pet (short for petitioners)
         ("Commonwealth v. Mickle, V., Pet.", u"Commonwealth v. Mickle v."),
         # Unchanged, despite having the word Pet
         ("Pet Doctors inc. v. Spoon", u"Pet Doctors inc. v. Spoon"),
         # A leading "No." / "Nos." is dropped, but a leading word "No" is not
         ("No. 23423", u"23423"),
         ("Nos. 23 and 232", u"23 and 232"),
         ("No Expletives Inc.", u"No Expletives Inc."),
         # Tests that "Nothing" doesn't get nuked.
         ("No. 232 Nothing 232", "232 Nothing 232"),
         # Garbage: leading slash.
         ("/USA vs White", u"United States v. White"),
         # Unicode input
         ("12–1438-cr", u"12–1438-cr"),
         # Randoms
         ("clarinet alibi", u"clarinet alibi"),
         ("papusa", u"papusa"),
         ("CUSANO", u"CUSANO"),
         # Filter out invalid XML characters
         (
             u"Special Counsel ex rel. Karla Saunders",
             u"Special Counsel ex rel. Karla Saunders",
         ),
     )
     for raw, expected in test_pairs:
         cleaned = harmonize(clean_string(raw))
         self.assertEqual(cleaned, expected)
Пример #24
0
    def test_harmonize_and_clean_string_tests(self):
        """Feed raw case names through clean_string and harmonize and assert
        that each comes out as the expected normalized name."""
        test_pairs = (
            # Et al
            ('Lissner, et. al.', u'Lissner'),
            ('Lissner, et. al', u'Lissner'),
            ('Lissner, et al.', u'Lissner'),
            ('Lissner, et al', u'Lissner'),
            ('Lissner et. al.', u'Lissner'),
            ('Lissner et. al', u'Lissner'),
            ('Lissner et al.', u'Lissner'),
            ('Lissner et al', u'Lissner'),

            # US --> United States
            ('US v. Lissner, Plaintiff', u'United States v. Lissner'),
            ('US v. Lissner, Petitioner-appellant',
             u'United States v. Lissner'),
            ('United States, Petitioner, v. Lissner',
             u'United States v. Lissner'),
            ('United States of America, Plaintiff-Appellee, v. Orlando B. '
             'Pino, Defendant-Appellant, Joseph',
             u'United States v. Orlando B. Pino, Joseph'),
            ('Herring v. U.S. **', u'Herring v. United States'),
            ('Test v. U.S', u'Test v. United States'),
            ('The United States v. Lissner', u'United States v. Lissner'),
            # Tests the output from a titlecased word containing
            # US to ensure it gets harmonized.
            ('Carver v. US', u'Carver v. United States'),
            # US Steel --> US Steel
            ('US Steel v.  US', u'US Steel v. United States'),
            ('US v. V.Vivack', u'United States v. V.Vivack'),
            ('US vs. Lissner', u'United States v. Lissner'),
            ('[email protected] vs. USA', u'[email protected] v. United States'),
            ('US v. US', u'United States v. United States'),
            ('US  Steel v.  US', u'US Steel v. United States'),
            ('U.S.A. v. Mr. v.', u'United States v. Mr. v.'),
            ('U.S.S. v. Lissner', u'U.S.S. v. Lissner'),
            ('USC v. Lissner', u'USC v. Lissner'),
            ('U.S.C. v. Lissner', u'U.S.C. v. Lissner'),
            ('U.S. Steel v. Colgate', u'U.S. Steel v. Colgate'),
            ('U.S.A. v. Lissner', u'United States v. Lissner'),
            ('U.S. v. Lissner', u'United States v. Lissner'),
            ('U. S. v. Lissner', u'United States v. Lissner'),
            ('United States v. Lissner', u'United States v. Lissner'),
            ('Usa v. Lissner', u'United States v. Lissner'),
            ('USA v. Lissner', u'United States v. Lissner'),
            ('United States of America v. Lissner',
             u'United States v. Lissner'),
            ('Lissner v. United States of America',
             u'Lissner v. United States'),

            # No period in v.
            ('USA v White', u'United States v. White'),
            # No period in vs.
            ('USA vs White', u'United States v. White'),
            ('V.Vivack and Associates v. US',
             u'V.Vivack and Associates v. United States'),
            ('v.v. Hendricks & Sons v. James v. Smith',
             u'v.v. Hendricks & Sons v. James v. Smith'),

            # Normalize "The State"
            ('Aimee v. The State', u'Aimee v. State'),

            # Nuke Pet (short for petitioners)
            ('Commonwealth v. Mickle, V., Pet.',
             u'Commonwealth v. Mickle v.'),
            # Unchanged, despite having the word Pet
            ('Pet Doctors inc. v. Spoon', u'Pet Doctors inc. v. Spoon'),

            # A leading "No." / "Nos." is dropped, but a leading word "No"
            # is not
            ('No. 23423', u'23423'),
            ('Nos. 23 and 232', u'23 and 232'),
            ('No Expletives Inc.', u'No Expletives Inc.'),
            # Tests that "Nothing" doesn't get nuked.
            ('No. 232 Nothing 232', '232 Nothing 232'),

            # Garbage: leading slash.
            ('/USA vs White', u'United States v. White'),
            # Unicode input
            ('12–1438-cr', u'12–1438-cr'),

            # Randoms
            ('clarinet alibi', u'clarinet alibi'),
            ('papusa', u'papusa'),
            ('CUSANO', u'CUSANO'),

            # Filter out invalid XML characters
            (u'Special Counsel ex rel. Karla Saunders',
             u'Special Counsel ex rel. Karla Saunders'),
        )
        for raw_name, expected_name in test_pairs:
            self.assertEqual(harmonize(clean_string(raw_name)), expected_name)
Пример #25
0
def get_clean_case_name_and_sniff_status(s):
    """Strips out warnings re non-precedential status that occur in case
    names. If such a warning is discovered, we set the status flag to
    'Unpublished'; otherwise it stays 'Published'.

    The input is lowercased first. A notice is only removed when it sits at
    the very start of the lowercased string (``re.match``); once it matches
    there, every occurrence of it is removed.

    Returns a cleaned case name and the status of the item, both as
    strings.
    """
    s = s.lower()
    # (cheap substring pre-check, notice regex) pairs. The fragments are raw
    # strings so that "\(" and "\)" reach the regex engine unchanged instead
    # of being invalid string escapes.
    regexes = (
        ('first circuit',
         r'(unpublished disposition )?notice: first circuit local rule 36.2'
         r'\(b\)6 states unpublished opinions may be cited only in related '
         r'cases.?'),
        ('second circuit',
         r'(unpublished disposition )?notice: second circuit local rule '
         r'0.23 states unreported opinions shall not be cited or otherwise '
         r'used in unrelated cases.?'),
        ('second circuit',
         r'(unpublished disposition )?notice: this summary order may not '
         r'be cited as precedential authority, but may be called to the '
         r'attention of the court in a subsequent stage of this case, in a '
         r'related case, or in any case for purposes of collateral '
         r'estoppel or res judicata. see second circuit rule 0.23.?'),
        ('third circuit',
         r'(unpublished disposition )?notice: third circuit rule 21\(i\) '
         r'states citations to federal decisions which have not been '
         r'formally reported should identify the court, docket number and '
         r'date.?'),
        ('fourth circuit',
         r'(unpublished disposition )?notice: fourth circuit (local rule '
         r'36\(c\)|i.o.p. 36.6) states that citation of unpublished '
         r'dispositions is disfavored except for establishing res '
         r'judicata, estoppel, or the law of the case and requires service '
         r'of copies of cited unpublished dispositions of the fourth '
         r'circuit.?'),
        ('fifth circuit',
         r'(unpublished disposition )?notice: fifth circuit local rule '
         r'47.5.3 states that unpublished opinions should normally be '
         r'cited only when they establish the law of the case, are relied '
         r'upon as a basis for res judicata or collateral estoppel, or '
         r'involve related facts. if an unpublished opinion is cited, a '
         r'copy shall be attached to each copy of the brief.?'),
        ('sixth circuit',
         r'(unpublished disposition )?notice: sixth circuit rule 24\(c\) '
         r'states that citation of unpublished dispositions is disfavored '
         r'except for establishing res judicata, estoppel, or the law of '
         r'the case and requires service of copies of cited unpublished '
         r'dispositions of the sixth circuit.?'),
        ('seventh circuit',
         r'(unpublished disposition )?notice: seventh circuit rule '
         r'53\(b\)\(2\) states unpublished orders shall not be cited or '
         r'used as precedent except to support a claim of res judicata, '
         r'collateral estoppel or law of the case in any federal court '
         r'within the circuit.?'),
        ('eighth circuit',
         r'(unpublished disposition )?notice: eighth circuit rule 28a\(k\) '
         r'governs citation of unpublished opinions and provides that (no '
         r'party may cite an opinion not intended for publication unless '
         r'the cases are related by identity between the parties or the '
         r'causes of action|they are not precedent and generally should not '
         r'be cited unless relevant to establishing the doctrines of res '
         r'judicata, collateral estoppel, the law of the case, or if the '
         r'opinion has persuasive value on a material issue and no '
         r'published opinion would serve as well).?'),
        ('ninth circuit',
         r'(unpublished disposition )?notice: ninth circuit rule 36-3 '
         r'provides that dispositions other than opinions or orders '
         r'designated for publication are not precedential and should not '
         r'be cited except when relevant under the doctrines of law of the '
         r'case, res judicata, or collateral estoppel.?'),
        ('tenth circuit',
         r'(unpublished disposition )?notice: tenth circuit rule 36.3 '
         r'states that unpublished opinions and orders and judgments have '
         r'no precedential value and shall not be cited except for '
         r'purposes of establishing the doctrines of the law of the case, '
         r'res judicata, or collateral estoppel.?'),
        ('d.c. circuit',
         r'(unpublished disposition )?notice: d.c. circuit local rule '
         r'11\(c\) states that unpublished orders, judgments, and '
         r'explanatory memoranda may not be cited as precedents, but '
         r'counsel may refer to unpublished dispositions when the binding '
         r'or preclusive effect of the disposition, rather than its '
         r'quality as precedent, is relevant.?'),
        ('federal circuit',
         r'(unpublished disposition )?notice: federal circuit local rule '
         r'47.(6|8)\(b\) states that opinions and orders which are '
         r'designated as not citable as precedent shall not be employed or '
         r'cited as precedent. this does not preclude assertion of issues '
         r'of claim preclusion, issue preclusion, judicial estoppel, law '
         r'of the case or the like based on a decision of the court '
         r'rendered in a nonprecedential opinion or order.?'),
    )
    status = 'Published'
    for test, regex in regexes:
        # The substring check is a cheap filter before running the regex.
        if test in s and re.match(regex, s):
            s = re.sub(regex, '', s)
            status = 'Unpublished'

    s = titlecase(harmonize(clean_string(s)))
    return s, status
Пример #26
0
def format_case_name(n):
    """Return ``n`` lowercased, then harmonized, then titlecased."""
    normalized = harmonize(n.lower())
    return titlecase(normalized)
Пример #27
0
def process_free_opinion_result(self, row_pk, cnt):
    """Turn one row of the free opinion report into database records.

    The row is loaded by primary key, its court and case name fields are
    filled in, and then a docket, a docket entry and a RECAPDocument are
    looked up or created inside a single transaction.

    NOTE(review): ``self.request`` and ``self.retry`` are used below, so this
    is presumably registered as a bound Celery task -- the decorator is not
    visible here; confirm.

    :param row_pk: Primary key of the PACERFreeDocumentRow to process.
    :param cnt: Object exposing ``make_case_name_short``, used to compute the
    short case name for the row.
    :return: None whenever processing stops early (the docket could not be
    created, a database error occurred, or the document already exists and is
    available). Otherwise a dict with the keys ``result``, ``rd_pk`` and
    ``pacer_court_id``.
    """
    result = PACERFreeDocumentRow.objects.get(pk=row_pk)
    cl_court_id = map_pacer_to_cl_id(result.court_id)
    result.court = Court.objects.get(pk=cl_court_id)
    result.case_name = harmonize(result.case_name)
    result.case_name_short = cnt.make_case_name_short(result.case_name)

    # Work on a shallow copy with three attributes stripped before handing it
    # to lookup_and_save:
    #   date_filed -- otherwise the doc's date_filed becomes the docket's
    #                 date_filed. Bad.
    #   court_id   -- otherwise we get the PACER court id and it crashes.
    #   id         -- otherwise the id of the row tries to smash that of the
    #                 docket.
    docket_seed = copy.copy(result)
    for field_name in ('date_filed', 'court_id', 'id'):
        delattr(docket_seed, field_name)

    try:
        with transaction.atomic():
            docket = lookup_and_save(docket_seed)
            if not docket:
                failure = "Unable to create docket for %s" % result
                logger.error(failure)
                result.error_msg = failure
                result.save()
                # Clear the callbacks so nothing chained after us runs.
                self.request.callbacks = None
                return

            blocked, date_blocked = get_blocked_status(docket)
            docket.blocked = blocked
            docket.date_blocked = date_blocked
            docket.save()

            entry_defaults = {
                'date_filed': result.date_filed,
                'description': result.description,
            }
            entry, _entry_is_new = DocketEntry.objects.update_or_create(
                docket=docket,
                entry_number=result.document_number,
                defaults=entry_defaults
            )

            doc_defaults = {
                'pacer_doc_id': result.pacer_doc_id,
                'document_type': RECAPDocument.PACER_DOCUMENT,
                'is_free_on_pacer': True,
            }
            recap_doc, doc_is_new = RECAPDocument.objects.update_or_create(
                docket_entry=entry,
                document_number=result.document_number,
                attachment_number=None,
                defaults=doc_defaults
            )
    except IntegrityError as exc:
        failure = "Raised IntegrityError: %s" % exc
        logger.error(failure)
        if self.request.retries != self.max_retries:
            # Retries remain: hand the error back to the task machinery.
            raise self.retry(exc=exc)
        # Out of retries: note the failure on the row and give up.
        result.error_msg = failure
        result.save()
        return
    except DatabaseError as exc:
        failure = "Unable to complete database transaction:\n%s" % exc
        logger.error(failure)
        result.error_msg = failure
        result.save()
        self.request.callbacks = None
        return

    if doc_is_new or not recap_doc.is_available:
        # Either a fresh document or one we do not have yet: pass the details
        # along to whatever consumes this return value.
        return {
            'result': result,
            'rd_pk': recap_doc.pk,
            'pacer_court_id': result.court_id,
        }

    # The item already exists and is available. Fantastic, mark it as free,
    # drop the report row, and call it a day.
    recap_doc.is_free_on_pacer = True
    recap_doc.save()
    result.delete()
    self.request.callbacks = None
    return