Пример #1
0
def do_heuristic_match(idb_row, ds):
    """Choose the docket whose case name best resembles the IDB row's.

    Every candidate's case name is harmonized and handed to
    find_best_match together with a harmonized
    "<plaintiff> v. <defendant>" string built from the IDB row. The
    candidate at the reported match index is accepted only when the
    reported ratio is strictly greater than 0.65.

    :param idb_row: The FJC IDB row to match against
    :param ds: A list of Dockets that might match
    :returns: The best-matching Docket in ds if possible, else None
    """
    candidate_names = []
    for docket in ds:
        name = harmonize(docket.case_name)
        sides = name.lower().split(" v. ")
        if len(sides) == 2:
            # Exactly one " v. ": compare on the lowercased first 30
            # characters of each party (presumably to mirror the length of
            # the IDB party fields -- TODO confirm). Any other shape is
            # compared as the full harmonized name.
            name = "%s v. %s" % (sides[0][:30], sides[1][:30])
        candidate_names.append(name)

    idb_case_name = harmonize(
        "%s v. %s" % (idb_row.plaintiff, idb_row.defendant)
    )
    best = find_best_match(candidate_names, idb_case_name, case_sensitive=False)

    if best["ratio"] > 0.65:
        logger.info(
            "Found good match by case name for %s: %s",
            idb_case_name,
            best["match_str"],
        )
        # candidate_names was built in the same order as ds, so the match
        # index addresses the corresponding docket.
        return ds[best["match_index"]]

    logger.info(
        "No good match after office and case name filtering. Creating "
        "new item: %s",
        idb_row,
    )
    return None
Пример #2
0
    def do_second_pass(options):
        """Retry linking IDB rows that still have no docket.

        The first pass left ambiguous rows unmerged so they could be analysed
        later. This pass walks the CV_2017 IDB rows that have no docket, in
        primary-key order, and narrows the candidate dockets by core docket
        number, court and office prefix, dropping docket numbers containing
        'cr' and case names containing "sealed", "suppressed" or
        "search warrant".

        - No candidates: create_new_docket_from_idb is called for the row.
        - One candidate: merge_docket_with_idb is called with it.
        - Several candidates: case names are compared via find_best_match;
          the best one is merged when its ratio is strictly above 0.65,
          otherwise a new docket is created.

        :param options: Mapping with 'offset' (rows whose enumeration index
            is below it are skipped) and 'limit' (when greater than zero,
            processing stops once the index reaches it; the index also
            counts the skipped rows).
        """
        unlinked_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')

        for idx, row in enumerate(queryset_generator(unlinked_rows)):
            if idx < options['offset']:
                continue
            limit = options['limit']
            if limit > 0 and idx >= limit:
                break

            # Candidate dockets: same core number, court and office prefix,
            # minus docket numbers containing 'cr' and unwanted case names.
            candidates = Docket.objects.filter(
                docket_number_core=row.docket_number,
                court=row.district,
                docket_number__startswith='%s:' % row.office,
            ).exclude(docket_number__icontains='cr')
            for phrase in ("sealed", 'suppressed', 'search warrant'):
                candidates = candidates.exclude(case_name__icontains=phrase)
            total = candidates.count()

            if total == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            idx, row)
                create_new_docket_from_idb(row.pk)
                continue

            if total == 1:
                only_docket = candidates[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            idx, only_docket, row)
                merge_docket_with_idb(only_docket.pk, row.pk)
                continue

            logger.info(
                "%s: Still have %s results after office and civil "
                "docket number filtering. Filtering further.", idx, total)

            # Build one comparison string per candidate, in candidate order.
            names = []
            for docket in candidates:
                name = harmonize(docket.case_name)
                sides = name.lower().split(' v. ')
                if len(sides) == 2:
                    # Exactly one ' v. ': keep the lowercased first 30
                    # characters of each party; any other shape is compared
                    # as the full harmonized name.
                    name = '%s v. %s' % (sides[0][:30], sides[1][:30])
                names.append(name)

            idb_case_name = harmonize(
                '%s v. %s' % (row.plaintiff, row.defendant))
            best = find_best_match(names, idb_case_name,
                                   case_sensitive=False)

            if best['ratio'] > 0.65:
                logger.info("%s Found good match by case name for %s: %s",
                            idx, idb_case_name, best['match_str'])
                matched = candidates[best['match_index']]
                merge_docket_with_idb(matched.pk, row.pk)
                continue

            logger.info(
                "%s No good match after office and case name "
                "filtering. Creating new item: %s", idx, row)
            create_new_docket_from_idb(row.pk)
    def do_second_pass(options):
        """Make a second attempt at attaching unlinked IDB rows to dockets.

        Rows that the first pass left without a docket are revisited here
        with stricter filtering. CV_2017 IDB rows whose docket is null are
        processed in primary-key order. For each row the Docket table is
        queried by core docket number, court and an "<office>:" docket
        number prefix, excluding docket numbers containing 'cr' and case
        names containing "sealed", "suppressed" or "search warrant".

        Zero hits leads to create_new_docket_from_idb; a single hit leads to
        merge_docket_with_idb; multiple hits are ranked with find_best_match
        on case names, merging the top hit if its ratio exceeds 0.65 and
        creating a new docket otherwise.

        :param options: Mapping supplying 'offset' and 'limit'. Rows with an
            enumeration index below 'offset' are skipped. A 'limit' greater
            than zero stops the loop once the index (which includes skipped
            rows) reaches it.
        """
        pending = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')

        for position, idb_entry in enumerate(queryset_generator(pending)):
            if position < options['offset']:
                continue
            stop_at = options['limit']
            if stop_at > 0 and position >= stop_at:
                break

            # Start from the number/court/office match, then strip out
            # docket numbers containing 'cr' and unwanted case names.
            matches = Docket.objects.filter(
                docket_number_core=idb_entry.docket_number,
                court=idb_entry.district,
                docket_number__startswith='%s:' % idb_entry.office,
            )
            matches = matches.exclude(docket_number__icontains='cr')
            for unwanted in ("sealed", 'suppressed', 'search warrant'):
                matches = matches.exclude(case_name__icontains=unwanted)
            n_matches = matches.count()

            if n_matches == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            position, idb_entry)
                create_new_docket_from_idb(idb_entry.pk)
                continue

            if n_matches == 1:
                sole_match = matches[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            position, sole_match, idb_entry)
                merge_docket_with_idb(sole_match.pk, idb_entry.pk)
                continue

            logger.info("%s: Still have %s results after office and civil "
                        "docket number filtering. Filtering further.",
                        position, n_matches)

            # One comparison string per remaining docket, kept in the same
            # order so the match index can be used on the queryset below.
            comparison_names = []
            for candidate in matches:
                label = harmonize(candidate.case_name)
                halves = label.lower().split(' v. ')
                if len(halves) == 2:
                    # A single ' v. ' separator: compare on the lowercased
                    # first 30 characters of each side. Names with no
                    # separator or several are compared whole.
                    label = '%s v. %s' % (halves[0][:30], halves[1][:30])
                comparison_names.append(label)

            idb_case_name = harmonize(
                '%s v. %s' % (idb_entry.plaintiff, idb_entry.defendant)
            )
            outcome = find_best_match(comparison_names, idb_case_name,
                                      case_sensitive=False)

            if outcome['ratio'] > 0.65:
                logger.info("%s Found good match by case name for %s: %s",
                            position, idb_case_name, outcome['match_str'])
                winner = matches[outcome['match_index']]
                merge_docket_with_idb(winner.pk, idb_entry.pk)
                continue

            logger.info("%s No good match after office and case name "
                        "filtering. Creating new item: %s",
                        position, idb_entry)
            create_new_docket_from_idb(idb_entry.pk)