示例#1
0
    def get_human_review(self, clusters, d):
        """Log the candidate clusters next to the SCDB row and pick one.

        Every candidate is logged with its index, primary key, the
        similarity of its case name to ``d["caseName"]``, its URL, case
        name, docket number (when set) and filing date. The SCDB case name,
        docket (when set) and decision date are logged afterwards.

        :param clusters: Indexable, sized collection of cluster objects.
            Each one is read for ``pk``, ``case_name``,
            ``docket.docket_number``, ``date_filed`` and
            ``get_absolute_url()``.
        :param d: The SCDB row as a dict; the ``"caseName"``, ``"docket"``
            and ``"dateDecision"`` keys are read.
        :return: ``clusters[0]`` when ``self.skip_human_review`` is truthy
            (``self.skipped_count`` is incremented as a side effect).
            Otherwise the cluster at the index typed by the user, or
            ``None`` when the answer is not an integer in
            ``[0, len(clusters) - 1]``.
        """
        scdb_name_lower = d["caseName"].lower()
        for i, cluster in enumerate(clusters):
            logger.info(
                "%s: Cluster %s (%0.3f sim):",
                i,
                cluster.pk,
                gen_diff_ratio(cluster.case_name.lower(), scdb_name_lower),
            )
            logger.info(
                "https://www.courtlistener.com%s", cluster.get_absolute_url()
            )
            # Log the text itself: encoding a str to bytes first would make
            # the log line read "b'...'" instead of the case name.
            logger.info("%s", cluster.case_name)
            if cluster.docket.docket_number:
                logger.info("%s", cluster.docket.docket_number)
            logger.info(cluster.date_filed)

        logger.info("SCDB info:")
        logger.info(d["caseName"])
        if d["docket"]:
            logger.info(d["docket"])
        logger.info(d["dateDecision"])

        if self.skip_human_review:
            logger.info(
                "Skipping human review and just returning the first item.")
            self.skipped_count += 1
            return clusters[0]

        choice = input("Which item should we update? [0-%s] " %
                       (len(clusters) - 1))
        try:
            index = int(choice)
        except ValueError:
            # Anything that is not an integer means "none of these".
            return None

        # Only accept the range advertised in the prompt. Without this check
        # a too-large number raised IndexError and a negative number
        # silently selected an item counted from the end.
        if not 0 <= index < len(clusters):
            return None
        return clusters[index]
示例#2
0
    def get_human_review(self, clusters, d):
        """Log the candidate clusters next to the SCDB row and pick one.

        Every candidate is logged with its index, primary key, the
        similarity of its case name to ``d['caseName']``, its URL, case
        name, docket number (when set) and filing date. The SCDB case name,
        docket (when set) and decision date are logged afterwards.

        :param clusters: Indexable, sized collection of cluster objects.
            Each one is read for ``pk``, ``case_name``,
            ``docket.docket_number``, ``date_filed`` and
            ``get_absolute_url()``.
        :param d: The SCDB row as a dict; the ``'caseName'``, ``'docket'``
            and ``'dateDecision'`` keys are read.
        :return: ``clusters[0]`` when ``self.skip_human_review`` is truthy
            (``self.skipped_count`` is incremented as a side effect).
            Otherwise the cluster at the index typed by the user, or
            ``None`` when the answer is not an integer in
            ``[0, len(clusters) - 1]``.
        """
        scdb_name_lower = d['caseName'].lower()
        for i, cluster in enumerate(clusters):
            logger.info(
                '%s: Cluster %s (%0.3f sim):',
                i,
                cluster.pk,
                gen_diff_ratio(cluster.case_name.lower(), scdb_name_lower),
            )
            logger.info('https://www.courtlistener.com%s',
                        cluster.get_absolute_url())
            logger.info('      %s', cluster.case_name.encode('utf-8'))
            if cluster.docket.docket_number:
                logger.info(cluster.docket.docket_number.encode('utf-8'))
            logger.info(cluster.date_filed)

        logger.info('SCDB info:')
        logger.info(d['caseName'])
        if d['docket']:
            logger.info(d['docket'])
        logger.info(d['dateDecision'])

        if self.skip_human_review:
            logger.info('Skipping human review and just returning the first '
                        'item.')
            self.skipped_count += 1
            return clusters[0]

        choice = raw_input('  Which item should we update? [0-%s] ' %
                           (len(clusters) - 1))
        try:
            index = int(choice)
        except ValueError:
            # Anything that is not an integer means "none of these".
            return None

        # Only accept the range advertised in the prompt. Without this check
        # a too-large number raised IndexError and a negative number
        # silently selected an item counted from the end.
        if not 0 <= index < len(clusters):
            return None
        return clusters[index]
示例#3
0
    def get_human_review(self, clusters, d):
        """Print the candidate clusters next to the SCDB row and pick one.

        Every candidate is printed with its index, primary key, the
        similarity of its case name to ``d['caseName']``, its URL, case
        name, docket number (when set) and filing date. The SCDB case name,
        docket (when set) and decision date are printed afterwards.

        All output uses ``print(<single expression>)``, which behaves the
        same as the ``print`` statement on Python 2 and also parses on
        Python 3.

        :param clusters: Indexable, sized collection of cluster objects.
            Each one is read for ``pk``, ``case_name``,
            ``docket.docket_number``, ``date_filed`` and
            ``get_absolute_url()``.
        :param d: The SCDB row as a dict; the ``'caseName'``, ``'docket'``
            and ``'dateDecision'`` keys are read.
        :return: ``clusters[0]`` when ``self.skip_human_review`` is truthy
            (``self.skipped_count`` is incremented as a side effect).
            Otherwise the cluster at the index typed by the user, or
            ``None`` when the answer is not an integer in
            ``[0, len(clusters) - 1]``.
        """
        scdb_name_lower = d['caseName'].lower()
        for i, cluster in enumerate(clusters):
            print('    %s: Cluster %s (%0.3f sim):' % (
                i,
                cluster.pk,
                gen_diff_ratio(cluster.case_name.lower(), scdb_name_lower),
            ))
            print('      https://www.courtlistener.com%s' %
                  cluster.get_absolute_url())
            print('      %s' % cluster.case_name.encode('utf-8'))
            if cluster.docket.docket_number:
                print('      %s' % cluster.docket.docket_number.encode('utf-8'))
            print('      %s' % cluster.date_filed)

        print('  SCDB info:')
        print('    %s' % d['caseName'])
        if d['docket']:
            print('    %s' % d['docket'])
        print('    %s' % d['dateDecision'])

        if self.skip_human_review:
            print('  Skipping human review and just returning the first item.')
            self.skipped_count += 1
            return clusters[0]

        choice = raw_input('  Which item should we update? [0-%s] ' %
                           (len(clusters) - 1))
        try:
            index = int(choice)
        except ValueError:
            # Anything that is not an integer means "none of these".
            return None

        # Only accept the range advertised in the prompt. Without this check
        # a too-large number raised IndexError and a negative number
        # silently selected an item counted from the end.
        if not 0 <= index < len(clusters):
            return None
        return clusters[index]
示例#4
0
def _docket_number_query(docket_number):
    """Return the ' OR '-joined searchable words of a docket number.

    A whitespace-separated word is kept only when it contains at least one
    digit and, once leading/trailing punctuation is stripped, contains no
    punctuation other than hyphens. Returns '' when no word qualifies.
    """
    # Compiled once here rather than once per word.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    clean_words = []
    for word in docket_number.split():
        if not re.search(r'\d', word):
            # Must have numbers.
            continue
        word = word.strip(string.punctuation)
        if punctuation_re.search(word.replace('-', '')):
            # Can only have hyphens after stripping.
            continue
        clean_words.append(word)
    return ' OR '.join(clean_words)


def get_dup_stats(doc, debug=True):
    """The heart of the duplicate algorithm. Returns stats about the case as
    compared to other cases already in the system. Other methods can call this
    one, and can make decisions based on the stats generated here.

    If no likely duplicates are encountered, only the (zero) candidate count
    is returned in the stats.

    Process:
        1. Refine the possible result set down to just a few candidates,
           querying Solr by case name, then (if nothing was found) by docket
           number, then (for court 'scotus' only) by citation.
        2. Keep only the candidates whose docket number matches exactly, if
           there are any; otherwise keep them all.
        3. Determine their likelihood of being duplicates according to a
           number of measures:
            - Similarity of case name
            - Comparison of content length
            - Gestalt difference and cosine similarity of the text

    :param doc: The document to check. Read for ``case_name``,
        ``date_filed``, ``body_text``, ``federal_cite_one``,
        ``docket.court_id`` and ``docket.docket_number``.
    :param debug: When truthy (the default, matching the previous hard-coded
        behaviour) the Solr parameters of every query are printed, and the
        flag is forwarded to ``make_case_name_solr_query``.
    :return: A ``(stats, candidates)`` tuple. ``stats`` always has
        ``'candidate_count'``; when there are candidates it also has
        ``'case_name_similarities'``, ``'length_diffs'``, ``'gestalt_diffs'``
        and ``'cos_sims'``, the last three being lists parallel to
        ``candidates``.
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')

    ##########################################
    # 1: Refine by date, court and case name #
    ##########################################
    main_params = make_case_name_solr_query(
        doc.case_name,
        doc.docket.court_id,
        doc.date_filed,
        DEBUG=debug,
    )
    main_params['caller'] = 'corpus_importer'
    if debug:
        print("    - main_params are: %s" % main_params)
    candidates = conn.raw_query(**main_params).execute()

    if not len(candidates) and doc.docket.docket_number is not None:
        # Try by docket number rather than case name
        docket_q = _docket_number_query(doc.docket.docket_number)
        if docket_q:
            main_params = {
                'fq': [
                    'court_exact:%s' % doc.docket.court_id,
                    'dateFiled:%s' % build_date_range(doc.date_filed, range=15),
                    'docketNumber:(%s)' % docket_q
                ],
                'rows': 100,
                'caller': 'corpus_importer',
            }
            if debug:
                print("    - main_params are: %s" % main_params)
            candidates = conn.raw_query(**main_params).execute()

    if (not len(candidates) and doc.docket.court_id == 'scotus' and
            doc.federal_cite_one):
        # Scotus case, try by citation, using only the digits of each word.
        citation_q = ' '.join(
            [re.sub(r"\D", '', w) for w in doc.federal_cite_one.split()]
        )
        main_params = {
            'fq': [
                'court_exact:%s' % doc.docket.court_id,
                # Creates ~6 month span.
                'dateFiled:%s' % build_date_range(doc.date_filed, range=90),
                'citation:(%s)' % citation_q
            ],
            'rows': 100,
            'caller': 'corpus_importer',
        }
        if debug:
            print("    - main_params are: %s" % main_params)
        candidates = conn.raw_query(**main_params).execute()

    if not len(candidates):
        return {'candidate_count': len(candidates)}, candidates

    #########################################
    # 2: Attempt filtering by docket number #
    #########################################
    # Two-step process. First we see if we have any exact hits.
    # Second, if there were exact hits, we forward those onwards. If not, we
    # forward everything.
    remaining_candidates = []
    if doc.docket.docket_number:
        # Compare docket numbers on their non-zero digits only: drop
        # anything that's not a digit, and drop zeroes too.
        insignificant_re = re.compile(r"(\D|0)")
        new_docket_number = insignificant_re.sub("", doc.docket.docket_number)
        for candidate in candidates:
            if candidate.get('docketNumber'):
                result_docket_number = insignificant_re.sub(
                    "", candidate['docketNumber'])
                if new_docket_number == result_docket_number:
                    remaining_candidates.append(candidate)

    if remaining_candidates:
        # We had one or more exact hits! Use those. Otherwise the candidates
        # from step one are passed through untouched.
        candidates = remaining_candidates

    stats = {'candidate_count': len(candidates)}

    ##############################
    # 3: Find the best case name #
    ##############################
    stats['case_name_similarities'] = find_confidences(candidates,
                                                       doc.case_name)

    #####################################################################
    # 4: Check content length, gestalt difference and cosine similarity #
    #####################################################################
    percent_diffs, gestalt_diffs, cos_sims = [], [], []
    new_stripped_content = re.sub(r'\W', '', doc.body_text).lower()
    new_length = len(new_stripped_content)
    for candidate in candidates:
        candidate_stripped_content = re.sub(
            r'\W', '', candidate['text']).lower()

        # Relative difference in stripped text length. A subtraction cannot
        # raise ZeroDivisionError, so only the division is guarded.
        length_diff = abs(len(candidate_stripped_content) - new_length)
        try:
            percent_diff = float(length_diff) / new_length
        except ZeroDivisionError:
            # The new document has no word characters at all.
            percent_diff = 0

        cos_sim = get_cosine_similarity(doc.body_text, candidate['text'])
        percent_diffs.append(percent_diff)
        gestalt_diffs.append(
            gen_diff_ratio(candidate_stripped_content, new_stripped_content))
        cos_sims.append(cos_sim)

    stats['length_diffs'] = percent_diffs
    stats['gestalt_diffs'] = gestalt_diffs
    stats['cos_sims'] = cos_sims

    return stats, candidates