def query_dockets(query_string):
    """Identify the d_pks for all the dockets that we need to export

    :param query_string: The query to run as a URL-encoded string (typically starts
     with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: a set of docket PKs to export
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    page_size = 1000
    paginator = Paginator(search, page_size)
    d_pks = set()
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            d_pks.add(item["groupValue"])
    logger.info(
        "After %s pages, got back %s results.",
        len(paginator.page_range),
        len(d_pks),
    )
    return d_pks
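A minimal usage sketch, assuming configured Django settings and a reachable
Solr core (the query string follows the format described in the docstring):

d_pks = query_dockets("q=foo&type=r&order_by=dateFiled+asc&court=dcd")
print("Will export %s dockets" % len(d_pks))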
Example #2
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     search_form = SearchForm(obj.GET)
     if search_form.is_valid():
         cd = search_form.cleaned_data
         order_by = 'dateFiled'
         if cd['type'] == 'o':
             solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
         elif cd['type'] == 'r':
             solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
         else:
             return []
         main_params = search_utils.build_main_query(cd, highlight=False,
                                                     facet=False)
         main_params.update({
             'sort': '%s desc' % order_by,
             'rows': '20',
             'start': '0',
             'caller': 'SearchFeed',
         })
         # Eliminate items that lack the ordering field.
         main_params['fq'].append('%s:[* TO *]' % order_by)
         return solr.query().add_extra(**main_params).execute()
     else:
         return []
Example #3
def index_sitemap_maker(request):
    """Generate a sitemap index page

    Counts the number of cases on the site, divides by `items_per_sitemap`,
    and provides links to the resulting sitemap pages.
    """
    connection_string_sitemap_path_pairs = (
        (settings.SOLR_OPINION_URL, reverse('opinion_sitemap'), False),
        (settings.SOLR_RECAP_URL, reverse('recap_sitemap'), True),
        (settings.SOLR_AUDIO_URL, reverse('oral_argument_sitemap'), False),
        (settings.SOLR_PEOPLE_URL, reverse('people_sitemap'), False),
    )
    sites = []
    for connection_string, path, group in connection_string_sitemap_path_pairs:
        conn = ExtraSolrInterface(connection_string)
        count = conn.query().add_extra(**make_index_params(group)).count()
        num_pages = count // items_per_sitemap + 1  # floor division keeps this an int
        for i in range(1, num_pages + 1):
            sites.append('https://www.courtlistener.com%s?p=%s' % (path, i))

    # Random additional sitemaps.
    sites.extend([
        'https://www.courtlistener.com%s' % reverse('simple_pages_sitemap'),
        'https://www.courtlistener.com/sitemap-visualizations.xml',
    ])

    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})

    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
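For example, if `items_per_sitemap` were 250 (a value assumed purely for
illustration) and a core reported 1,000 matching items, `1000 // 250 + 1`
gives 5, so links `?p=1` through `?p=5` would be emitted for that path; the
formula always leaves one page of headroom, even on exact multiples.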
Example #4
def make_solr_sitemap(request, solr_url, params, changefreq, low_priority_pages,
                      url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get('p', 1))
    params['start'] = (page - 1) * items_per_sitemap
    results = solr.query().add_extra(**params).execute()

    urls = []
    cl = 'https://www.courtlistener.com'
    for result in results:
        result = normalize_grouping(result)
        url_strs = ['%s%s' % (cl, result[url_field])]
        if result.get('local_path') and \
                not result['local_path'].endswith('.xml'):
            url_strs.append('%s/%s' % (cl, result['local_path']))

        item = {}
        for url_str in url_strs:
            item['location'] = url_str
            item['changefreq'] = changefreq
            item['lastmod'] = result['timestamp']
            if any(s in url_str for s in low_priority_pages):
                item['priority'] = '0.3'
            else:
                item['priority'] = '0.5'
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
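Per the sitemaps.org protocol, priority ranges from 0.0 to 1.0 with 0.5 as
the default, so the '0.3' above demotes the low-priority pages relative to
the rest of the site.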
Example #5
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     search_form = SearchForm(obj.GET)
     if search_form.is_valid():
         cd = search_form.cleaned_data
         order_by = "dateFiled"
         if cd["type"] == SEARCH_TYPES.OPINION:
             solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
         elif cd["type"] == SEARCH_TYPES.RECAP:
             solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
         else:
             return []
         main_params = search_utils.build_main_query(
             cd, highlight=False, facet=False
         )
         main_params.update(
             {
                 "sort": "%s desc" % order_by,
                 "rows": "20",
                 "start": "0",
                 "caller": "SearchFeed",
             }
         )
         # Eliminate items that lack the ordering field.
         main_params["fq"].append("%s:[* TO *]" % order_by)
         items = solr.query().add_extra(**main_params).execute()
         solr.conn.http_connection.close()
         return items
     else:
         return []
Example #6
def coverage_data(request, version, court):
    """Provides coverage data for a court.

    Responds to either AJAX or regular requests.
    """

    if court != "all":
        court_str = get_object_or_404(Court, pk=court).pk
    else:
        court_str = "all"
    q = request.GET.get("q")
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    facet_field = "dateFiled"
    response = (
        si.query()
        .add_extra(**build_coverage_query(court_str, q, facet_field))
        .execute()
    )
    si.conn.http_connection.close()
    counts = response.facet_counts.facet_ranges[facet_field]["counts"]
    counts = strip_zero_years(counts)

    # Calculate the totals
    annual_counts = {}
    total_docs = 0
    for date_string, count in counts:
        annual_counts[date_string[:4]] = count
        total_docs += count

    return JsonResponse(
        {"annual_counts": annual_counts, "total": total_docs},
        safe=True,
    )
Example #7
def search_db_for_fullcitation(
    full_citation: FullCaseCitation,
) -> SolrResponse:
    """For a citation object, try to match it to an item in the database using
    a variety of heuristics.
    Returns:
      - a Solr Result object with the results, or an empty list if no hits
    """
    if not hasattr(full_citation, "citing_opinion"):
        full_citation.citing_opinion = None

    # TODO: Create shared solr connection for all queries
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    main_params: SearchParam = {
        "q": "*",
        "fq": [
            "status:Precedential",  # Non-precedential documents aren't cited
        ],
        "caller": "citation.match_citations.match_citation",
    }
    if full_citation.citing_opinion is not None:
        # Eliminate self-cites.
        main_params["fq"].append("-id:%s" % full_citation.citing_opinion.pk)
    # Set up filter parameters
    if full_citation.year:
        start_year = end_year = full_citation.year
    else:
        start_year, end_year = get_years_from_reporter(full_citation)
        if (
            full_citation.citing_opinion is not None
            and full_citation.citing_opinion.cluster.date_filed
        ):
            end_year = min(
                end_year, full_citation.citing_opinion.cluster.date_filed.year
            )
    main_params["fq"].append(
        "dateFiled:%s" % build_date_range(start_year, end_year)
    )

    if full_citation.court:
        main_params["fq"].append("court_exact:%s" % full_citation.court)

    # Take 1: Use a phrase query to search the citation field.
    main_params["fq"].append('citation:("%s")' % full_citation.base_citation())
    results = si.query().add_extra(**main_params).execute()
    si.conn.http_connection.close()
    if len(results) == 1:
        return results
    if len(results) > 1:
        if (
            full_citation.citing_opinion is not None
            and full_citation.defendant
        ):  # Refine using defendant, if there is one
            results = case_name_query(
                si, main_params, full_citation, full_citation.citing_opinion
            )
            return results

    # Give up.
    return []
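A hedged usage sketch: `full_citation` would typically come from a citation
extractor such as eyecite's `get_citations` (import path assumed here):

from eyecite import get_citations  # assumed import path

citations = get_citations("Foo v. Bar, 1 U.S. 1 (1800)")  # hypothetical text
results = search_db_for_fullcitation(citations[0])
if len(results) == 1:
    matched = results[0]  # a single, unambiguous hit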
Example #8
def index_sitemap_maker(request):
    """Generate a sitemap index page

    Counts the number of cases on the site, divides by `items_per_sitemap`,
    and provides links to the resulting sitemap pages.
    """
    connection_string_sitemap_path_pairs = (
        (settings.SOLR_OPINION_URL, reverse('opinion_sitemap'), False),
        (settings.SOLR_RECAP_URL, reverse('recap_sitemap'), True),
        (settings.SOLR_AUDIO_URL, reverse('oral_argument_sitemap'), False),
        (settings.SOLR_PEOPLE_URL, reverse('people_sitemap'), False),
    )
    sites = []
    for connection_string, path, group in connection_string_sitemap_path_pairs:
        conn = ExtraSolrInterface(connection_string)
        count = conn.query().add_extra(**make_index_params(group)).count()
        num_pages = count // items_per_sitemap + 1  # floor division keeps this an int
        for i in range(1, num_pages + 1):
            sites.append('https://www.courtlistener.com%s?p=%s' % (path, i))

    # Random additional sitemaps.
    sites.extend([
        'https://www.courtlistener.com%s' % reverse('simple_pages_sitemap'),
        'https://www.courtlistener.com/sitemap-visualizations.xml',
    ])

    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})

    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
Example #9
def docket_pks_for_query(query_string):
    """Yield docket PKs for a query by iterating over the full result set

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: The next docket PK in the results
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    si.conn.http_connection.close()
    page_size = 100
    paginator = Paginator(search, page_size)
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            yield item["groupValue"]
Example #10
def get_citing_clusters_with_cache(
    cluster: OpinionCluster,
) -> Tuple[list, int]:
    """Use Solr to get clusters citing the one we're looking at

    :param cluster: The cluster we're targeting
    :type cluster: OpinionCluster
    :return: A tuple of the list of solr results and the number of results
    """
    cache_key = "citing:%s" % cluster.pk
    cache = caches["db_cache"]
    cached_results = cache.get(cache_key)
    if cached_results is not None:
        return cached_results

    # Cache miss. Get the citing results from Solr
    sub_opinion_pks = cluster.sub_opinions.values_list("pk", flat=True)
    ids_str = " OR ".join([str(pk) for pk in sub_opinion_pks])
    q = {
        "q": "cites:(%s)" % ids_str,
        "rows": 5,
        "start": 0,
        "sort": "citeCount desc",
        "caller": "view_opinion",
        "fl": "absolute_url,caseName,dateFiled",
    }
    conn = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    results = conn.query().add_extra(**q).execute()
    conn.conn.http_connection.close()
    citing_clusters = list(results)
    citing_cluster_count = results.result.numFound
    a_week = 60 * 60 * 24 * 7
    cache.set(cache_key, (citing_clusters, citing_cluster_count), a_week)

    return citing_clusters, citing_cluster_count
Example #11
def make_solr_sitemap(request, solr_url, params, changefreq, low_priority_pages,
                      url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get('p', 1))
    court = request.GET['court']
    params['start'] = (page - 1) * items_per_sitemap
    params['fq'] = ['court_exact:%s' % court]
    results = solr.query().add_extra(**params).execute()

    urls = []
    cl = 'https://www.courtlistener.com'
    for result in results:
        result = normalize_grouping(result)
        url_strs = ['%s%s' % (cl, result[url_field])]
        if result.get('local_path') and \
                not result['local_path'].endswith('.xml'):
            url_strs.append('%s/%s' % (cl, result['local_path']))

        item = {}
        for url_str in url_strs:
            item['location'] = url_str
            item['changefreq'] = changefreq
            item['lastmod'] = result['timestamp']
            if any(s in url_str for s in low_priority_pages):
                item['priority'] = '0.3'
            else:
                item['priority'] = '0.5'
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
Example #12
 def __init__(self, main_query, offset, limit, type=None, length=None):
     super(SolrList, self).__init__()
     self.main_query = main_query
     self.offset = offset
     self.limit = limit
     self.type = type
     self._item_cache = []
     if self.type == 'o':
         self.conn = ExtraSolrInterface(
             settings.SOLR_OPINION_URL,
             mode='r',
         )
     elif self.type == 'oa':
         self.conn = ExtraSolrInterface(
             settings.SOLR_AUDIO_URL,
             mode='r',
         )
     elif self.type == 'r':
         self.conn = ExtraSolrInterface(
             settings.SOLR_RECAP_URL,
             mode='r',
         )
     elif self.type == 'p':
         self.conn = ExtraSolrInterface(
             settings.SOLR_PEOPLE_URL,
             mode='r',
         )
     self._length = length
Example #13
def make_solr_sitemap(request, solr_url, params, changefreq,
                      low_priority_pages, url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get("p", 1))
    court = request.GET["court"]
    params["start"] = (page - 1) * items_per_sitemap
    params["fq"] = ["court_exact:%s" % court]
    results = solr.query().add_extra(**params).execute()
    solr.conn.http_connection.close()
    urls = []
    cl = "https://www.courtlistener.com"
    for result in results:
        result = normalize_grouping(result)
        url_strs = ["%s%s" % (cl, result[url_field])]
        if result.get("local_path") and \
                not result["local_path"].endswith(".xml"):
            url_strs.append("%s/%s" % (cl, result["local_path"]))

        item = {}
        for url_str in url_strs:
            item["location"] = url_str
            item["changefreq"] = changefreq
            item["lastmod"] = result["timestamp"]
            if any(s in url_str for s in low_priority_pages):
                item["priority"] = "0.3"
            else:
                item["priority"] = "0.5"
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string("sitemap.xml", {"urlset": urls}))
    response = HttpResponse(xml, content_type="application/xml")
    response["X-Robots-Tag"] = "noindex, noodp, noarchive, noimageindex"
    return response
Example #14
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        self.verbosity = int(options.get("verbosity", 1))
        self.options = options
        self.noinput = options["noinput"]
        if not self.options["optimize_everything"]:
            self.solr_url = options["solr_url"]
            self.si = ExtraSolrInterface(self.solr_url, mode="rw")
            self.type = options["type"]

        if options["update"]:
            if self.verbosity >= 1:
                self.stdout.write("Running in update mode...\n")
            if options.get("everything"):
                self.add_or_update_all()
            elif options.get("datetime"):
                self.add_or_update_by_datetime(options["datetime"])
            elif options.get("query"):
                self.stderr.write("Updating by query not implemented.")
                sys.exit(1)
            elif options.get("items"):
                self.add_or_update(*options["items"])

        elif options.get("delete"):
            if self.verbosity >= 1:
                self.stdout.write("Running in deletion mode...\n")
            if options.get("everything"):
                self.delete_all()
            elif options.get("datetime"):
                self.delete_by_datetime(options["datetime"])
            elif options.get("query"):
                self.delete_by_query(options["query"])
            elif options.get("items"):
                self.delete(*options["items"])

        if options.get("do_commit"):
            self.si.commit()

        if options.get("optimize"):
            self.optimize()

        if options.get("optimize_everything"):
            self.optimize_everything()

        if not self.options["optimize_everything"]:
            # self.si is only opened above when optimize_everything is off,
            # so guard the close to avoid an AttributeError.
            self.si.conn.http_connection.close()
        if not any(
            [
                options["update"],
                options.get("delete"),
                options.get("do_commit"),
                options.get("optimize"),
                options.get("optimize_everything"),
            ]
        ):
            self.stderr.write(
                "Error: You must specify whether you wish to "
                "update, delete, commit, or optimize your "
                "index.\n"
            )
            sys.exit(1)
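A hedged invocation sketch; the command's registered name and its --type
choices are not shown in this excerpt, so both are assumed:

python manage.py cl_update_index --solr-url http://localhost:8983/solr/collection1 \
    --type opinions --update --everything --do-commit --noinput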
Example #15
 def __init__(self, *args, **kwargs):
     super(Command, self).__init__(*args, **kwargs)
     self.connections = {
         'o': ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r'),
         'oa': ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r'),
     }
     self.options = {}
     self.valid_ids = {}
Example #16
def make_court_variable():
    courts = Court.objects.exclude(jurisdiction=Court.TESTING_COURT)
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    response = si.query().add_extra(**build_court_count_query()).execute()
    si.conn.http_connection.close()
    court_count_tuples = response.facet_counts.facet_fields["court_exact"]
    courts = annotate_courts_with_counts(courts, court_count_tuples)
    return courts
Example #17
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example #18
 def __init__(self, *args, **kwargs):
     super(Command, self).__init__(*args, **kwargs)
     self.connections = {
         "o": ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r"),
         "oa": ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r"),
         "r": ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r"),
     }
     self.options = {}
     self.valid_ids = {}
Example #19
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
Example #20
 def items(self, obj):
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     params = {
         'q': '*',
         'sort': 'dateArgued desc',
         'rows': '20',
         'start': '0',
         'caller': 'AllJurisdictionsPodcast',
     }
     return solr.query().add_extra(**params).execute()
Example #21
 def items(self, obj):
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r")
     params = {
         "q": "*",
         "sort": "dateArgued desc",
         "rows": "20",
         "start": "0",
         "caller": "AllJurisdictionsPodcast",
     }
     return solr.query().add_extra(**params).execute()
Example #22
 def items(self, obj):
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     params = {
         'q': '*',
         'sort': 'dateArgued desc',
         'rows': '20',
         'start': '0',
         'caller': 'AllJurisdictionsPodcast',
     }
     return solr.query().add_extra(**params).execute()
Example #23
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        self.verbosity = int(options.get('verbosity', 1))
        self.options = options
        self.noinput = options['noinput']
        if not self.options['optimize_everything']:
            self.solr_url = options['solr_url']
            self.si = ExtraSolrInterface(self.solr_url, mode='rw')
            self.type, self.type_str = options['type']

        if options['update']:
            if self.verbosity >= 1:
                self.stdout.write('Running in update mode...\n')
            if options.get('everything'):
                self.add_or_update_all()
            elif options.get('datetime'):
                self.add_or_update_by_datetime(options['datetime'])
            elif options.get('query'):
                self.stderr.write("Updating by query not implemented.")
                sys.exit(1)
            elif options.get('items'):
                self.add_or_update(*options['items'])

        elif options.get('delete'):
            if self.verbosity >= 1:
                self.stdout.write('Running in deletion mode...\n')
            if options.get('everything'):
                self.delete_all()
            elif options.get('datetime'):
                self.delete_by_datetime(options['datetime'])
            elif options.get('query'):
                self.delete_by_query(options['query'])
            elif options.get('items'):
                self.delete(*options['items'])

        if options.get('do_commit'):
            self.si.commit()

        if options.get('optimize'):
            self.optimize()

        if options.get('optimize_everything'):
            self.optimize_everything()

        if not any([
                options['update'],
                options.get('delete'),
                options.get('do_commit'),
                options.get('optimize'),
                options.get('optimize_everything')
        ]):
            self.stderr.write('Error: You must specify whether you wish to '
                              'update, delete, commit, or optimize your '
                              'index.\n')
            sys.exit(1)
Example #24
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result["docket_id"])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
Example #25
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
     params = {
         "q": "*",
         "sort": "dateFiled desc",
         "rows": "20",
         "start": "0",
         "caller": "AllJurisdictionsFeed",
     }
     return solr.query().add_extra(**params).execute()
Example #26
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
     params = {
         'q': '*',
         'sort': 'dateFiled desc',
         'rows': '20',
         'start': '0',
         'caller': 'AllJurisdictionsFeed',
     }
     return solr.query().add_extra(**params).execute()
Example #27
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.sis = {
         SEARCH_TYPES.OPINION:
         ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r"),
         SEARCH_TYPES.ORAL_ARGUMENT:
         ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r"),
         SEARCH_TYPES.RECAP:
         ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r"),
     }
     self.options = {}
     self.valid_ids = {}
Example #28
 def optimize_everything(self):
     """Run the optimize command on all indexes."""
     urls = settings.SOLR_URLS.values()
     self.stdout.write("Found %s indexes. Optimizing...\n" % len(urls))
     for url in urls:
         self.stdout.write(" - {url}\n".format(url=url))
         try:
             si = ExtraSolrInterface(url)
         except EnvironmentError:
             self.stderr.write("   Couldn't load schema!")
             continue
         si.optimize()
     self.stdout.write('Done.\n')
Example #29
 def __init__(self, main_query, offset, type, length=None):
     super(SolrList, self).__init__()
     self.main_query = main_query
     self.offset = offset
     self.type = type
     self._item_cache = []
     if self.type == SEARCH_TYPES.OPINION:
         self.conn = ExtraSolrInterface(
             settings.SOLR_OPINION_URL,
             mode="r",
         )
     elif self.type == SEARCH_TYPES.ORAL_ARGUMENT:
         self.conn = ExtraSolrInterface(
             settings.SOLR_AUDIO_URL,
             mode="r",
         )
     elif self.type == SEARCH_TYPES.RECAP:
         self.conn = ExtraSolrInterface(
             settings.SOLR_RECAP_URL,
             mode="r",
         )
     elif self.type == SEARCH_TYPES.PEOPLE:
         self.conn = ExtraSolrInterface(
             settings.SOLR_PEOPLE_URL,
             mode="r",
         )
     self._length = length
Example #30
 def __init__(self, main_query, offset, limit, type=None, length=None):
     super(SolrList, self).__init__()
     self.main_query = main_query
     self.offset = offset
     self.limit = limit
     self.type = type
     self._item_cache = []
     if self.type == 'o':
         self.conn = ExtraSolrInterface(
             settings.SOLR_OPINION_URL,
             mode='r',
         )
     elif self.type == 'oa':
         self.conn = ExtraSolrInterface(
             settings.SOLR_AUDIO_URL,
             mode='r',
         )
     elif self.type == 'r':
         self.conn = ExtraSolrInterface(
             settings.SOLR_RECAP_URL,
             mode='r',
         )
     elif self.type == 'p':
         self.conn = ExtraSolrInterface(
             settings.SOLR_PEOPLE_URL,
             mode='r',
         )
     self._length = length
Example #31
 def items(self, obj):
     """
     Returns a list of items to publish in this feed.
     """
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r")
     params = {
         "q": "*",
         "fq": "court_exact:%s" % obj.pk,
         "sort": "dateArgued desc",
         "rows": "20",
         "start": "0",
         "caller": "JurisdictionPodcast",
     }
     return solr.query().add_extra(**params).execute()
Example #32
def get_docket_ids(main_query):
    """Get the docket IDs for a query dict.

    :returns: a set() of docket IDs
    """
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    docket_ids = set()

    for result in results:
        docket_ids.add(result['docket_id'])

    logger.info("Got %s docket IDs back from Solr." % len(docket_ids))
    return docket_ids
Example #33
 def items(self, obj):
     """
     Returns a list of items to publish in this feed.
     """
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     params = {
         'q': '*',
         'fq': 'court_exact:%s' % obj.pk,
         'sort': 'dateArgued desc',
         'rows': '20',
         'start': '0',
         'caller': 'JurisdictionPodcast',
     }
     return solr.query().add_extra(**params).execute()
Example #34
 def items(self, obj):
     """
     Returns a list of items to publish in this feed.
     """
     solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     params = {
         'q': '*',
         'fq': 'court_exact:%s' % obj.pk,
         'sort': 'dateArgued desc',
         'rows': '20',
         'start': '0',
         'caller': 'JurisdictionPodcast',
     }
     return solr.query().add_extra(**params).execute()
Example #36
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
     params = {
         "q": "*",
         "fq": "court_exact:%s" % obj.pk,
         "sort": "dateFiled desc",
         "rows": "20",
         "start": "0",
         "caller": "JurisdictionFeed",
     }
     items = solr.query().add_extra(**params).execute()
     solr.conn.http_connection.close()
     return items
Example #37
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example #38
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
Example #39
 def items(self, obj):
     search_form = SearchForm(obj.GET)
     if search_form.is_valid():
         cd = search_form.cleaned_data
         solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
         main_params = search_utils.build_main_query(cd, highlight=False,
                                                     facet=False)
         main_params.update({
             'sort': 'dateArgued desc',
             'rows': '20',
             'start': '0',
             'caller': 'SearchFeed',
         })
         return solr.query().add_extra(**main_params).execute()
     else:
         return []
Example #40
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     search_form = SearchForm(obj.GET)
     if search_form.is_valid():
         cd = search_form.cleaned_data
         if cd['type'] == 'o':
             solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
         elif cd['type'] == 'r':
             solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
         else:
             # Avoid an UnboundLocalError for unsupported search types.
             return []
         main_params = search_utils.build_main_query(cd, highlight=False)
         main_params.update({
             'sort': 'dateFiled desc',
             'rows': '20',
             'start': '0',
             'caller': 'SearchFeed',
         })
         return solr.query().add_extra(**main_params).execute()
     else:
         return []
Example #41
    def get_expected_item_count(self):
        # OpinionsSitemap uses the Solr index to generate the page, so the
        # only accurate count comes from the index itself, which will also be
        # based on the fixtures.
        conn = ExtraSolrInterface(settings.SOLR_OPINION_URL)
        params = make_sitemap_solr_params('dateFiled asc', 'o_sitemap')
        params['rows'] = items_per_sitemap
        params['fq'] = ['court_exact:%s' % self.court_id]

        r = conn.query().add_extra(**params).execute()

        # The underlying SitemapTest relies on counting URL elements in the
        # XML response. This logic mimics the creation of the XML, so we at
        # least know what we *should* be getting for a count if the
        # SitemapTest's HTTP client-based test gets an HTTP 200.
        count = 0
        for result in r:
            if result.get('local_path'):
                count += 2
            else:
                count += 1
        return count
Example #42
def get_solr_result_objects(cd, facet):
    """Note that this doesn't run the query yet. Not until the
    pagination is run.
    """
    search_type = cd['type']
    if search_type == 'o':
        si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'r':
        si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'oa':
        si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'p':
        si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    else:
        raise NotImplementedError("Unknown search type: %s" % search_type)

    return results
Example #43
def do_search(request, rows=20, order_by=None, type=None, facet=True):

    query_citation = None
    error = False
    paged_results = None
    search_form = SearchForm(request.GET)
    courts = Court.objects.filter(in_use=True)

    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by is not None:
            cd['order_by'] = order_by
        if type is not None:
            cd['type'] = type
        search_form = _clean_form(request, cd, courts)

        if cd['type'] == 'o':
            si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
            query_citation = get_query_citation(cd)
        elif cd['type'] == 'r':
            si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
        elif cd['type'] == 'oa':
            si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
        elif cd['type'] == 'p':
            si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))

        # Set up pagination
        try:
            if cd['type'] == 'r':
                rows = 10
            paginator = Paginator(results, rows)
            page = request.GET.get('page', 1)
            try:
                paged_results = paginator.page(page)
            except PageNotAnInteger:
                paged_results = paginator.page(1)
            except EmptyPage:
                # Page is out of range (e.g. 9999), deliver last page.
                paged_results = paginator.page(paginator.num_pages)
        except Exception as e:
            # Catches any Solr errors, and aborts.
            logger.warning("Error loading pagination on search page with "
                           "request: %s" % request.GET)
            logger.warning("Error was: %s" % e)
            if settings.DEBUG is True:
                traceback.print_exc()
            error = True

        # Post processing of the results
        regroup_snippets(paged_results)

    else:
        error = True

    courts, court_count_human, court_count = merge_form_with_courts(courts,
                                                                    search_form)
    return {
        'results': paged_results,
        'search_form': search_form,
        'courts': courts,
        'court_count_human': court_count_human,
        'court_count': court_count,
        'query_citation': query_citation,
        'facet_fields': make_stats_variable(search_form, paged_results),
        'error': error,
    }
Example #44
def do_search(request, rows=20, order_by=None, type=None):

    search_form = SearchForm(request.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by is not None:
            cd['order_by'] = order_by
        if type is not None:
            cd['type'] = type
        search_form = _clean_form(request, cd)

        try:
            query_citation = None
            status_facets = None
            if cd['type'] == 'o':
                si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
                stat_facet_fields = place_facet_queries(cd, si)
                status_facets = make_stats_variable(stat_facet_fields,
                                                    search_form)
                query_citation = get_query_citation(cd)
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'r':
                si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'oa':
                si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'p':
                si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))

            courts = Court.objects.filter(in_use=True)
            courts, court_count_human, court_count = merge_form_with_courts(
                courts,
                search_form
            )

        except Exception as e:
            if settings.DEBUG is True:
                traceback.print_exc()
            logger.warning("Error loading search with request: %s" % request.GET)
            logger.warning("Error was %s" % e)
            return {'error': True}

    else:
        # Invalid form, send it back
        logger.warning("Invalid form when loading search page with request: %s" % request.GET)
        return {'error': True}

    # Set up pagination
    try:
        paginator = Paginator(results, rows)
        page = request.GET.get('page', 1)
        try:
            paged_results = paginator.page(page)
        except PageNotAnInteger:
            # If page is not an integer, deliver first page.
            paged_results = paginator.page(1)
        except EmptyPage:
            # If page is out of range (e.g. 9999), deliver last page of results.
            paged_results = paginator.page(paginator.num_pages)
    except Exception as e:
        # Catches any Solr errors, and aborts.
        logger.warning("Error loading pagination on search page with request: %s" % request.GET)
        logger.warning("Error was: %s" % e)
        if settings.DEBUG is True:
            traceback.print_exc()
        return {'error': True}
Example #45
class SolrList(object):
    """This implements a yielding list object that fetches items as they are
    queried.
    """

    def __init__(self, main_query, offset, limit, type=None, length=None):
        super(SolrList, self).__init__()
        self.main_query = main_query
        self.offset = offset
        self.limit = limit
        self.type = type
        self._item_cache = []
        if self.type == 'o':
            self.conn = ExtraSolrInterface(
                settings.SOLR_OPINION_URL,
                mode='r',
            )
        elif self.type == 'oa':
            self.conn = ExtraSolrInterface(
                settings.SOLR_AUDIO_URL,
                mode='r',
            )
        elif self.type == 'r':
            self.conn = ExtraSolrInterface(
                settings.SOLR_RECAP_URL,
                mode='r',
            )
        elif self.type == 'p':
            self.conn = ExtraSolrInterface(
                settings.SOLR_PEOPLE_URL,
                mode='r',
            )
        self._length = length

    def __len__(self):
        if self._length is None:
            mq = self.main_query.copy()  # local copy for manipulation
            mq['caller'] = 'api_search_count'
            count = self.conn.query().add_extra(**mq).count()
            self._length = count
        return self._length

    def __iter__(self):
        for item in range(0, len(self)):
            try:
                yield self._item_cache[item]
            except IndexError:
                yield self.__getitem__(item)

    def __getitem__(self, item):
        self.main_query['start'] = self.offset
        r = self.conn.query().add_extra(**self.main_query).execute()

        if r.group_field is None:
            # Pull the text snippet up a level
            for result in r.result.docs:
                result['snippet'] = '&hellip;'.join(
                        result['solr_highlights']['text'])
                self._item_cache.append(SolrObject(initial=result))
        else:
            # Grouped results
            for group in getattr(r.groups, r.group_field)['groups']:
                snippets = []
                for doc in group['doclist']['docs']:
                    for snippet in doc['solr_highlights']['text']:
                        if snippet not in snippets:
                            snippets.append(snippet)

                doc0 = group['doclist']['docs'][0]
                doc0['snippet'] = '&hellip;'.join(snippets)
                self._item_cache.append(SolrObject(initial=doc0))

        # Now, assuming our _item_cache is all set, we just get the item.
        if isinstance(item, slice):
            s = slice(item.start - int(self.offset),
                      item.stop - int(self.offset),
                      item.step)
            return self._item_cache[s]
        else:
            # Not slicing.
            try:
                return self._item_cache[item]
            except IndexError:
                # No results!
                return []

    def append(self, p_object):
        """Lightly override the append method so we get items duplicated in
        our cache.
        """
        self._item_cache.append(p_object)
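A sketch of how `SolrList` plugs into Django's Paginator (cf. `do_search`
above); `main_query` is assumed to be a dict built by `build_main_query()`:

sl = SolrList(main_query=main_query, offset=0, limit=20, type='o')
paginator = Paginator(sl, 20)  # len(sl) issues a Solr count query
page_one = paginator.page(1)   # slicing triggers the actual fetch
for result in page_one:
    ...  # each result is a SolrObject wrapping a raw Solr doc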