def handle(self, *args, **options):
    """Validate the options, open Solr connections, and send the alerts."""
    self.verbosity = int(options.get('verbosity', 1))
    self.options = options
    self.rate = options.get('rate')

    # The rate is mandatory and must be one of the known FREQUENCY values.
    valid_rates = dict(FREQUENCY)
    if not self.rate:
        self.stderr.write("You must specify a rate")
        exit(1)
    if self.rate not in valid_rates:
        self.stderr.write("Invalid rate. Rate must be one of: %s" %
                          ', '.join(valid_rates.keys()))
        exit(1)

    # Read-only connections, keyed by search type.
    self.connections = {
        'o': sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r'),
        'oa': sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r'),
    }

    # Real-time alerts only consider the ids queued since the last run.
    if self.rate == 'rt':
        self.valid_ids = self.get_new_ids()

    if self.options['simulate']:
        logger.info("******************************************\n"
                    "* SIMULATE MODE - NO EMAILS WILL BE SENT *\n"
                    "******************************************\n")

    self.send_emails()
    self.clean_rt_queue()
def run_query(self, alert, cut_off_date): results = None error = False try: if self.verbosity >= 1: print "Now running the query: %s" % alert.alertText # Set up the data data = search_utils.get_string_to_dict(alert.alertText) try: del data['filed_before'] except KeyError: pass data['order_by'] = 'score desc' if self.verbosity >= 1: print " Data sent to SearchForm is: %s" % data search_form = SearchForm(data) if search_form.is_valid(): cd = search_form.cleaned_data if cd['type'] == 'o': cd['filed_after'] = cut_off_date elif cd['type'] == 'oa': cd['argued_after'] = cut_off_date main_params = search_utils.build_main_query(cd) main_params.update({ 'rows': '20', 'start': '0', 'hl.tag.pre': '<em><strong>', 'hl.tag.post': '</strong></em>', 'caller': 'cl_send_alerts', }) if cd['type'] == 'o': conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r') elif cd['type'] == 'oa': conn = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r') results = conn.raw_query(**main_params).execute() else: print " Query for alert %s was invalid" % alert.alertText print " Errors from the SearchForm: %s" % search_form.errors error = True except: traceback.print_exc() print " Search for this alert failed: %s" % alert.alertText error = True if self.verbosity >= 1: if results: print " There were %s results" % len(results) else: print " There were no results" if self.verbosity >= 2: print " The value of results is: %s" % results return error, cd['type'], results,
def add_or_update_items(items, solr_url=settings.SOLR_OPINION_URL): """Adds an item to a solr index. This function is for use with the update_index command. It's slightly different than the commands below because it expects a Django object, rather than a primary key. This rejects the standard Celery advice about not passing objects around, but thread safety shouldn't be an issue since this is only used by the update_index command, and we want to query and build the SearchDocument objects in the task, not in its caller. """ si = sunburnt.SolrInterface(solr_url, mode='w') if hasattr(items, "items") or not hasattr(items, "__iter__"): # If it's a dict or a single item make it a list items = [items] search_item_list = [] for item in items: try: if type(item) == Audio: search_item_list.append(SearchAudioFile(item)) elif type(item) == Document: search_item_list.append(SearchDocument(item)) except AttributeError: print "AttributeError trying to add doc.pk: %s" % item.pk except InvalidDocumentError: print "Unable to parse document %s" % item.pk try: si.add(search_item_list) except socket.error, exc: add_or_update_items.retry(exc=exc, countdown=120)
def add_or_update_audio_files(item_pks):
    """Add or update the given Audio items in the *audio* Solr index.

    Bug fix: this function previously wrote Audio objects into the opinion
    index (SOLR_OPINION_URL) wrapped in SearchDocument. Audio items belong
    in the audio index wrapped in SearchAudioFile — compare
    add_or_update_audio_file, which does the single-item equivalent.
    """
    si = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='w')
    item_list = []
    for pk in item_pks:
        item = Audio.objects.get(pk=pk)
        item_list.append(SearchAudioFile(item))
    si.add(item_list)
    si.commit()
def opinion_sitemap_maker(request):
    """Generate one page of the opinion sitemap.

    The page number comes from the ``p`` GET parameter. Robustness fix: a
    missing or non-numeric ``p`` used to crash on ``int(None)`` and return a
    500; it now falls back to page 1.
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    try:
        page = int(request.GET.get('p', 1))
    except (TypeError, ValueError):
        page = 1
    start = (page - 1) * items_per_sitemap
    params = {
        'q': '*:*',
        'rows': items_per_sitemap,
        'start': start,
        'fl': ','.join([
            'absolute_url',
            'dateFiled',
            'local_path',
            'citeCount',
            'timestamp',
        ]),
        'sort': 'dateFiled asc',
        'caller': 'opinion_sitemap_maker',
    }
    search_results_object = conn.raw_query(**params).execute()

    # Translate Solr object into something Django's template can use
    urls = []
    for result in search_results_object:
        url_strs = ['https://www.courtlistener.com%s' %
                    result['absolute_url']]
        if int(result['citeCount']) > 0:
            # Only include this page if there are citations.
            url_strs.append('https://www.courtlistener.com%scited-by/' %
                            result['absolute_url'])
            url_strs.append('https://www.courtlistener.com%sauthorities/' %
                            result['absolute_url'])
        if result.get('local_path') and result.get('local_path') != '':
            url_strs.append('https://www.courtlistener.com/%s' %
                            result['local_path'])

        for url_str in url_strs:
            # Secondary pages (citation lists, source files) get a lower
            # priority than the opinion page itself.
            if any(s in url_str for s in
                   ['authorities', 'cited-by', 'pdf', 'doc', 'wpd']):
                priority = '0.3'
            else:
                priority = '0.5'
            urls.append({
                'location': url_str,
                'changefreq': 'yearly',
                'lastmod': result['timestamp'],
                'priority': priority,
            })

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, mimetype='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
def setUp(self):
    """Create a throw-away Solr core, swap it in, and grab a test court."""
    # Unique core name so parallel/repeated runs never collide.
    core = '%s.test-%s' % (self.__module__, time.time())
    self.core_name = core
    create_solr_core(core)
    swap_solr_core('collection1', core)
    self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

    # A handy court fixture used by the tests.
    self.court = Court.objects.get(pk='test')
def items(self, obj):
    """Return the 20 most recently argued items across all jurisdictions."""
    solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
    query = {
        'q': '*:*',
        'sort': 'dateArgued desc',
        'rows': '20',
        'start': '0',
        'caller': 'AllJurisdictionsPodcast',
    }
    return solr.raw_query(**query).execute()
def add_or_update_audio_file(pk, force_commit=True):
    """Updates the document in the index. Called by Document save function.
    """
    interface = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='w')
    try:
        audio_obj = Audio.objects.get(pk=pk)
        interface.add(SearchAudioFile(audio_obj))
        if force_commit:
            interface.commit()
    except SolrError as exc:
        # Solr hiccup — let Celery retry in 30 seconds.
        add_or_update_audio_file.retry(exc=exc, countdown=30)
def add_or_update_doc(pk, commit=True):
    """Updates the document in the index. Called by Document save function.
    """
    interface = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    try:
        doc_obj = Document.objects.get(pk=pk)
        interface.add(SearchDocument(doc_obj))
        if commit:
            interface.commit()
    except SolrError as exc:
        # Solr hiccup — let Celery retry in 30 seconds.
        add_or_update_doc.retry(exc=exc, countdown=30)
def place_facet_queries(cd, conn=None):
    """Get facet values for the status filters

    Using the search form, query Solr and get the values for the status
    filters.

    ``conn`` may be an existing read-mode SolrInterface; when omitted, a
    fresh connection to the opinion index is created per call.
    """
    # Bug fix: the connection used to be built in the default argument.
    # Default arguments are evaluated once, at import time, so merely
    # importing this module opened a Solr connection, and every caller that
    # omitted `conn` shared that single stale connection.
    if conn is None:
        conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')

    # Build up all the queries needed
    facet_params = {
        'rows': '0',
        'facet': 'true',
        'facet.mincount': 0,
        'facet.field': '{!ex=dt}status_exact',
        'q': cd['q'] or '*:*',
        'caller': 'facet_parameters',
    }
    fq = []

    # Case Name and judges
    if cd['case_name']:
        fq.append(make_fq(cd, 'caseName', 'case_name'))
    if cd['judge']:
        fq.append(make_fq(cd, 'judge', 'judge'))

    # Citations
    if cd['citation']:
        fq.append(make_fq(cd, 'citation', 'citation'))
    if cd['docket_number']:
        fq.append(make_fq(cd, 'docketNumber', 'docket_number'))
    if cd['neutral_cite']:
        fq.append(make_fq(cd, 'neutralCite', 'neutral_cite'))
    fq.append(
        make_date_query('dateFiled', cd['filed_before'], cd['filed_after']))
    fq.append(make_cite_count_query(cd))

    # Faceting
    selected_courts_string = get_selected_field_string(cd, 'court_')
    # Status facets depend on court checkboxes
    selected_stats_string = get_selected_field_string(cd, 'stat_')
    if len(selected_stats_string) > 0:
        fq.extend([
            '{!tag=dt}status_exact:(%s)' % selected_stats_string,
            'court_exact:(%s)' % selected_courts_string
        ])

    # If a param has been added to the fq variables, then we add them to the
    # main_params var. Otherwise, we don't, as doing so throws an error.
    if len(fq) > 0:
        facet_params['fq'] = fq

    stat_facet_fields = conn.raw_query(
        **facet_params).execute().facet_counts.facet_fields
    return stat_facet_fields
def update_cite(citation_id, commit=True):
    """If a citation and a document are both updated simultaneously, we will
    needlessly update the index twice. No easy way around it.
    """
    si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    cite = Citation.objects.get(pk=citation_id)
    # Batch all parent documents into one add() call — a single request
    # instead of one per document, consistent with add_or_update_items.
    search_docs = [SearchDocument(doc) for doc in cite.parent_documents.all()]
    if search_docs:
        si.add(search_docs)
    if commit:
        si.commit()
def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    solr = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    query = {
        'q': '*:*',
        'sort': 'dateFiled desc',
        'rows': '20',
        'start': '0',
        'caller': 'AllJurisdictionsFeed',
    }
    return solr.raw_query(**query).execute()
def match_citation(citation, citing_doc):
    """Try to identify the document referred to by ``citation``.

    Returns a two-tuple ``(results, is_citation_match)``: ``results`` is the
    Solr result set (possibly empty) and the boolean is True when the match
    came from the citation string itself, False when we had to fall back to
    a case-name search.
    """
    # TODO: Create shared solr connection to use across multiple citations/
    # documents
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    main_params = {'fq': []}

    # Set up filter parameters. Start with the widest plausible date window
    # and narrow it using whatever the citation tells us.
    start_year = 1750
    end_year = date.today().year
    if citation.year:
        # The citation carries an explicit year — pin the range to it.
        start_year = end_year = citation.year
    else:
        if citation.lookup_index:
            # Some cases can't be disambiguated.
            # Use the publication window of the reporter edition instead.
            reporter_dates = REPORTERS[citation.canonical_reporter][
                citation.lookup_index]['editions'][citation.reporter]
            if hasattr(reporter_dates['start'], 'year'):
                start_year = reporter_dates['start'].year
            else:
                start_year = 1750
            if hasattr(reporter_dates['end'], 'year'):
                end_year = reporter_dates['end'].year
            else:
                end_year = 2030
        if citing_doc.date_filed:
            # A case can't cite something filed after itself.
            end_year = min(end_year, citing_doc.date_filed.year)
    date_param = 'dateFiled:%s' % build_date_range(start_year, end_year)
    main_params['fq'].append(date_param)

    if citation.court:
        court_param = 'court_exact:%s' % citation.court
        main_params['fq'].append(court_param)

    # Non-precedential documents shouldn't be cited
    main_params['fq'].append('status:Precedential')

    # Take 1: Use citation
    citation_param = 'citation:"%s"' % citation.base_citation()
    main_params['fq'].append(citation_param)
    main_params['caller'] = 'citations'
    results = conn.raw_query(**main_params).execute()
    if len(results) == 1:
        # Unambiguous hit — done.
        return results, True
    if len(results) > 1:
        if citation.defendant:
            # Refine using defendant, if there is one
            results = case_name_query(conn, main_params, citation, citing_doc)
        return results, True

    # Take 2: Use case name. Without a defendant name there is nothing to
    # search on, so give up.
    if not citation.defendant:
        return [], False
    # Remove citation parameter
    main_params['fq'].remove(citation_param)
    return case_name_query(conn, main_params, citation, citing_doc), False
def items(self, obj):
    """ Returns a list of items to publish in this feed. """
    solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
    # Limit to the requested jurisdiction; newest arguments first.
    return solr.raw_query(
        q='*:*',
        fq='court_exact:%s' % obj.pk,
        sort='dateArgued desc',
        rows='20',
        start='0',
        caller='JurisdictionPodcast',
    ).execute()
def do_search(request, rows=20, order_by=None, type=None):
    """Run a search from the GET parameters and gather facet/court context.

    NOTE(review): the visible portion of this function ends inside the
    except handler; `results_si`, `courts`, `court_count_human` and
    `court_count` are built but not used here — presumably consumed by code
    below this chunk. Confirm against the full file.
    """
    # Bind the search form.
    search_form = SearchForm(request.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by:
            cd['order_by'] = order_by
        if type:
            cd['type'] = type
        search_form = _clean_form(request, cd)
        try:
            if cd['type'] == 'o':
                # Opinions get status facets computed alongside the query.
                conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                              mode='r')
                stat_facet_fields = search_utils.place_facet_queries(cd, conn)
                status_facets = search_utils.make_stats_variable(
                    stat_facet_fields, search_form)
            elif cd['type'] == 'oa':
                # Oral arguments have no status facets.
                conn = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                              mode='r')
                status_facets = None
            results_si = conn.raw_query(**search_utils.build_main_query(cd))
            courts = Court.objects.filter(in_use=True).values(
                'pk', 'short_name', 'jurisdiction',
                'has_oral_argument_scraper')
            courts, court_count_human, court_count = search_utils\
                .merge_form_with_courts(courts, search_form)
        except Exception, e:
            # Any failure in the Solr/court lookup path renders the error
            # page rather than a 500.
            logger.warning("Error loading search page with request: %s" %
                           request.GET)
            logger.warning("Error was %s" % e)
            return {'error': True}
def items(self, obj):
    """Execute the user's saved query against the audio index and return
    the 20 most recently argued matches (empty list on an invalid form)."""
    form = SearchForm(obj.GET)
    if not form.is_valid():
        return []

    cd = form.cleaned_data
    solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
    query = search_utils.build_main_query(cd, highlight=False)
    query.update({
        'sort': 'dateArgued desc',
        'rows': '20',
        'start': '0',
        'caller': 'SearchFeed',
    })
    return solr.raw_query(**query).execute()
def index_sitemap_maker(request):
    """Generate a sitemap index page

    Counts the number of cases in the site, divides by `items_per_sitemap`
    and provides links items.
    """
    params = {
        'q': '*:*',
        'rows': '0',  # just need the count
        'start': '0',
        'caller': 'sitemap_index',
    }
    connection_string_obj_type_pairs = (
        (settings.SOLR_OPINION_URL, 'opinions'),
        (settings.SOLR_AUDIO_URL, 'oral-arguments'),
    )
    sites = []
    for connection_string, obj_type in connection_string_obj_type_pairs:
        conn = sunburnt.SolrInterface(connection_string, mode='r')
        search_results_object = conn.raw_query(**params).execute()
        count = search_results_object.result.numFound
        # Ceiling division. Bug fix: the old `count / items_per_sitemap + 1`
        # produced an extra, empty sitemap page whenever count was an exact
        # multiple of items_per_sitemap. Keep at least one page so an empty
        # index still yields a (blank) sitemap, as before.
        num_pages = max(
            (count + items_per_sitemap - 1) // items_per_sitemap, 1)
        for i in range(1, num_pages + 1):
            sites.append(
                'https://www.courtlistener.com/sitemap-%s.xml?p=%s' %
                (obj_type, i)
            )

    # Random additional sitemaps.
    sites.extend([
        'https://www.courtlistener.com/sitemap-donate.xml',
    ])
    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})
    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, mimetype='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
def handle(self, *args, **options):
    """Index Documents selected by a single id, an id range, a date range,
    or everything (--all)."""
    both_list_and_endpoints = (options.get('doc_id') is not None and
                               (options.get('start_id') is not None or
                                options.get('end_id') is not None or
                                options.get('filed_after') is not None))
    # True when the user supplied no selection criterion at all. Bug fix:
    # the old expression was `not any([... is None ...])`, which is
    # inverted — it was only True when *every* option had been provided,
    # so running the command with no arguments slipped past this check.
    no_option = all([options.get('doc_id') is None,
                     options.get('start_id') is None,
                     options.get('end_id') is None,
                     options.get('filed_after') is None,
                     options.get('all') is False])
    if both_list_and_endpoints or no_option:
        raise CommandError('Please specify either a list of documents, a '
                           'range of ids, a range of dates, or '
                           'everything.')

    if options.get('filed_after'):
        start_date = make_aware(datetime.strptime(options['filed_after'],
                                                  '%Y-%m-%d'), utc)

    self.index = options['index'].lower()
    self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

    # Use query chaining to build the query
    query = Document.objects.all()
    if options.get('doc_id'):
        query = query.filter(pk=options.get('doc_id'))
    if options.get('end_id'):
        query = query.filter(pk__lte=options.get('end_id'))
    if options.get('start_id'):
        query = query.filter(pk__gte=options.get('start_id'))
    if options.get('filed_after'):
        query = query.filter(date_filed__gte=start_date)
    if options.get('all'):
        # --all overrides any other filters.
        query = Document.objects.all()
    count = query.count()
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs, count)
def setUp(self):
    """Build throw-away opinion and audio Solr cores and seed fixtures.

    Creates three Documents (with Citations and Dockets) plus scraped audio
    content, then commits both indexes.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Set up testing cores in Solr and swap them in. Unique names (module +
    # timestamp) keep concurrent/repeated test runs from colliding.
    self.core_name_opinion = '%s.opinion-test-%s' % \
        (self.__module__, time.time())
    self.core_name_audio = '%s.audio-test-%s' % \
        (self.__module__, time.time())
    create_solr_core(self.core_name_opinion)
    create_solr_core(
        self.core_name_audio,
        schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                            'audio_schema.xml'),
        instance_dir='/usr/local/solr/example/solr/audio',
    )
    swap_solr_core('collection1', self.core_name_opinion)
    swap_solr_core('audio', self.core_name_audio)
    self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                             mode='rw')
    self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                           mode='rw')

    # Add three documents and three audio files to the index, but don't
    # extract their contents
    self.site_opinion = test_opinion_scraper.Site().parse()
    self.site_audio = test_oral_arg_scraper.Site().parse()
    cite_counts = (4, 6, 8)
    self.docs = {}
    for i in range(0, 3):
        cite = Citation(
            case_name=self.site_opinion.case_names[i],
            docket_number=self.site_opinion.docket_numbers[i],
            neutral_cite=self.site_opinion.neutral_citations[i],
            federal_cite_one=self.site_opinion.west_citations[i],
        )
        # index=False: don't touch Solr yet; documents are indexed when
        # they are saved below.
        cite.save(index=False)
        docket = Docket(
            case_name=self.site_opinion.case_names[i],
            court=self.court,
        )
        docket.save()
        self.docs[i] = Document(
            date_filed=self.site_opinion.case_dates[i],
            citation=cite,
            docket=docket,
            precedential_status=self.site_opinion.precedential_statuses[i],
            citation_count=cite_counts[i],
            nature_of_suit=self.site_opinion.nature_of_suit[i],
            judges=self.site_opinion.judges[i],
        )
        self.docs[i].save()

    # Create citations between the documents
    # 0 ---cites--> 1, 2
    # 1 ---cites--> 2
    # 2 ---cites--> 0
    self.docs[0].cases_cited.add(self.docs[1].citation)
    self.docs[0].cases_cited.add(self.docs[2].citation)
    self.docs[1].cases_cited.add(self.docs[2].citation)
    self.docs[2].cases_cited.add(self.docs[0].citation)
    # Re-save so the citation relationships get picked up.
    for doc in self.docs.itervalues():
        doc.save()

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    Command().scrape_court(site, full_crawl=True)

    # Expected index sizes for assertions in the tests.
    self.expected_num_results_opinion = 3
    self.expected_num_results_audio = 2
    self.si_opinion.commit()
    self.si_audio.commit()
def get_dup_stats(doc):
    """The heart of the duplicate algorithm. Returns stats about the case as
    compared to other cases already in the system. Other methods can call
    this one, and can make decisions based on the stats generated here.

    If no likely duplicates are encountered, stats are returned as zeroes.

    Process:
        1. Refine the possible result set down to just a few candidates.
        2. Determine their likelihood of being duplicates according to a
           number of measures:
            - Similarity of case name
            - Similarity of docket number
            - Comparison of content length
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    DEBUG = True

    ##########################################
    # 1: Refine by date, court and case name #
    ##########################################
    main_params = make_case_name_solr_query(
        doc.citation.case_name,
        doc.docket.court_id,
        doc.date_filed,
        DEBUG=DEBUG,
    )
    main_params['caller'] = 'corpus_importer'
    if DEBUG:
        print " - main_params are: %s" % main_params
    candidates = conn.raw_query(**main_params).execute()
    if not len(candidates) and doc.citation.docket_number is not None:
        # Try by docket number rather than case name
        clean_docket_number_words = []
        for word in doc.citation.docket_number.split():
            if not re.search('\d', word):
                # Must have numbers.
                continue
            word = word.strip(string.punctuation)
            regex = re.compile('[%s]' % re.escape(string.punctuation))
            if regex.search(re.sub('-', '', word)):
                # Can only have hyphens after stripping
                continue
            clean_docket_number_words.append(word)
        docket_q = ' OR '.join(clean_docket_number_words)
        if docket_q:
            main_params = {
                'fq': [
                    'court_exact:%s' % doc.docket.court_id,
                    'dateFiled:%s' % build_date_range(doc.date_filed,
                                                      range=15),
                    'docketNumber:(%s)' % docket_q
                ],
                'rows': 100,
                'caller': 'corpus_importer',
            }
            if DEBUG:
                print " - main_params are: %s" % main_params
            candidates = conn.raw_query(**main_params).execute()
    if not len(candidates) and doc.docket.court_id == 'scotus':
        if doc.citation.federal_cite_one:
            # Scotus case, try by citation.
            main_params = {
                'fq': [
                    'court_exact:%s' % doc.docket.court_id,
                    'dateFiled:%s' % build_date_range(
                        doc.date_filed, range=90),  # Creates ~6 month span.
                    'citation:(%s)' % ' '.join([
                        re.sub(r"\D", '', w) for w in
                        doc.citation.federal_cite_one.split()
                    ])
                ],
                'rows': 100,
                'caller': 'corpus_importer',
            }
            if DEBUG:
                print " - main_params are: %s" % main_params
            candidates = conn.raw_query(**main_params).execute()
    stats = {'candidate_count': len(candidates)}
    if not len(candidates):
        # Nothing plausible found; bail out early with just the count.
        return stats, candidates

    #########################################
    # 2: Attempt filtering by docket number #
    #########################################
    # Two-step process. First we see if we have any exact hits.
    # Second, if there were exact hits, we forward those onwards. If not, we
    # forward everything.
    remaining_candidates = []
    if doc.citation.docket_number:
        new_docket_number = re.sub("(\D|0)", "", doc.citation.docket_number)
        for candidate in candidates:
            if candidate.get('docketNumber'):
                # Get rid of anything in the docket numbers that's not a
                # digit. Get rid of zeroes too.
                result_docket_number = re.sub("(\D|0)", "",
                                              candidate['docketNumber'])
                if new_docket_number == result_docket_number:
                    remaining_candidates.append(candidate)
    if len(remaining_candidates) > 0:
        # We had one or more exact hits! Use those.
        candidates = remaining_candidates
    else:
        # We just let candidates from step one get passed through by doing
        # nothing.
        pass
    stats = {'candidate_count': len(candidates)}

    ##############################
    # 3: Find the best case name #
    ##############################
    confidences = find_confidences(candidates, doc.citation.case_name)
    stats['case_name_similarities'] = confidences

    #####################################################################
    # 4: Check content length, gestalt difference and cosine similarity #
    #####################################################################
    percent_diffs, gestalt_diffs, cos_sims = [], [], []
    new_stripped_content = re.sub('\W', '', doc.body_text).lower()
    for candidate in candidates:
        candidate_stripped_content = re.sub('\W', '',
                                            candidate['text']).lower()
        # Calculate the difference in text length and their gestalt
        # difference
        # NOTE(review): abs() of a subtraction cannot raise
        # ZeroDivisionError — this first handler looks vestigial; confirm.
        try:
            length_diff = abs(
                len(candidate_stripped_content) - len(new_stripped_content))
        except ZeroDivisionError:
            length_diff = 0
        try:
            percent_diff = float(length_diff) / len(new_stripped_content)
        except ZeroDivisionError:
            percent_diff = 0
        cos_sim = get_cosine_similarity(doc.body_text, candidate['text'])
        percent_diffs.append(percent_diff)
        gestalt_diffs.append(
            gen_diff_ratio(candidate_stripped_content, new_stripped_content))
        cos_sims.append(cos_sim)
    stats['length_diffs'] = percent_diffs
    stats['gestalt_diffs'] = gestalt_diffs
    stats['cos_sims'] = cos_sims
    return stats, candidates
def handle(self, *args, **options):
    """Entry point for the index management command.

    Dispatches to one of four mutually exclusive modes — update, delete,
    commit, optimize — over either opinions (Document) or audio (Audio),
    against the Solr core given by --solr-url.
    """
    self.verbosity = int(options.get('verbosity', 1))
    if options.get('solr_url'):
        self.solr_url = options.get('solr_url')
        self.si = sunburnt.SolrInterface(options.get('solr_url'), mode='rw')
    else:
        self.stderr.write("solr-url is a required parameter.\n")
        exit(1)

    t = options.get('type')
    # NOTE(review): 'opinions' is matched case-insensitively but 'audio'
    # is compared exactly — confirm whether e.g. 'Audio' should also be
    # accepted.
    if t is not None and t.lower() == 'opinions':
        self.type = Document
    elif t is not None and t == 'audio':
        self.type = Audio
    else:
        self.stderr.write('Unable to parse --type argument. See help for '
                          'details.')
        exit(1)

    if options.get('datetime'):
        try:
            # Parse the date string into a datetime object
            # NOTE(review): the gate checks options['datetime'] but the
            # value parsed is args[0] — verify the option and the
            # positional argument always agree here.
            dt = make_aware(datetime.datetime(
                *time.strptime(args[0], '%Y-%m-%d %H:%M:%S')[0:6]), utc)
        except ValueError:
            try:
                # Fall back to a date without a time component.
                dt = make_aware(datetime.datetime(
                    *time.strptime(args[0], '%Y-%m-%d')[0:5]), utc)
            except ValueError:
                self.stderr.write('Unable to parse time. Please use '
                                  'format: YYYY-MM-DD HH:MM:SS or '
                                  'YYYY-MM-DD.\n')
                sys.exit(1)

    if options.get('update_mode'):
        if self.verbosity >= 1:
            self.stdout.write('Running in update mode...\n')
        if options.get('everything'):
            self.add_or_update_all()
        elif options.get('datetime'):
            self.add_or_update_by_datetime(dt)
        elif options.get('query'):
            self.stderr.write("Updating by query not yet implemented.")
            sys.exit(1)
        elif options.get('item'):
            # Validate every positional argument is an integer id before
            # doing any work.
            for item in args:
                try:
                    int(item)
                except ValueError:
                    self.stderr.write('Error: Item "%s" could not be '
                                      'converted to an ID.\n' % item)
                    sys.exit(1)
            self.add_or_update(*args)
        else:
            self.stderr.write('Error: You must specify what you wish to '
                              'update.\n')
            sys.exit(1)
    elif options.get('delete_mode'):
        if self.verbosity >= 1:
            self.stdout.write('Running in deletion mode...\n')
        if options.get('everything'):
            self.delete_all()
        elif options.get('datetime'):
            self.delete_by_datetime(dt)
        elif options.get('query'):
            self.delete_by_query(options.get('query'))
        elif options.get('item'):
            # Same id validation as in update mode.
            for item in args:
                try:
                    int(item)
                except ValueError:
                    self.stderr.write('Error: Item "%s" could not be '
                                      'converted to an ID.\n'
                                      % item)
                    sys.exit(1)
            self.delete(*args)
        else:
            self.stderr.write('Error: You must specify what you wish to '
                              'delete.\n')
            sys.exit(1)
    elif options.get('do_commit'):
        self.commit()
    elif options.get('optimize_mode'):
        self.optimize()
    else:
        self.stderr.write('Error: You must specify whether you wish to '
                          'update, delete, commit, or optimize your '
                          'index.\n')
        sys.exit(1)
def delete_item(pk, solr_url):
    """Deletes the item from the index.
    """
    interface = sunburnt.SolrInterface(solr_url, mode='w')
    interface.delete(pk)
    interface.commit()
# -*- coding: utf-8 -*- import os import sys execfile('/etc/courtlistener') sys.path.append(INSTALL_ROOT) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") from django.conf import settings from alert.lib.mojibake import fix_mojibake from alert.lib import sunburnt from alert.search.models import Document from optparse import OptionParser conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r') def cleaner(simulate=False, verbose=True): """Fix cases that have mojibake as a result of pdffactory 3.51.""" # Find all the cases using Solr results_si = conn.raw_query(**{ 'q': u'ÚÑÎ', 'caller': 'mojibake', }) for result in results_si: # For each document doc = Document.objects.get(pk=result['id']) if verbose: print "https://www.courtlistener.com" + doc.get_absolute_url()
def delete_items(items):
    """Remove the given ids from the opinion index and commit."""
    interface = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    # sunburnt's delete wants a concrete list, not an arbitrary iterable.
    interface.delete(list(items))
    interface.commit()