def search(request, url=None, uuid=None):
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, pattern_match) = check_url_blacklist(url)
        if url_is_blacklisted:
            return search_page(
                request,
                error="{} is one of our press release sources so checking "
                      "pages from that site would yield false results.".format(pattern_match.string))

    if request.method == 'GET':
        url = url or request.GET.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        uuid = uuid or request.GET.get('uuid')
        if uuid not in ('', None):
            return search_against_uuid(request, uuid)

    elif request.method == 'POST':
        url = request.POST.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        text = request.POST.get('text')
        if text not in ('', None):
            return search_against_text(request, text)

    raise Http404()
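# check_url_blacklist() and UrlBlacklist are referenced throughout this
# listing but defined elsewhere. A minimal sketch follows, assuming the
# blacklist is a list of regexes compiled from
# settings.APIPROXY['blacklisted_news_urls'] (the setting the management
# command below tells you to configure); the real implementation may differ.
import re

from django.conf import settings

UrlBlacklist = [re.compile(pattern)
                for pattern in settings.APIPROXY.get('blacklisted_news_urls', [])]


def check_url_blacklist(url):
    """Return (True, match) for the first blacklisted pattern matching url,
    or (False, None). match.string is the URL itself, which the views use
    when building their error messages."""
    for pattern in UrlBlacklist:
        match = pattern.search(url)
        if match:
            return (True, match)
    return (False, None)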
def search(request, doctype=None):
    """
    Proxies /search/ to Superfastmatch, returning the response with a few
    embellishments. If text is given, it is stored in the database and
    assigned a UUID for later retrieval. This same view also handles search
    via UUID recall.
    """

    # This avoids useless matches when someone searches against a URL
    # that is from one of our corpus sources.
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, _) = check_url_blacklist(url)
        if url_is_blacklisted:
            return EmptySearchResult()

    if request.method == 'GET':
        uuid = request.GET.get('uuid')
        url = request.GET.get('url')
        if uuid:
            return uuid_search(request, uuid, doctype)
        elif url:
            return url_search(request, doctype)
        else:
            return HttpResponseBadRequest('GET request with no URL or UUID')

    if request.method != 'POST':
        return HttpResponseBadRequest('Only GET and POST search requests allowed.')

    text = request.POST.get('text')
    url = request.POST.get('url')
    uuid = request.POST.get('uuid')
    title = request.POST.get('title')
    doc = None

    if not text and not url and not uuid:
        return HttpResponseBadRequest()
    elif url or uuid or text:
        try:
            doc = recall_document(title, url, uuid, text)
            if not title:
                title = doc.title
            if not url:
                url = doc.url
            text = doc.text
        except UnicodeDecodeError:
            raise
        except SearchDocument.DoesNotExist:
            pass
        except Exception as e:
            return HttpResponseServerError(str(e))
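# recall_document() and SearchDocument are also defined elsewhere. The view
# above only relies on recall_document(title, url, uuid, text) returning a
# previously stored SearchDocument or raising SearchDocument.DoesNotExist.
# The lookup order below (UUID, then URL, then exact title/text) is an
# assumption for illustration, not the project's actual logic.
def recall_document(title, url, uuid, text):
    if uuid:
        return SearchDocument.objects.get(uuid=uuid)
    if url:
        return SearchDocument.objects.get(url=url)
    return SearchDocument.objects.get(title=title, text=text)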
def handle(self, *args, **options):
    if len(UrlBlacklist) == 0:
        raise CommandError("No sites are blacklisted.\n"
                           "Double-check that you've configured settings.APIPROXY['blacklisted_news_urls']")

    docs_with_url = SearchDocument.objects.filter(~Q(url=''), url__isnull=False)
    cnt = docs_with_url.count()
    chunk_size = 1000
    offset = 0
    progress = progressbar.ProgressBar(
        maxval=cnt,
        widgets=[
            progressbar.widgets.AnimatedMarker(),
            ' ',
            progressbar.widgets.Counter(),
            '/{0} '.format(cnt),
            progressbar.widgets.Percentage(),
            ' ',
            progressbar.widgets.ETA(),
        ])
    progress.start()

    ordered_docs = docs_with_url.order_by('uuid')
    wrtr = csv.writer(sys.stdout)
    wrtr.writerow(['MatchCount', 'UUID', 'URL'])
    while True:
        chunk = ordered_docs[offset:offset + chunk_size]
        processed = 0
        for doc in chunk:
            (url_is_blacklisted, match) = check_url_blacklist(doc.url)
            if url_is_blacklisted:
                wrtr.writerow([doc.match_set.count(), doc.uuid, doc.url])
            processed += 1
            progress.update(offset + processed)
        if processed == 0:
            break
        offset += chunk_size
    progress.finish()
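# Usage sketch for the management command above. Its module name is not
# shown in this listing, so "find_blacklisted_documents" is a placeholder:
#
#   python manage.py find_blacklisted_documents > blacklisted_docs.csv
#
# The CSV (MatchCount, UUID, URL) goes to stdout, one row per stored
# document whose URL matches a blacklisted pattern, while the progress bar
# is drawn separately (progressbar defaults to stderr), so redirecting
# stdout should leave the CSV clean.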