Example #1
def search(request, url=None, uuid=None):
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, pattern_match) = check_url_blacklist(url)
        if url_is_blacklisted:
            return search_page(request, error="{} is one of our press release sources so checking pages from that site would yield false results.".format(pattern_match.string))

    if request.method == 'GET':
        url = url or request.GET.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        uuid = uuid or request.GET.get('uuid')
        if uuid not in ('', None):
            return search_against_uuid(request, uuid)

    elif request.method == 'POST':
        url = request.POST.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        text = request.POST.get('text')
        if text not in ('', None):
            return search_against_text(request, text)

    raise Http404()
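Each of these examples calls a project-specific helper, check_url_blacklist, and unpacks a (url_is_blacklisted, pattern_match) pair from it; pattern_match behaves like a regex match against the submitted URL, since its .string attribute is what gets formatted into the error message. The helper itself is not shown anywhere on this page, so the following is only a minimal sketch of how it could work, assuming UrlBlacklist is a list of regexes compiled from settings.APIPROXY['blacklisted_news_urls'] (the names used by the management command at the end of this page); everything else here is hypothetical.

import re

from django.conf import settings

# Hypothetical module-level blacklist: one compiled regex per configured pattern.
UrlBlacklist = [re.compile(pattern)
                for pattern in settings.APIPROXY.get('blacklisted_news_urls', [])]


def check_url_blacklist(url):
    # Returns (True, match) when the URL matches a blacklisted press release
    # source. The match object's .string is the URL that was checked, which is
    # what the views above interpolate into their error messages.
    for pattern in UrlBlacklist:
        match = pattern.search(url)
        if match:
            return True, match
    return False, None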
Example #2
def search(request, url=None, uuid=None):
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, pattern_match) = check_url_blacklist(url)
        if url_is_blacklisted:
            return search_page(
                request,
                error="{} is one of our press release sources so checking pages "
                      "from that site would yield false results."
                      .format(pattern_match.string))

    if request.method == 'GET':
        url = url or request.GET.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        uuid = uuid or request.GET.get('uuid')
        if uuid not in ('', None):
            return search_against_uuid(request, uuid)

    elif request.method == 'POST':
        url = request.POST.get('url')
        if url not in ('', None):
            return search_against_url(request, url)

        text = request.POST.get('text')
        if text not in ('', None):
            return search_against_text(request, text)

    raise Http404()
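Example #2 is the same view as Example #1 with different line wrapping. To make the dispatch it implements concrete, here is a hedged usage sketch using Django's test client; the '/search/' route and the parameter values are assumptions, and the views it dispatches to (search_against_url, search_against_uuid, search_against_text) are project-specific.

from django.test import Client

client = Client()

# GET with a url parameter is handled by search_against_url(request, url).
client.get('/search/', {'url': 'http://example.com/some-article'})

# GET with a uuid parameter is handled by search_against_uuid(request, uuid).
client.get('/search/', {'uuid': 'a3f1c2d4-0000-0000-0000-000000000000'})

# POST with pasted text is handled by search_against_text(request, text).
client.post('/search/', {'text': 'some pasted article text'})

# Any other combination falls through to raise Http404().
response = client.get('/search/')
assert response.status_code == 404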
Example #3
def search(request, doctype=None):
    """
    Proxies /search/ to Superfastmatch, returning the response with 
    a few embellishments. If the text is given, it is stored in the
    database for later retrieval. It is assigned a UUID for later
    retrieval. This same view handles search via UUID recall.
    """

    # This avoids useless matches when someone searches against a URL
    # that is from one of our corpus sources.
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, _) = check_url_blacklist(url)
        if url_is_blacklisted:
            return EmptySearchResult()

    if request.method == 'GET':
        uuid = request.GET.get('uuid')
        url = request.GET.get('url')
        if uuid:
            return uuid_search(request, uuid, doctype)
        elif url:
            return url_search(request, doctype)
        else:
            return HttpResponseBadRequest('GET request with no URL or UUID')

    if request.method != 'POST':
        return HttpResponseBadRequest(
            'Only GET and POST search requests allowed.')

    text = request.POST.get('text')
    url = request.POST.get('url')
    uuid = request.POST.get('uuid')
    title = request.POST.get('title')
    doc = None

    if not text and not url and not uuid:
        return HttpResponseBadRequest()

    elif url or uuid or text:
        try:
            doc = recall_document(title, url, uuid, text)

            if not title:
                title = doc.title
            if not url:
                url = doc.url
            text = doc.text
        except UnicodeDecodeError:
            raise

        except SearchDocument.DoesNotExist:
            pass

        except Exception as e:
            return HttpResponseServerError(str(e))
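Examples #3 and #4 lean on recall_document(title, url, uuid, text), which looks a previously stored SearchDocument back up and raises SearchDocument.DoesNotExist when nothing matches. The helper itself is not shown; the sketch below is a guess at its shape based only on how the view uses it (the import path and the lookup order are assumptions).

from apiproxy.models import SearchDocument  # hypothetical import path


def recall_document(title, url, uuid, text):
    # Guessed lookup order: prefer the explicit uuid, then the url, then an
    # exact text match. The title argument is accepted for symmetry with the
    # caller but is not needed for the lookup here. Raises
    # SearchDocument.DoesNotExist when nothing is found, which the view above
    # deliberately swallows.
    if uuid:
        return SearchDocument.objects.get(uuid=uuid)
    if url:
        return SearchDocument.objects.get(url=url)
    return SearchDocument.objects.get(text=text)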
Example #4
def search(request, doctype=None):
    """
    Proxies /search/ to Superfastmatch, returning the response with 
    a few embellishments. If the text is given, it is stored in the
    database for later retrieval. It is assigned a UUID for later
    retrieval. This same view handles search via UUID recall.
    """

    # This avoids useless matches when someone searches against a URL
    # that is from one of our corpus sources.
    url = request.GET.get('url') or request.POST.get('url')
    if url:
        (url_is_blacklisted, _) = check_url_blacklist(url)
        if url_is_blacklisted:
            return EmptySearchResult()

    if request.method == 'GET':
        uuid = request.GET.get('uuid')
        url = request.GET.get('url')
        if uuid:
            return uuid_search(request, uuid, doctype)
        elif url:
            return url_search(request, doctype)
        else:
            return HttpResponseBadRequest('GET request with no URL or UUID')

    if request.method != 'POST':
        return HttpResponseBadRequest('Only GET and POST search requests allowed.')

    text = request.POST.get('text')
    url = request.POST.get('url')
    uuid = request.POST.get('uuid')
    title = request.POST.get('title')
    doc = None

    if not text and not url and not uuid:
        return HttpResponseBadRequest()

    elif url or uuid or text:
        try:
            doc = recall_document(title, url, uuid, text)

            if not title:
                title = doc.title
            if not url:
                url = doc.url
            text = doc.text
        except UnicodeDecodeError:
            raise

        except SearchDocument.DoesNotExist:
            pass

        except Exception as e:
            return HttpResponseServerError(str(e))
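The fields these views and the management command below touch (uuid, url, title, text, plus a match_set reverse relation whose rows get counted) imply a SearchDocument model roughly like the following. This is a reconstruction for reading convenience, rendered in current Django; the field options and the Match model itself are assumptions.

import uuid as uuid_lib

from django.db import models


class SearchDocument(models.Model):
    uuid = models.UUIDField(default=uuid_lib.uuid4, unique=True)
    url = models.URLField(blank=True, null=True)  # may be '' or NULL, as filtered in the command below
    title = models.TextField(blank=True)
    text = models.TextField()


class Match(models.Model):
    # doc.match_set.count() in the management command counts these rows.
    document = models.ForeignKey(SearchDocument, on_delete=models.CASCADE)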
Example #5
    def handle(self, *args, **options):
        if len(UrlBlacklist) == 0:
            raise CommandError(
                "No sites are blacklisted.\nDouble-check that you've configured settings.APIPROXY['blacklisted_news_urls']"
            )

        docs_with_url = SearchDocument.objects.filter(~Q(url=''),
                                                      url__isnull=False)
        cnt = docs_with_url.count()
        chunk_size = 1000
        offset = 0

        progress = progressbar.ProgressBar(
            maxval=cnt,
            widgets=[
                progressbar.widgets.AnimatedMarker(),
                '  ',
                progressbar.widgets.Counter(),
                '/{0}  '.format(cnt),
                progressbar.widgets.Percentage(),
                '  ',
                progressbar.widgets.ETA(),
            ])
        progress.start()

        ordered_docs = docs_with_url.order_by('uuid')

        wrtr = csv.writer(sys.stdout)
        wrtr.writerow(['MatchCount', 'UUID', 'URL'])
        while True:
            chunk = ordered_docs[offset:offset + chunk_size]
            processed = 0
            for doc in chunk:
                (url_is_blacklisted, match) = check_url_blacklist(doc.url)
                if url_is_blacklisted:
                    wrtr.writerow([doc.match_set.count(), doc.uuid, doc.url])
                processed += 1
                progress.update(offset + processed)
            if processed == 0:
                break
            offset += chunk_size

        progress.finish()
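The command refuses to run when the blacklist is empty and points at settings.APIPROXY['blacklisted_news_urls']. A plausible settings.py fragment is shown below; the pattern values are purely illustrative, since the real list of press release sources does not appear anywhere in these examples.

# settings.py (sketch)
APIPROXY = {
    'blacklisted_news_urls': [
        r'^https?://(www\.)?prnewswire\.com/',
        r'^https?://(www\.)?businesswire\.com/',
    ],
}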