def handle(self, *args, **options): sfm = from_django_conf('default') response = sfm.queue() if response['success'] == True: queued = [r for r in response['rows'] if r['status'] == 'Queued'] active = [r for r in response['rows'] if r['status'] == 'Active'] print 'Length: {0}{1}'.format(len(queued) + len(active), '' if response.get('cursors', {}).get('next') == '' else '+') for (action, cnt) in freq([r['action'] for r in queued]).iteritems(): print ' {action!s:<20} {cnt!s:>10}'.format(action=action, cnt=cnt) if len(active) == 0: print 'No active tasks' else: print 'Active task(s):' for r in active: fmtstr = ' Task #{id}: {action} (priority {priority})' if r['action'] == 'Add Association': fmtstr += ' for document ({doctype}, {docid})' elif r['action'] == 'Add Associations': fmtstr += ' from {source} => {target}' print fmtstr.format(**r)
def handle(self, *args, **options): sfm = from_django_conf('default') response = sfm.queue() if response['success'] == True: queued = [r for r in response['rows'] if r['status'] == 'Queued'] active = [r for r in response['rows'] if r['status'] == 'Active'] print 'Length: {0}{1}'.format( len(queued) + len(active), '' if response.get('cursors', {}).get('next') == '' else '+') for (action, cnt) in freq([r['action'] for r in queued]).iteritems(): print ' {action!s:<20} {cnt!s:>10}'.format(action=action, cnt=cnt) if len(active) == 0: print 'No active tasks' else: print 'Active task(s):' for r in active: fmtstr = ' Task #{id}: {action} (priority {priority})' if r['action'] == 'Add Association': fmtstr += ' for document ({doctype}, {docid})' elif r['action'] == 'Add Associations': fmtstr += ' from {source} => {target}' print fmtstr.format(**r)
def execute_search(doc, doctype=None):
    """Run *doc* through Superfastmatch and post-process the result set.

    Returns the filtered, embellished response dict, or an HttpResponse
    carrying the raw SFM error page when the server reports a failure.
    """
    client = from_django_conf()
    results = client.search(doc.text, doctype)
    if isinstance(results, str):
        # Pass the SFM error back to the client
        return HttpResponse(results, content_type='text/html')
    cfg = settings.APIPROXY
    drop_common_fragments(cfg.get('commonality_threshold', 0.4), results)
    ignore_proper_nouns(cfg.get('proper_noun_threshold', 0.8), doc.text, results)
    ignore_repetitious_characters(cfg.get('minimum_unique_characters', 3),
                                  doc.text, results)
    if doc.url:
        # Drop the submitted document itself from its own match list,
        # mutating the row list in place.
        rows = results['documents']['rows']
        rows[:] = [row for row in rows if row.get('url') != doc.url]
    embellish(doc.text, results, **cfg.get('embellishments', {}))
    return results
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.

    Does not implement the DELETE method so as to avoid access control
    issues. POST/PUT add a document (202); GET fetches one (200).
    """
    sfm = from_django_conf()
    if request.method == 'POST' or request.method == 'PUT':
        # Django does not parse PUT bodies, so build the QueryDict by hand.
        params = QueryDict(
            request.raw_post_data) if request.method == 'PUT' else request.POST
        defer = (request.method == 'PUT')
        text = params['text']
        params = dict([(k, v) for (k, v) in params.items()
                       if k not in ['put', 'text']])
        response = sfm.add(doctype, docid, text=text, defer=defer, **params)
        http_status = 202
    elif request.method == 'GET':
        response = sfm.document(doctype, docid)
        http_status = 200
    else:
        # BUG FIX: the old message omitted PUT even though PUT is handled above.
        return HttpResponseBadRequest(
            'Only the GET, POST, and PUT methods are supported.')
    if isinstance(response, str):
        # SFM returned an error page; pass it through untouched.
        return HttpResponse(response, content_type='text/html')
    else:
        return HttpResponse(json.dumps(response),
                            status=http_status,
                            content_type='application/json')
def execute_search(doc, doctype=None):
    """Search Superfastmatch for *doc* and clean up the response.

    On an SFM error the raw error page is wrapped in an HttpResponse;
    otherwise the response dict is filtered and embellished, then returned.
    """
    sfm = from_django_conf()
    response = sfm.search(doc.text, doctype)
    if isinstance(response, str):
        # Pass the SFM error back to the client
        return HttpResponse(response, content_type='text/html')
    drop_common_fragments(settings.APIPROXY.get('commonality_threshold', 0.4),
                          response)
    ignore_proper_nouns(settings.APIPROXY.get('proper_noun_threshold', 0.8),
                        doc.text, response)
    ignore_repetitious_characters(
        settings.APIPROXY.get('minimum_unique_characters', 3),
        doc.text, response)
    if doc.url:
        # Filter out the submitted document itself, keeping the same list object.
        kept = []
        for row in response['documents']['rows']:
            if row.get('url') != doc.url:
                kept.append(row)
        response['documents']['rows'][:] = kept
    embellish(doc.text, response,
              **settings.APIPROXY.get('embellishments', {}))
    return response
def handle(self, *args, **options):
    """Reconcile SFM documents of a doctype against Release rows.

    Warns on doctype mismatches, and deletes (unless --dry-run) any SFM
    document whose corresponding Release no longer exists.
    """
    self.errors = set()
    sfm = from_django_conf()
    docs = DocumentIterator(sfm, order_by='docid', doctype=options['doctype'],
                            chunksize=1000, fetch_text=False)
    try:
        for doc in docs:
            try:
                release = Release.objects.get(id=doc['docid'])
                doctype = release.source.doc_type or settings.DEFAULT_DOCTYPE
                if doctype != doc['doctype']:
                    logging.warning("Doctype mismatch for document ({0[doctype]},{0[docid]}) and release #{1.id} (source: {1.source}, doctype: {2}).".format(doc, release, doctype))
            except Release.DoesNotExist:
                # Idiom fix: `not x` instead of comparing `== False`.
                if not options['dry_run']:
                    sfm.delete(doc['doctype'], doc['docid'])
                    logging.warning("Deleting document ({0[doctype]},{0[docid]}) because there is no corresponding press release.".format(doc))
                else:
                    logging.warning("Document ({0[doctype]},{0[docid]}) does not have a corresponding press release.".format(doc))
    except ValueError:
        # Raised by the iterator/parsing; report the last document seen.
        logging.error("Failed on document {0},{1}".format(doc['doctype'], doc['docid']))
def handle(self, *args, **kwargs):
    """Scrape all stale type-2 sources, resolving prior failures on success."""
    if not hasattr(settings, 'SUPERFASTMATCH'):
        raise CommandError('You must configure SUPERFASTMATCH in your project settings.')
    if not hasattr(settings, 'DEFAULT_DOCTYPE'):
        raise CommandError('You must specify a DEFAULT_DOCTYPE in your project settings.')
    self.sfm = from_django_conf()
    for source in Source.objects.filter(source_type=2):
        try:
            if source.is_stale():
                self.scrape_releases(source)
                source.last_retrieved = now()
                source.save()
                # A successful scrape resolves any outstanding failures.
                unresolved = SourceScrapeFailure.objects.filter(
                    resolved__isnull=True, source=source)
                for old_failure in unresolved:
                    old_failure.resolved = now()
                    old_failure.save()
        except SourceScrapeFailure as failure:
            # The scraper signals an expected failure by raising the model
            # instance itself; persist it.
            failure.save()
        except Exception as e:
            failure = SourceScrapeFailure.objects.create(source=source,
                                                         description=unicode(e))
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.
    """
    sfm = from_django_conf()
    if request.method == 'POST':
        params = request.POST
        text = params['text']
        # Defer indexing unless the client explicitly opted in with put != 'False'.
        defer = ('put' not in params) or (params['put'] == 'False')
        extra = {}
        for (key, value) in params.items():
            if key not in ['put', 'text']:
                extra[key] = value
        response = sfm.add(doctype, docid, text=text, defer=defer, **extra)
        http_status = 202
    else:
        response = sfm.document(doctype, docid)
        http_status = 200
    if isinstance(response, str):
        # Error page straight from SFM.
        return HttpResponse(response, content_type='text/html')
    return HttpResponse(json.dumps(response), status=http_status,
                        content_type='application/json')
def document(request, doctype, docid):
    """
    Proxies requests for specific documents to Superfastmatch.

    Does not implement the DELETE method so as to avoid access control
    issues. POST/PUT add a document (202); GET fetches one (200).
    """
    sfm = from_django_conf()
    if request.method == 'POST' or request.method == 'PUT':
        # Django does not parse PUT bodies, so build the QueryDict by hand.
        params = QueryDict(request.raw_post_data) if request.method == 'PUT' else request.POST
        defer = (request.method == 'PUT')
        text = params['text']
        params = dict([(k, v) for (k, v) in params.items()
                       if k not in ['put', 'text']])
        response = sfm.add(doctype, docid, text=text, defer=defer, **params)
        http_status = 202
    elif request.method == 'GET':
        response = sfm.document(doctype, docid)
        http_status = 200
    else:
        # BUG FIX: the previous message omitted PUT, which is supported above.
        return HttpResponseBadRequest('Only the GET, POST, and PUT methods are supported.')
    if isinstance(response, str):
        # SFM returned an error page; pass it through untouched.
        return HttpResponse(response, content_type='text/html')
    else:
        return HttpResponse(json.dumps(response), status=http_status,
                            content_type='application/json')
def handle(self, *args, **options): for (key, cfg) in settings.SUPERFASTMATCH.iteritems(): sfm = from_django_conf(key) try: if isinstance(sfm, superfastmatch.federated.FederatedClient): print "{0} (federated)".format(key) for (doctypes, client) in sfm.clients().iteritems(): print " doctypes: {0}".format( doctypes.replace(':', ', ')) print " url: {0}".format(client.url) documents = client.documents(doctype=doctypes, order_by='docid', limit=1) print " documents: {0}".format(documents['total']) else: print "{0}".format(key) print ' url: {0}'.format(sfm.url) documents = sfm.documents() if documents['success'] == True: print ' documents: {0}'.format(documents['total']) else: print ' Unable to query for documents.' except (superfastmatch.SuperFastMatchError, socket.error) as e: print ' Unable to query for documents: {0}'.format(str(e))
def handle(self, server, inpath, *args, **options):
    """Restore an SFM backup file *inpath* into the named *server*."""
    if not os.path.exists(inpath):
        raise CommandError("No such file: {0}".format(inpath))
    restore(from_django_conf(server), inpath,
            doctype_mappingstr=options.get('doctypes'),
            dryrun=options.get('dryrun'))
def recall(request, uuid, doctype, docid):
    # Look up a stored search by its UUID on the 'sidebyside' server,
    # translating a missing UUID into an HTTP 404.
    # NOTE(review): sfm_results is never used and the function falls off the
    # end (returns None) on success — looks incomplete; confirm intent.
    # NOTE(review): doctype/docid are accepted but unused here.
    sfm = from_django_conf('sidebyside')
    try:
        sfm_results = sfm.search(text=None, uuid=uuid)
    except superfastmatch.SuperFastMatchError, e:
        if e.status == httplib.NOT_FOUND:
            raise Http404('Article {uuid} not found'.format(uuid=uuid))
        else:
            # Any other SFM error propagates to the caller.
            raise
def handle(self, server, outpath, doctype_rangestr=None, *args, **options):
    """Back up documents from *server* into *outpath*, refusing to overwrite."""
    client = from_django_conf(server)
    if os.path.exists(outpath):
        raise CommandError("I have nothing against {0}, why would I overwrite it?".format(outpath))
    if doctype_rangestr is not None:
        # Validate the range string up front, before a long backup run starts.
        parse_doctype_range(doctype_rangestr)
    backup(client, outpath, doctype_rangestr)
def handle(self, server, outpath, doctype_rangestr=None, *args, **options):
    """Dump documents from *server* into *outpath* (which must not already exist)."""
    sfm = from_django_conf(server)
    if os.path.exists(outpath):
        raise CommandError(
            "I have nothing against {0}, why would I overwrite it?".format(outpath))
    if doctype_rangestr is not None:
        # Fail fast on a malformed doctype range before any work is done.
        parse_doctype_range(doctype_rangestr)
    backup(sfm, outpath, doctype_rangestr)
def attach_document_text(results, maxdocs=None):
    """Fetch and attach full text for result rows, smallest documents first.

    When *maxdocs* is given, rows are sorted ascending by character count and
    only the first *maxdocs* rows get their 'text' field populated.
    """
    sfm = from_django_conf('sidebyside')
    if maxdocs:
        results['documents']['rows'].sort(key=itemgetter('characters'))
    for (idx, row) in enumerate(results['documents']['rows']):
        if maxdocs and idx >= maxdocs:
            return
        doc_result = sfm.document(row['doctype'], row['docid'])
        # Idiom fix: truthiness instead of `== True`.
        if doc_result['success']:
            row['text'] = doc_result['text']
def association(request, doctype=None):
    """
    Proxies requests for lists of associations to Superfastmatch.
    """
    client = from_django_conf()
    cursor = request.GET.get('cursor')
    result = client.associations(doctype, cursor)
    if isinstance(result, str):
        # SFM error page; forward it as-is.
        return HttpResponse(result, content_type='text/html')
    return HttpResponse(json.dumps(result), content_type='application/json')
def document_list(request, doctype=None):
    """
    Proxies requests for lists of documents to Superfastmatch.
    """
    client = from_django_conf()
    cursor = request.GET.get('cursor')
    ordering = request.GET.get('order_by', 'docid')
    page_size = request.GET.get('limit', '100')
    listing = client.documents(doctype, page=cursor, order_by=ordering,
                               limit=page_size)
    if isinstance(listing, str):
        # SFM error page; forward it as-is.
        return HttpResponse(listing, content_type='text/html')
    return HttpResponse(json.dumps(listing), content_type='application/json')
def search_against_url(request, url): """ Accepts a URL as either a suffix of the URI or a POST request parameter. Downloads the content, feeds it through the readability article grabber, then submits the article text to superfastmatch for comparison. """ (scheme, _1, _2, _3, _4, _5) = urlparse(url) if scheme not in ('http', 'https'): return search_page( request, error='The URL must begin with either http or https.') sfm = from_django_conf('sidebyside') try: (title, text) = fetch_and_clean(url) except requests.exceptions.Timeout: return search_page( request, error="Sorry, that news article couldn't be retrieved.") try: sfm_results = sfm.search(text=text, title=title, url=url) drop_silly_results(sfm_results) sort_by_coverage(sfm_results) #if they submit a url, don't return the exact same url in the results for r in sfm_results['documents']['rows']: if r.get('url') == url: sfm_results['documents']['rows'].remove(r) if sfm_results.has_key('text'): text = sfm_results['text'] else: text = '' if sfm_results.has_key('title'): title = sfm_results['title'] else: title = 'No Title' return search_result_page(request, sfm_results, text, source_title=title, source_url=url) except superfastmatch.SuperFastMatchError, e: if e.status == httplib.NOT_FOUND: raise HttpResponse('No such article {0}'.format(url)) elif settings.DEBUG == True: return HttpResponse(e.response[1], status=e.response[0]) else: raise
def search_against_uuid(request, uuid): sfm = from_django_conf('sidebyside') try: sfm_results = sfm.search(text=None, uuid=uuid) drop_silly_results(sfm_results) sort_by_coverage(sfm_results) return search_result_page(request, sfm_results, source_text=sfm_results.get('text'), source_title=sfm_results.get('title')) except superfastmatch.SuperFastMatchError, e: if e.status == httplib.NOT_FOUND: logging.critical(u'Error communicating with the superfastmatch server: {}'.format(unicode(e))) raise Http404('No such article {0}'.format(uuid)) elif settings.DEBUG == True: return HttpResponse(e.response[1], status=e.response[0]) else: raise
def search_against_url(request, url): """ Accepts a URL as either a suffix of the URI or a POST request parameter. Downloads the content, feeds it through the readability article grabber, then submits the article text to superfastmatch for comparison. """ (scheme, _1, _2, _3, _4, _5) = urlparse(url) if scheme not in ('http', 'https'): return search_page(request, error='The URL must begin with either http or https.') sfm = from_django_conf('sidebyside') try: (title, text) = fetch_and_clean(url) except requests.exceptions.Timeout: return search_page(request, error="Sorry, that news article couldn't be retrieved.") try: sfm_results = sfm.search(text=text, title=title, url=url) drop_silly_results(sfm_results) sort_by_coverage(sfm_results) #if they submit a url, don't return the exact same url in the results for r in sfm_results['documents']['rows']: if r.get('url') == url: sfm_results['documents']['rows'].remove(r) if sfm_results.has_key('text'): text = sfm_results['text'] else: text = '' if sfm_results.has_key('title'): title = sfm_results['title'] else: title='No Title' return search_result_page(request, sfm_results, text, source_title=title, source_url=url) except superfastmatch.SuperFastMatchError, e: if e.status == httplib.NOT_FOUND: raise HttpResponse('No such article {0}'.format(url)) elif settings.DEBUG == True: return HttpResponse(e.response[1], status=e.response[0]) else: raise
def handle(self, *args, **options):
    """Re-scrape the body of each release URL passed on the command line.

    Non-HTTP(S) arguments are skipped with a warning; any per-URL failure
    is logged and processing continues with the next URL.
    """
    if not hasattr(settings, "SUPERFASTMATCH"):
        raise CommandError("You must configure SUPERFASTMATCH in your project settings.")
    self.sfm = from_django_conf()
    for url in args:
        try:
            if url.startswith("http://") or url.startswith("https://"):
                release = Release.objects.get(url=url)
                body = get_link_content(release.url)
                release.title = kill_control_characters(release.title)
                release.body = body
                release.updated = now()
                release.save()
                logging.info("Updated release {0}: {1}".format(release.id, release.url))
            else:
                # BUG FIX: this branch logged release.url, but `release` is
                # unbound here (or stale from a previous iteration); log the
                # offending argument itself.
                logging.warning("Skipping non-HTTP link {0}".format(url))
        except Exception as e:
            logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))
def search_against_uuid(request, uuid): sfm = from_django_conf('sidebyside') try: sfm_results = sfm.search(text=None, uuid=uuid) drop_silly_results(sfm_results) sort_by_coverage(sfm_results) return search_result_page(request, sfm_results, source_text=sfm_results.get('text'), source_title=sfm_results.get('title')) except superfastmatch.SuperFastMatchError, e: if e.status == httplib.NOT_FOUND: logging.critical( u'Error communicating with the superfastmatch server: {}'. format(unicode(e))) raise Http404('No such article {0}'.format(uuid)) elif settings.DEBUG == True: return HttpResponse(e.response[1], status=e.response[0]) else: raise
def handle(self, *args, **options): for (key, cfg) in settings.SUPERFASTMATCH.iteritems(): sfm = from_django_conf(key) try: if isinstance(sfm, superfastmatch.federated.FederatedClient): print "{0} (federated)".format(key) for (doctypes, client) in sfm.clients().iteritems(): print " doctypes: {0}".format(doctypes.replace(':', ', ')) print " url: {0}".format(client.url) documents = client.documents(doctype=doctypes, order_by='docid', limit=1) print " documents: {0}".format(documents['total']) else: print "{0}".format(key) print ' url: {0}'.format(sfm.url) documents = sfm.documents() if documents['success'] == True: print ' documents: {0}'.format(documents['total']) else: print ' Unable to query for documents.' except (superfastmatch.SuperFastMatchError, socket.error) as e: print ' Unable to query for documents: {0}'.format(str(e))
def handle(self, sample_size, *args, **options):
    """Spot-check a random sample of releases and report the error rate.

    Logs the summary at ERROR level when any check failed, INFO otherwise.
    """
    logging.basicConfig(level=getattr(logging, options['loglevel'].upper()))
    self.errors = set()
    try:
        sample_size = int(sample_size)
    except ValueError:
        raise CommandError("sample_size must be an integer.")
    self.sfm = from_django_conf()
    sample = random_sample(sample_size)
    for release in sample:
        self.check_release(release)
    log_fn = logging.error if len(self.errors) > 0 else logging.info
    # BUG FIX: under Python 2, int/int truncates, so the rate was always 0.0
    # unless every check failed; force float division and guard the empty sample.
    if sample:
        error_rate = round(float(len(self.errors)) / len(sample), 2)
    else:
        error_rate = 0.0
    log_fn(repr({
        'Sample size': len(sample),
        'Errors': len(self.errors),
        'Error rate': error_rate
    }))
def permalink(request, uuid, doctype, docid):
    # Render the side-by-side view for one (doctype, docid) match of a saved
    # search identified by uuid; 404 when the search or article is missing.
    sfm = from_django_conf('sidebyside')
    try:
        sfm_results = sfm.search(text=None, uuid=uuid)
        drop_silly_results(sfm_results)
        if len(sfm_results['documents']['rows']) == 0:
            raise Http404('No such article {0}'.format(uuid))
        sort_by_coverage(sfm_results)
        try:
            # Pick out the row this permalink points at; if it is no longer
            # among the matches, fall back to the plain uuid search page.
            matching_row = [
                r for r in sfm_results['documents']['rows']
                if r['doctype'] == int(doctype) and r['docid'] == int(docid)
            ][0]
        except IndexError:
            return redirect('sidebyside-uuid-search', uuid=uuid)
        if not matching_row.get('text'):
            # Backfill the matched document's text: local cache first,
            # then the SFM server itself.
            try:
                md = MatchedDocument.objects.get(doc_type=doctype, doc_id=docid)
                matching_row['text'] = md.text
            except MatchedDocument.DoesNotExist:
                doc = sfm.document(doctype, docid)
                if doc:
                    matching_row['text'] = doc['text']
        # NOTE(review): 'text' is indexed directly while 'title'/'url' use
        # .get() — confirm 'text' is always present in uuid search results.
        return search_result_page(request, sfm_results,
                                  source_text=sfm_results['text'],
                                  source_title=sfm_results.get('title'),
                                  source_url=sfm_results.get('url'))
    except superfastmatch.SuperFastMatchError, e:
        if e.status == httplib.NOT_FOUND:
            raise Http404('No such article {0}'.format(uuid))
        else:
            raise
def record_matches(doc, response, update_matches=False):
    """Persist MatchedDocument/Match rows for each result row that has a URL.

    Rows without a URL are skipped (no way to link back to the original), as
    is the search document itself. Each recorded row gains a 'match_id' key.
    Existing records are only refreshed when update_matches is True.
    """
    sfm = from_django_conf()
    for r in response['documents']['rows']:
        if 'url' not in r:
            # We don't want to record a match for a document we don't have a URL for
            # because we cannot provide a link back to the original.
            continue
        if r['url'] == doc.url:
            continue
        (md, created) = MatchedDocument.objects.get_or_create(
            doc_id=r['docid'], doc_type=r['doctype'])
        if created or update_matches:
            sfm_doc = sfm.document(r['doctype'], r['docid'])
            # Idiom fix: `not x` instead of comparing `== False`.
            if not sfm_doc['success']:
                # If we can't fetch the text, the site probably won't be able
                # to either, so just ignore this result row.
                continue
            md.text = sfm_doc['text']
            md.source_url = r['url']
            md.source_name = r.get('source')
            md.source_headline = r['title']
            md.save()
        (match, created) = Match.objects.get_or_create(search_document=doc,
                                                       matched_document=md)
        if created or update_matches:
            stats = calculate_coverage(doc.text, r)
            match.percent_churned = str(stats[1])
            match.overlapping_characters = stats[0]
            density = r.get('density')
            match.fragment_density = Decimal(str(density)) if density else None
            match.response = json.dumps(response)
            match.number_matches += 1
            match.save()
        r['match_id'] = match.id
def record_matches(doc, response, update_matches=False):
    """Create or refresh Match/MatchedDocument rows for the result set.

    Skips rows without a URL and the search document itself; tags each
    recorded row with its 'match_id'.
    """
    sfm = from_django_conf()
    for row in response['documents']['rows']:
        if 'url' not in row:
            # Without a URL we cannot link back to the original, so skip it.
            continue
        if row['url'] == doc.url:
            # Never record the search document as a match of itself.
            continue
        (matched, was_new) = MatchedDocument.objects.get_or_create(
            doc_id=row['docid'], doc_type=row['doctype'])
        if was_new or update_matches:
            fetched = sfm.document(row['doctype'], row['docid'])
            if fetched['success'] == False:
                # If we can't fetch the text, the site probably won't be able
                # to either, so just ignore this result row.
                continue
            matched.text = fetched['text']
            matched.source_url = row['url']
            matched.source_name = row.get('source')
            matched.source_headline = row['title']
            matched.save()
        (match, was_new) = Match.objects.get_or_create(search_document=doc,
                                                       matched_document=matched)
        if was_new or update_matches:
            coverage = calculate_coverage(doc.text, row)
            match.percent_churned = str(coverage[1])
            match.overlapping_characters = coverage[0]
            density = row.get('density')
            match.fragment_density = Decimal(str(density)) if density else None
            match.response = json.dumps(response)
            match.number_matches += 1
            match.save()
        row['match_id'] = match.id
def handle(self, *args, **options):
    """Scrape type-2 sources, optionally filtered by one id or feed URL argument."""
    if not hasattr(settings, 'SUPERFASTMATCH'):
        raise CommandError('You must configure SUPERFASTMATCH in your project settings.')
    if not hasattr(settings, 'DEFAULT_DOCTYPE'):
        raise CommandError('You must specify a DEFAULT_DOCTYPE in your project settings.')
    self.sfm = from_django_conf()
    sources = Source.objects.filter(source_type=2)
    if len(args) == 1:
        selector = args[0]
        if selector.startswith('http://') or selector.startswith('https://'):
            sources = sources.filter(url=selector)
        else:
            # Anything that isn't a URL must parse as a numeric source id.
            try:
                sources = sources.filter(id=int(selector))
            except ValueError:
                raise CommandError("Arguments must be source IDs or feed URLs")
    for source in sources:
        try:
            if source.is_stale() or options['including_stale']:
                self.scrape_releases(source)
                source.last_retrieved = now()
                source.last_failure = None
                source.save()
        except SourceScrapeFailure as failure:
            # The scraper raises the model instance for expected failures.
            failure.save()
        except Exception as e:
            # Record a truncated traceback alongside the failure description.
            buf = StringIO()
            print_exc(1000, buf)
            failure = SourceScrapeFailure.objects.create(source=source,
                                                         traceback=buf.getvalue(),
                                                         description=unicode(e))
def permalink(request, uuid, doctype, docid):
    # Permalink view: re-run the saved uuid search and render the match for
    # the requested (doctype, docid) pair; 404 when the search has vanished.
    sfm = from_django_conf('sidebyside')
    try:
        sfm_results = sfm.search(text=None, uuid=uuid)
        drop_silly_results(sfm_results)
        if len(sfm_results['documents']['rows']) == 0:
            raise Http404('No such article {0}'.format(uuid))
        sort_by_coverage(sfm_results)
        try:
            # Locate the row this permalink targets; redirect to the generic
            # uuid search page when it is no longer among the matches.
            matching_row = [r for r in sfm_results['documents']['rows']
                            if r['doctype'] == int(doctype) and r['docid'] == int(docid)][0]
        except IndexError:
            return redirect('sidebyside-uuid-search', uuid=uuid)
        if not matching_row.get('text'):
            # Backfill missing text: try the local MatchedDocument cache
            # first, then fetch from the SFM server.
            try:
                md = MatchedDocument.objects.get(doc_type=doctype, doc_id=docid)
                matching_row['text'] = md.text
            except MatchedDocument.DoesNotExist:
                doc = sfm.document(doctype, docid)
                if doc:
                    matching_row['text'] = doc['text']
        # NOTE(review): 'text' is a direct index while 'title'/'url' use
        # .get() — confirm uuid searches always carry a 'text' key.
        return search_result_page(request, sfm_results,
                                  source_text=sfm_results['text'],
                                  source_title=sfm_results.get('title'),
                                  source_url=sfm_results.get('url'))
    except superfastmatch.SuperFastMatchError, e:
        if e.status == httplib.NOT_FOUND:
            raise Http404('No such article {0}'.format(uuid))
        else:
            raise
def search_against_text(request, text):
    """Search the 'sidebyside' server for raw *text* and render the results."""
    client = from_django_conf('sidebyside')
    results = client.search(text)
    drop_silly_results(results)
    sort_by_coverage(results)
    return search_result_page(request, results, text)