def handle(self, inpath=None, *args, **options):
    """Delete the superfastmatch documents listed in a CSV file, and their matches.

    The CSV at `inpath` must have `doctype` and `docid` columns holding
    integers. For every row the document is deleted from superfastmatch;
    once superfastmatch no longer reports the document as existing, the
    matches recorded for it are deleted as well.

    When `options["dryrun"]` is anything other than False, nothing is
    deleted and only the log messages are emitted.

    Raises:
        CommandError: if `inpath` is missing or does not exist.
    """
    super(Command, self).handle(*args, **options)

    if inpath is None:
        raise CommandError(u"You must specify an input file.")
    if not os.path.exists(inpath):
        raise CommandError(u"File does not exist: {}".format(inpath))

    sfm = superfastmatch.from_django_conf()

    # Deliberately `== False` rather than `not ...`: a missing/None flag must
    # keep behaving as a dry run for this destructive command.
    live = options["dryrun"] == False

    def purge_matches(docpair):
        """Wait for `docpair` to vanish from superfastmatch, then delete its matches."""
        logging.warning(u"Deleting matches for {}".format(docpair))
        if live:
            while document_exists(sfm, *docpair) == True:
                logging.info(u"Polling on document {}".format(docpair))
                time.sleep(1)
            delete_matches_for_document(*docpair)

    prev_docpair = None
    # unicodecsv expects a binary-mode file object.
    with open(inpath, "rb") as infile:
        rdr = unicodecsv.DictReader(infile)
        for docrow in rdr:
            docpair = (int(docrow["doctype"]), int(docrow["docid"]))
            logging.warning(u"Deleting superfastmatch document {}".format(docpair))
            if live:
                sfm.delete(*docpair)

            # Clean up after the previous row while superfastmatch works on
            # the deletion that was just requested for the current one.
            if prev_docpair is not None:
                purge_matches(prev_docpair)
            prev_docpair = docpair

    # The loop only ever cleans up the *previous* row, so the final row still
    # needs its matches removed here (skipped when the file had no rows).
    if prev_docpair is not None:
        purge_matches(prev_docpair)
def handle(self, inpath=None, *args, **options):
    """Remove every superfastmatch document named in a CSV, then its matches.

    `inpath` is a CSV file with integer `doctype` and `docid` columns.
    Each listed document is deleted from superfastmatch. After
    superfastmatch stops reporting a document as existing, the matches
    stored for that document are deleted too.

    Unless `options['dryrun']` equals False the command only logs what it
    would do.

    Raises:
        CommandError: if no `inpath` was given or the file does not exist.
    """
    super(Command, self).handle(*args, **options)

    if inpath is None:
        raise CommandError(u"You must specify an input file.")
    if not os.path.exists(inpath):
        raise CommandError(u"File does not exist: {}".format(inpath))

    sfm = superfastmatch.from_django_conf()

    # Kept as an explicit comparison with False so that an unset (None)
    # option is still treated as a dry run.
    really_delete = options['dryrun'] == False

    def delete_matches_once_gone(pair):
        """Poll until `pair` is gone from superfastmatch, then drop its matches."""
        logging.warning(u"Deleting matches for {}".format(pair))
        if not really_delete:
            return
        while document_exists(sfm, *pair) == True:
            logging.info(u"Polling on document {}".format(pair))
            time.sleep(1)
        delete_matches_for_document(*pair)

    pending = None  # document deleted in the previous iteration, if any
    # Binary mode because unicodecsv does its own decoding.
    with open(inpath, 'rb') as infile:
        for docrow in unicodecsv.DictReader(infile):
            docpair = (int(docrow['doctype']), int(docrow['docid']))
            logging.warning(
                u"Deleting superfastmatch document {}".format(docpair))
            if really_delete:
                sfm.delete(*docpair)

            # Handle the previous document now, giving superfastmatch time
            # to process the deletion just issued for the current one.
            if pending is not None:
                delete_matches_once_gone(pending)
            pending = docpair

    # Without this final step the last row's matches would never be deleted,
    # because the loop body only processes the preceding row.
    if pending is not None:
        delete_matches_once_gone(pending)
def handle(self, doctype, outpath=None, *args, **options):
    """Write a CSV of superfastmatch documents that are candidates for pruning.

    Iterates over the superfastmatch documents of `doctype` and writes a
    `doctype,docid` row to `outpath` for each document that has no Match
    rows, or -- when `options['apply_sidebyside_thresholds']` is True --
    no Match row meeting both the `minimum_coverage_chars` and
    `minimum_coverage_pct` values from `settings.SIDEBYSIDE`. This method
    itself deletes nothing; it only logs and writes the list. Finally the
    size and path of the written file are printed.

    Raises:
        CommandError: if `doctype` is missing or not an integer, if
            `outpath` is missing, or if `outpath` already exists.
    """
    super(Command, self).handle(*args, **options)

    if doctype is None:
        raise CommandError(u"You must specify a doctype to prune.")
    try:
        doctype = int(doctype)
    except ValueError:
        raise CommandError(u"The doctype must be an integer.")

    if outpath is None:
        raise CommandError(u"You must specify an output file.")
    if os.path.exists(outpath):
        raise CommandError(u"File already exists: {}".format(outpath))

    use_thresholds = options['apply_sidebyside_thresholds'] == True
    if use_thresholds:
        minimum_pct = settings.SIDEBYSIDE['minimum_coverage_pct']
        minimum_chars = settings.SIDEBYSIDE['minimum_coverage_chars']

    match_delete_msg = u"Scheduling deletion for match between {uuid} and ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
    doc_delete_msg = u"Scheduling deletion for document ({doctype},{docid})"
    doc_keep_msg = u"Skipping deletion of document ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"

    sfm = superfastmatch.from_django_conf()
    docs = superfastmatch.DocumentIterator(sfm, 'docid', doctype=doctype, fetch_text=False)

    with open(outpath, 'wb') as outfile:
        wrtr = unicodecsv.DictWriter(outfile, ['doctype', 'docid'])
        wrtr.writeheader()

        for doc in docs:
            all_matches = list(Match.objects.filter(matched_document__doc_type=doctype,
                                                    matched_document__doc_id=doc['docid']))
            strong_matches = []
            if use_thresholds:
                strong_matches = [m for m in all_matches
                                  if m.overlapping_characters >= minimum_chars
                                  and m.percent_churned >= minimum_pct]

            # The matches that justify keeping the document: the ones that
            # pass the thresholds when thresholds apply, otherwise all of them.
            keepers = strong_matches if use_thresholds else all_matches

            if keepers:
                for m in keepers:
                    logging.info(doc_keep_msg.format(doctype=doc['doctype'],
                                                     docid=doc['docid'],
                                                     uuid=m.search_document.uuid,
                                                     chars=m.overlapping_characters,
                                                     pct=m.percent_churned))
                continue

            if use_thresholds:
                # Every match here failed the thresholds.
                for m in all_matches:
                    logging.info(match_delete_msg.format(uuid=m.search_document.uuid,
                                                         doctype=m.matched_document.doc_type,
                                                         docid=m.matched_document.doc_id,
                                                         chars=m.overlapping_characters,
                                                         pct=m.percent_churned))
            logging.info(doc_delete_msg.format(**doc))
            wrtr.writerow({'doctype': doc['doctype'], 'docid': doc['docid']})

    # Stat only after the file has been closed so the size is final.
    file_status = os.stat(outpath)
    print(u"{sz!s: >12} {path}".format(sz=file_status.st_size, path=outpath))
def handle(self, *args, **options):
    """List MatchedDocuments that lack text, optionally refilling it from SFM.

    Writes CSV to stdout with one row per document: `doctype,docid`, plus
    a `result` column when `options['autofix']` is True. With autofix the
    text is fetched from superfastmatch and, if non-blank, saved onto the
    MatchedDocument.

    `options['document']` may be a "doctype,docid" string to restrict the
    run to one document; otherwise every MatchedDocument whose text is
    NULL or empty is processed.

    Raises:
        CommandError: if `--document` is malformed, if the named document
            does not exist or already has text, or if no documents are
            missing text.
    """
    doctype = None
    docid = None
    if options['document'] is not None:
        try:
            (doctype, docid) = options['document'].split(',')
            doctype = int(doctype)
            docid = int(docid)
        except ValueError:
            raise CommandError(u"You must specify a doctype and docid separated by a comma for the --document option.")

    wrtr = unicodecsv.writer(sys.stdout, encoding='utf-8')

    autofix = options['autofix'] == True
    if autofix:
        sfm = superfastmatch.from_django_conf('sidebyside')

    if doctype is None and docid is None:
        query = MatchedDocument.objects.filter(Q(text__isnull=True) | Q(text=''))
        if query.count() == 0:
            raise CommandError(u"No MatchedDocuments are missing text.")
        # NOTE(review): with autofix, saved rows stop matching `query`;
        # confirm batched_results copes with the result set shrinking
        # while it is being iterated.
        docs = batched_results(query, batch_size=1000)
    else:
        docs = list(MatchedDocument.objects.filter(doc_type=doctype, doc_id=docid))
        if len(docs) == 0:
            raise CommandError(u"No such MatchedDocument: ({},{})".format(doctype, docid))
        # `text` can be NULL (the bulk query above filters on exactly that),
        # so coalesce before calling strip().
        elif (docs[0].text or u'').strip():
            raise CommandError(u"MatchedDocument ({},{}) already has non-empty text.".format(doctype, docid))

    wrtr.writerow(['doctype', 'docid', 'result'] if autofix else ['doctype', 'docid'])

    for matched_doc in docs:
        if not autofix:
            wrtr.writerow([matched_doc.doc_type, matched_doc.doc_id])
            continue

        sfm_doc = sfm.document(matched_doc.doc_type, matched_doc.doc_id)
        success = sfm_doc[u'success']
        if success == False:
            fix_result = "No such document in SFM."
        elif success == True:
            if len(sfm_doc[u'text'].strip()) > 0:
                matched_doc.text = sfm_doc[u'text']
                matched_doc.save()
                fix_result = "Added text of length {}".format(len(sfm_doc[u'text']))
            else:
                fix_result = "Document has no text in SFM."
        else:
            # Always bind a result, so a row can neither crash on an unbound
            # name nor silently repeat the previous document's outcome.
            fix_result = "Unexpected 'success' value in SFM response: {!r}".format(success)
        wrtr.writerow([matched_doc.doc_type, matched_doc.doc_id, fix_result])
def handle(self, doctype, outpath=None, *args, **options):
    """Produce a CSV listing the superfastmatch documents eligible for pruning.

    Each superfastmatch document of `doctype` is checked against the Match
    table. A `doctype,docid` row is written to `outpath` when the document
    has no matches at all, or -- if
    `options['apply_sidebyside_thresholds']` is True -- when none of its
    matches reaches both `settings.SIDEBYSIDE['minimum_coverage_chars']`
    and `settings.SIDEBYSIDE['minimum_coverage_pct']`. Nothing is deleted
    by this method; decisions are logged and the list is written. The
    resulting file's size and path are printed at the end.

    Raises:
        CommandError: for a missing or non-integer `doctype`, a missing
            `outpath`, or an `outpath` that already exists.
    """
    super(Command, self).handle(*args, **options)

    if doctype is None:
        raise CommandError(u"You must specify a doctype to prune.")
    try:
        doctype = int(doctype)
    except ValueError:
        raise CommandError(u"The doctype must be an integer.")
    if outpath is None:
        raise CommandError(u"You must specify an output file.")
    if os.path.exists(outpath):
        raise CommandError(u"File already exists: {}".format(outpath))

    thresholds_on = options['apply_sidebyside_thresholds'] == True
    if thresholds_on:
        minimum_pct = settings.SIDEBYSIDE['minimum_coverage_pct']
        minimum_chars = settings.SIDEBYSIDE['minimum_coverage_chars']

    def clears_thresholds(match):
        """True when `match` meets both side-by-side minimums (only called when thresholds are on)."""
        return (match.overlapping_characters >= minimum_chars
                and match.percent_churned >= minimum_pct)

    sfm = superfastmatch.from_django_conf()
    docs = superfastmatch.DocumentIterator(
        sfm, 'docid', doctype=doctype, fetch_text=False)

    with open(outpath, 'wb') as outfile:
        wrtr = unicodecsv.DictWriter(outfile, ['doctype', 'docid'])
        wrtr.writeheader()

        for doc in docs:
            found = list(
                Match.objects.filter(
                    matched_document__doc_type=doctype,
                    matched_document__doc_id=doc['docid']))
            if thresholds_on:
                qualifying = [m for m in found if clears_thresholds(m)]
            else:
                qualifying = []

            prune = (not found) or (thresholds_on and not qualifying)

            if not prune:
                # Kept: report the matches that saved the document.
                for m in qualifying or found:
                    logging.info(
                        u"Skipping deletion of document ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
                        .format(doctype=doc['doctype'],
                                docid=doc['docid'],
                                uuid=m.search_document.uuid,
                                chars=m.overlapping_characters,
                                pct=m.percent_churned))
                continue

            if thresholds_on:
                # All of these matches fell short of the thresholds.
                for m in found:
                    logging.info(
                        u"Scheduling deletion for match between {uuid} and ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
                        .format(uuid=m.search_document.uuid,
                                doctype=m.matched_document.doc_type,
                                docid=m.matched_document.doc_id,
                                chars=m.overlapping_characters,
                                pct=m.percent_churned))
            logging.info(
                u"Scheduling deletion for document ({doctype},{docid})"
                .format(**doc))
            wrtr.writerow(dict(doctype=doc['doctype'], docid=doc['docid']))

    # The file is closed at this point, so st_size reflects everything written.
    file_status = os.stat(outpath)
    print(u"{sz!s: >12} {path}".format(sz=file_status.st_size, path=outpath))