예제 #1
0
    def handle(self, inpath=None, *args, **options):
        """Delete the superfastmatch documents listed in a CSV file, then their matches.

        ``inpath`` names a CSV file with ``doctype`` and ``docid`` columns, both
        parsed as integers. Each listed document is deleted from superfastmatch.
        Once ``document_exists`` stops reporting a document, its matches are
        removed with ``delete_matches_for_document``. The wait for a document
        is deferred until the delete for the following row has been issued.

        Unless ``options["dryrun"]`` equals ``False`` only the log messages are
        emitted and nothing is deleted.

        Raises:
            CommandError: if ``inpath`` is not given or does not exist.
        """
        super(Command, self).handle(*args, **options)

        if inpath is None:
            raise CommandError(u"You must specify an input file.")

        if not os.path.exists(inpath):
            raise CommandError(u"File does not exist: {}".format(inpath))

        sfm = superfastmatch.from_django_conf()

        # Kept as an explicit comparison with False: any other value
        # (including an unset/None option) keeps behaving as a dry run.
        live_run = options["dryrun"] == False

        def delete_matches_when_gone(docpair):
            # Wait until superfastmatch no longer reports the document, then
            # drop its matches.
            logging.warning(u"Deleting matches for {}".format(docpair))
            if not live_run:
                return
            while document_exists(sfm, *docpair) == True:
                logging.info(u"Polling on document {}".format(docpair))
                time.sleep(1)
            delete_matches_for_document(*docpair)

        prev_docpair = None
        with open(inpath, "rb") as infile:
            rdr = unicodecsv.DictReader(infile)

            for docrow in rdr:
                docpair = (int(docrow["doctype"]), int(docrow["docid"]))
                logging.warning(u"Deleting superfastmatch document {}".format(docpair))
                if live_run:
                    sfm.delete(*docpair)

                # Poll on and delete the previous document
                if prev_docpair is not None:
                    delete_matches_when_gone(prev_docpair)

                prev_docpair = docpair

        # The loop only ever cleans up the *previous* row, so the final row
        # (or the only row of a one-line file) still needs its matches removed.
        if prev_docpair is not None:
            delete_matches_when_gone(prev_docpair)
예제 #2
0
    def handle(self, inpath=None, *args, **options):
        """Delete the superfastmatch documents listed in a CSV file, then their matches.

        ``inpath`` names a CSV file with ``doctype`` and ``docid`` columns, both
        parsed as integers. Each listed document is deleted from superfastmatch.
        Once ``document_exists`` stops reporting a document, its matches are
        removed with ``delete_matches_for_document``. The wait for a document
        is deferred until the delete for the following row has been issued.

        Unless ``options['dryrun']`` equals ``False`` only the log messages are
        emitted and nothing is deleted.

        Raises:
            CommandError: if ``inpath`` is not given or does not exist.
        """
        super(Command, self).handle(*args, **options)

        if inpath is None:
            raise CommandError(u"You must specify an input file.")

        if not os.path.exists(inpath):
            raise CommandError(u"File does not exist: {}".format(inpath))

        sfm = superfastmatch.from_django_conf()

        # Kept as an explicit comparison with False: any other value
        # (including an unset/None option) keeps behaving as a dry run.
        live_run = options['dryrun'] == False

        def delete_matches_when_gone(docpair):
            # Wait until superfastmatch no longer reports the document, then
            # drop its matches.
            logging.warning(
                u"Deleting matches for {}".format(docpair))
            if not live_run:
                return
            while document_exists(sfm, *docpair) == True:
                logging.info(
                    u"Polling on document {}".format(docpair))
                time.sleep(1)
            delete_matches_for_document(*docpair)

        prev_docpair = None
        with open(inpath, 'rb') as infile:
            rdr = unicodecsv.DictReader(infile)

            for docrow in rdr:
                docpair = (int(docrow['doctype']), int(docrow['docid']))
                logging.warning(
                    u"Deleting superfastmatch document {}".format(docpair))
                if live_run:
                    sfm.delete(*docpair)

                # Poll on and delete the previous document
                if prev_docpair is not None:
                    delete_matches_when_gone(prev_docpair)

                prev_docpair = docpair

        # The loop only ever cleans up the *previous* row, so the final row
        # (or the only row of a one-line file) still needs its matches removed.
        if prev_docpair is not None:
            delete_matches_when_gone(prev_docpair)
    def handle(self, doctype, outpath=None, *args, **options):
        """Write a CSV of documents of one doctype that are scheduled for deletion.

        Every document yielded by the superfastmatch ``DocumentIterator`` for
        ``doctype`` is checked against the ``Match`` table. A ``doctype,docid``
        row is written to ``outpath`` when the document has no matches or, with
        the ``apply_sidebyside_thresholds`` option on, when none of its matches
        reach both ``settings.SIDEBYSIDE`` minimums. Documents that are kept
        are only logged. The size and path of the written file are printed at
        the end.

        Raises:
            CommandError: if ``doctype`` is missing or not an integer, if
                ``outpath`` is missing, or if ``outpath`` already exists.
        """
        super(Command, self).handle(*args, **options)

        if doctype is None:
            raise CommandError(u"You must specify a doctype to prune.")
        try:
            doctype = int(doctype)
        except ValueError:
            raise CommandError(u"The doctype must be an integer.")

        if outpath is None:
            raise CommandError(u"You must specify an output file.")

        if os.path.exists(outpath):
            raise CommandError(u"File already exists: {}".format(outpath))

        # Explicit comparison with True retained so that only True-equal
        # option values switch the thresholds on.
        use_thresholds = options['apply_sidebyside_thresholds'] == True
        if use_thresholds:
            sidebyside_conf = settings.SIDEBYSIDE
            minimum_pct = sidebyside_conf['minimum_coverage_pct']
            minimum_chars = sidebyside_conf['minimum_coverage_chars']

        sfm = superfastmatch.from_django_conf()
        docs = superfastmatch.DocumentIterator(
            sfm, 'docid', doctype=doctype, fetch_text=False)

        doomed_match_msg = u"Scheduling deletion for match between {uuid} and ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
        doomed_doc_msg = u"Scheduling deletion for document ({doctype},{docid})"
        kept_doc_msg = u"Skipping deletion of document ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"

        with open(outpath, 'wb') as outfile:
            wrtr = unicodecsv.DictWriter(outfile, ['doctype', 'docid'])
            wrtr.writeheader()

            for doc in docs:
                all_matches = list(
                    Match.objects.filter(
                        matched_document__doc_type=doctype,
                        matched_document__doc_id=doc['docid']))

                strong_matches = []
                if use_thresholds:
                    for candidate in all_matches:
                        if (candidate.overlapping_characters >= minimum_chars
                                and candidate.percent_churned >= minimum_pct):
                            strong_matches.append(candidate)

                # A document is kept when it has matches and, if thresholds
                # apply, at least one of them clears the thresholds.
                keep = bool(all_matches) and (not use_thresholds
                                              or bool(strong_matches))
                if keep:
                    for kept in strong_matches or all_matches:
                        logging.info(kept_doc_msg.format(
                            doctype=doc['doctype'],
                            docid=doc['docid'],
                            uuid=kept.search_document.uuid,
                            chars=kept.overlapping_characters,
                            pct=kept.percent_churned))
                    continue

                if use_thresholds:
                    for weak in all_matches:
                        logging.info(doomed_match_msg.format(
                            uuid=weak.search_document.uuid,
                            doctype=weak.matched_document.doc_type,
                            docid=weak.matched_document.doc_id,
                            chars=weak.overlapping_characters,
                            pct=weak.percent_churned))
                logging.info(doomed_doc_msg.format(**doc))
                row = {'doctype': doc['doctype'], 'docid': doc['docid']}
                wrtr.writerow(row)

        size_in_bytes = os.stat(outpath).st_size
        print(u"{sz!s: >12} {path}".format(sz=size_in_bytes, path=outpath))
예제 #4
0
    def handle(self, *args, **options):
        """Report MatchedDocuments with missing text, optionally refilling it from SFM.

        Writes CSV to stdout. Without ``options['document']`` every
        MatchedDocument whose text is NULL or empty is listed; with it (a
        ``doctype,docid`` string) only that document is considered. When
        ``options['autofix']`` equals ``True`` each document is looked up on
        the superfastmatch server configured as ``'sidebyside'``, non-blank
        text is saved onto the MatchedDocument, and a ``result`` column
        records the outcome.

        Raises:
            CommandError: if ``--document`` is malformed, if no document is
                missing text, if the requested document does not exist, or if
                it already has non-empty text.
        """
        doctype = None
        docid = None
        if options['document'] is not None:
            try:
                (doctype, docid) = options['document'].split(',')
                doctype = int(doctype)
                docid = int(docid)
            except ValueError:
                raise CommandError(u"You must specify a doctype and docid separated by a comma for the --document option.")

        wrtr = unicodecsv.writer(sys.stdout, encoding='utf-8')

        # Explicit comparison with True retained so that only True-equal
        # option values enable fixing.
        autofix = options['autofix'] == True
        if autofix:
            sfm = superfastmatch.from_django_conf('sidebyside')

        if doctype is None and docid is None:
            query = MatchedDocument.objects.filter(Q(text__isnull=True) | Q(text=''))
            cnt = query.count()
            if cnt == 0:
                raise CommandError(u"No MatchedDocuments are missing text.")
            # NOTE(review): with autofix, saving text below removes rows from
            # this query while it is being batched; confirm batched_results
            # tolerates that.
            docs = batched_results(query, batch_size=1000)

        else:
            docs = list(MatchedDocument.objects.filter(doc_type=doctype, doc_id=docid))
            if not docs:
                raise CommandError(u"No such MatchedDocument: ({},{})".format(doctype, docid))
            # text may be NULL (the query above filters on text__isnull), so
            # guard before calling .strip() on it.
            elif (docs[0].text or u'').strip():
                raise CommandError(u"MatchedDocument ({},{}) already has non-empty text.".format(doctype, docid))

        wrtr.writerow(['doctype', 'docid', 'result']
                      if autofix
                      else ['doctype', 'docid'])
        for matched_doc in docs:
            if not autofix:
                wrtr.writerow([matched_doc.doc_type, matched_doc.doc_id])
                continue

            sfm_doc = sfm.document(matched_doc.doc_type, matched_doc.doc_id)
            if sfm_doc[u'success'] == False:
                fix_result = "No such document in SFM."
            elif sfm_doc[u'success'] == True:
                if len(sfm_doc[u'text'].strip()) > 0:
                    matched_doc.text = sfm_doc[u'text']
                    matched_doc.save()
                    fix_result = "Added text of length {}".format(len(sfm_doc[u'text']))
                else:
                    fix_result = "Document has no text in SFM."
            else:
                # Without this branch fix_result would be unbound on the first
                # row, or silently reuse the previous row's result.
                fix_result = "Unexpected response from SFM."

            wrtr.writerow([matched_doc.doc_type, matched_doc.doc_id, fix_result])
    def handle(self, doctype, outpath=None, *args, **options):
        """Write a CSV of documents of one doctype that are scheduled for deletion.

        Every document yielded by the superfastmatch ``DocumentIterator`` for
        ``doctype`` is checked against the ``Match`` table. A ``doctype,docid``
        row is written to ``outpath`` when the document has no matches or, with
        the ``apply_sidebyside_thresholds`` option on, when none of its matches
        reach both ``settings.SIDEBYSIDE`` minimums. Documents that are kept
        are only logged. The size and path of the written file are printed at
        the end.

        Raises:
            CommandError: if ``doctype`` is missing or not an integer, if
                ``outpath`` is missing, or if ``outpath`` already exists.
        """
        super(Command, self).handle(*args, **options)

        if doctype is None:
            raise CommandError(u"You must specify a doctype to prune.")
        try:
            doctype = int(doctype)
        except ValueError:
            raise CommandError(u"The doctype must be an integer.")

        if outpath is None:
            raise CommandError(u"You must specify an output file.")

        if os.path.exists(outpath):
            raise CommandError(u"File already exists: {}".format(outpath))

        # Explicit comparison with True retained so that only True-equal
        # option values switch the thresholds on.
        thresholds_on = options['apply_sidebyside_thresholds'] == True
        if thresholds_on:
            minimum_pct = settings.SIDEBYSIDE['minimum_coverage_pct']
            minimum_chars = settings.SIDEBYSIDE['minimum_coverage_chars']

        def clears_thresholds(match):
            # Only called when thresholds_on, i.e. when both minimums are bound.
            return (match.overlapping_characters >= minimum_chars
                    and match.percent_churned >= minimum_pct)

        sfm = superfastmatch.from_django_conf()
        docs = superfastmatch.DocumentIterator(sfm,
                                               'docid',
                                               doctype=doctype,
                                               fetch_text=False)

        with open(outpath, 'wb') as outfile:
            wrtr = unicodecsv.DictWriter(outfile, ['doctype', 'docid'])
            wrtr.writeheader()

            for doc in docs:
                found = list(
                    Match.objects.filter(
                        matched_document__doc_type=doctype,
                        matched_document__doc_id=doc['docid']))
                if thresholds_on:
                    qualifying = [hit for hit in found
                                  if clears_thresholds(hit)]
                else:
                    qualifying = []

                unmatched = not found
                below_thresholds = thresholds_on and not qualifying

                if not (unmatched or below_thresholds):
                    # Kept: log each match that justifies keeping the document.
                    for hit in qualifying or found:
                        logging.info(
                            u"Skipping deletion of document ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
                            .format(doctype=doc['doctype'],
                                    docid=doc['docid'],
                                    uuid=hit.search_document.uuid,
                                    chars=hit.overlapping_characters,
                                    pct=hit.percent_churned))
                    continue

                if thresholds_on:
                    for hit in found:
                        logging.info(
                            u"Scheduling deletion for match between {uuid} and ({doctype},{docid}) because it matches {uuid} with [{chars},{pct}%]"
                            .format(uuid=hit.search_document.uuid,
                                    doctype=hit.matched_document.doc_type,
                                    docid=hit.matched_document.doc_id,
                                    chars=hit.overlapping_characters,
                                    pct=hit.percent_churned))
                logging.info(
                    u"Scheduling deletion for document ({doctype},{docid})"
                    .format(**doc))
                wrtr.writerow({
                    'doctype': doc['doctype'],
                    'docid': doc['docid']
                })

        written = os.stat(outpath)
        summary = u"{sz!s: >12} {path}".format(sz=written.st_size,
                                               path=outpath)
        print(summary)