예제 #1
0
 def throttle(self):
     """Lazily build and return this Site's Throttle.

     The throttle is constructed on first access (with
     ``multiplydelay=True``) and cached on the instance, so every
     subsequent call returns the same object.
     """
     try:
         return self._throttle
     except AttributeError:
         self._throttle = Throttle(self, multiplydelay=True)
         return self._throttle
예제 #2
0
import copy
import json
import os
import pickle
import re
import shutil
import tempfile
import traceback
import uuid
from datetime import date, datetime, timezone

import mysql.connector
import pywikibot
from pywikibot.data.api import APIError
from pywikibot.throttle import Throttle
from redis import Redis

from common import REDIS_KEY

# Module-level Commons site used by the workers below.
site = pywikibot.Site('commons', 'commons')

# Replace the default throttle with a non-multiplying one so the delay
# does not scale with the number of concurrent processes.
site._throttle = Throttle(site, multiplydelay=False)

# Multi-workers are enough to cause problems, no need for internal
# locking to cause even more problems
site.lock_page = lambda *args, **kwargs: None  # noop
site.unlock_page = lambda *args, **kwargs: None  # noop

#search_generator = site.search('"El Paso Daily Times. (El Paso, Tex.), Vol"')
#igen = iter(search_generator)
#counter = 0
# MySQL connection for run bookkeeping (credentials redacted).
mydb = mysql.connector.connect(host="localhost",
                               user="******",
                               password="******",
                               database="commons_task3_run")
mycursor = mydb.cursor()

# NOTE(review): the quoted '%s' suggests %-string formatting into the SQL
# text; if this is ever passed to cursor.execute() with a params tuple the
# quotes must be dropped (parameterized queries quote for you) — verify at
# the call site, which is not visible in this chunk.
search_sql = "SELECT title FROM runs WHERE title = '%s'"
def run_worker():
    """Consume file-change messages from Redis forever and scan each file.

    Each message popped from REDIS_KEY is unpickled into a change object;
    the referenced Commons file is downloaded into a per-worker temp dir,
    checked for corruption, and the verdict stored in the database.  Runs
    until the process is killed; the temp dir is removed on the way out.
    """
    tmpdir = None  # Gets rid of IDE complaint/warning about access before assignment
    try:
        tmpdir = tempfile.mkdtemp()

        site = pywikibot.Site(user="******")
        site._throttle = Throttle(site, multiplydelay=False)

        # Multi-workers are enough to cause problems, no need for internal
        # locking to cause even more problems
        site.lock_page = lambda *args, **kwargs: None  # noop
        site.unlock_page = lambda *args, **kwargs: None  # noop

        redis = Redis(host="localhost")

        while True:
            _, picklemsg = redis.blpop(REDIS_KEY)
            # NOTE: pickle.loads is only safe here because the queue is fed
            # by our own producer (T99) — never point this at untrusted data.
            change = pickle.loads(picklemsg) # Need to unpickle and build object once more - T99
            file_page = pywikibot.FilePage(site, change.title)
            global logger
            logger.info(change.title)
            if not allow_bots(file_page.text, "TheSandBot"):
                logger.critical("Not to edit " + file_page.title())
                continue

            if not file_page.exists():
                # Was logger.debug(pywikibot.warning(...)), which logged the
                # None that pywikibot.warning() returns.
                pywikibot.warning('File page does not exist ' + change.title)
                continue

            # Retry the imageinfo fetch up to 8 times.  Keep the last error
            # so the for-else can re-raise it: the original bare ``raise``
            # here had no active exception and would itself raise
            # RuntimeError instead of the real failure.
            last_error = None
            for i in range(8):
                try:
                    file_page.get_file_history()
                except pywikibot.exceptions.PageRelatedError as e:
                    # pywikibot.exceptions.PageRelatedError:
                    # loadimageinfo: Query on ... returned no imageinfo
                    pywikibot.exception(e)
                    last_error = e
                    site.throttle(write=True)
                else:
                    break
            else:
                raise last_error

            revision = change.getRevision(file_page)

            pywikibot.output('Working on: %s at %s' % (change.title, revision.timestamp))

            path = os.path.join(tmpdir, str(uuid.uuid1()))

            # Download image
            try:
                for i in range(8):  # Attempt to download 8 times. If it fails after this many, move on
                    try:
                        # returns download success result (True or False)
                        success = file_page.download(path, revision=revision)
                    except Exception as e:
                        pywikibot.exception(e)
                        success = False
                    if success:
                        break   # if we have a success, no point continuing to try and download
                    pywikibot.warning(
                        'Possibly corrupted download on attempt %d' % i)
                    site.throttle(write=True)
                else:
                    pywikibot.warning('FIXME: Download attempt exhausted')
                    pywikibot.warning('FIXME: Download of ' + str(file_page.title() + ' failed. Aborting...'))
                    continue  # move on to the next file

                del success
                try:
                    corrupt_result = image_is_corrupt(path)
                except UnidentifiedImageError:
                    logger.debug(change.title + " ::: is not an image (or at very least not currently supported by PIL)")
                    os.remove(path)  # file not an image
                    store_image(change.title, False, img_hash=change.hash, not_image=True)  # store in database
                    # Keep the loop alive: skip this file rather than crash.
                    continue
                if corrupt_result:
                    handle_result(site, file_page, change, logger)
                else:  # image not corrupt
                    store_image(file_page.title(), False, img_hash=change.hash)  # store in database
                    logger.info(file_page.title() + " :Not corrupt. Stored")

            except Exception:
                # One bad file must not kill the worker loop.
                traceback.print_exc()
            finally:
                if os.path.exists(path):
                    os.remove(path)

        pywikibot.output("Exit - THIS SHOULD NOT HAPPEN")
    finally:
        # Guard: tmpdir is still None if mkdtemp() itself failed; the
        # original rmtree(None) would mask that exception with a TypeError.
        if tmpdir:
            shutil.rmtree(tmpdir)
예제 #4
0
def process_file2():
    """Scan every Commons file once, tagging the corrupt ones.

    Walks pwb_wrappers.allimages(), skipping files already recorded, and
    for each new file downloads it, runs the corruption check and stores
    the verdict.  Progress (files seen) is persisted to
    ./corrupt_have_seen_count.txt after every file so a restart can
    resume (T111).
    """
    tmpdir = None
    global logger
    try:
        tmpdir = tempfile.mkdtemp()
        site = pywikibot.Site(user="******")
        site._throttle = Throttle(site, multiplydelay=False)

        # Multi-workers are enough to cause problems, no need for internal
        # locking to cause even more problems
        site.lock_page = lambda *args, **kwargs: None  # noop
        site.unlock_page = lambda *args, **kwargs: None  # noop

        global skip
        # T111: resume support — reload how many files earlier runs saw.
        if os.path.exists(
                "./corrupt_have_seen_count.txt"
        ) and not file_is_empty("./corrupt_have_seen_count.txt"):
            with open("./corrupt_have_seen_count.txt", 'r') as f:
                try:
                    count_have_seen = int(f.readline())
                except (TypeError, ValueError):
                    logger.critical(
                        "Cannot cast string to int. Check corrupt_have_seen_count.txt format."
                    )
                    raise
        else:
            count_have_seen = 0
        # ints are immutable; the original copy.deepcopy was unnecessary.
        tmp_count = count_have_seen
        for image_page in pwb_wrappers.allimages():
            if skip and tmp_count > 0:
                tmp_count -= 1
                logger.debug("Skipping check on " + image_page.title())
                continue

            if have_seen_image(site, image_page.title()):
                logger.debug("Have seen:: " + image_page.title())
                count_have_seen += 1
                continue

            if not allow_bots(image_page.text, "TheSandBot"):
                logger.critical("Not to edit " + image_page.title())
                continue

            if not image_page.exists():
                logger.warning('File page does not exist:: ' +
                               image_page.title())
                continue
            # Retry the imageinfo fetch up to 8 times.  Keep the last
            # error so the for-else can re-raise it: the original bare
            # ``raise`` had no active exception and would itself raise
            # RuntimeError instead of the real failure.
            last_error = None
            for i in range(8):
                try:
                    image_page.get_file_history()
                except pywikibot.exceptions.PageRelatedError as e:
                    # pywikibot.exceptions.PageRelatedError:
                    # loadimageinfo: Query on ... returned no imageinfo
                    pywikibot.exception(e)
                    last_error = e
                    site.throttle(write=True)
                else:
                    break
            else:
                raise last_error

            path = os.path.join(tmpdir, str(uuid.uuid1()))
            revision = image_page.latest_file_info
            # Download image
            try:
                for i in range(
                        8
                ):  # Attempt to download 8 times. If it fails after this many, move on
                    try:
                        # returns download success result (True or False)
                        success = image_page.download(path, revision=revision)
                    except Exception as e:
                        logger.exception(e)
                        success = False
                    if success:
                        break  # if we have a success, no point continuing to try and download
                    logger.warning(
                        'Possibly corrupted download on attempt %d' % i)
                    site.throttle(write=True)
                else:
                    logger.warning('FIXME: Download attempt exhausted')
                    logger.warning('FIXME: Download of ' +
                                   str(image_page.title() +
                                       ' failed. Aborting...'))
                    continue  # move on to the next file

                del success
                img_hash = get_local_hash(path)
                try:
                    corrupt_result = image_is_corrupt(path)
                except UnidentifiedImageError:
                    logger.debug(
                        image_page.title() +
                        " ::: is not an image (or at very least not currently supported by PIL)"
                    )
                    os.remove(path)  # file not an image
                    store_image(image_page.title(),
                                False,
                                img_hash=img_hash,
                                not_image=True)  # store in database
                    # Keep the scan alive: skip this file rather than crash.
                    continue

                if corrupt_result:
                    pwb_wrappers.tag_page(
                        image_page, "{{TSB image identified corrupt|" +
                        datetime.now(timezone.utc).strftime("%m/%d/%Y") +
                        "|day=" + gen_nom_date()[1] + "|month=" +
                        gen_nom_date()[0] + "|year=" + gen_nom_date()[2] +
                        "}}", "Image detected as corrupt, tagging.")
                    store_image(image_page.title(),
                                True,
                                img_hash=img_hash,
                                day_count=30)  # store in database

                    try:  # TODO: Add record to database about successful notification?
                        notify_user(site,
                                    image_page,
                                    EDayCount.DAYS_30,
                                    EJobType.FULL_SCAN,
                                    minor=False)
                    # Was a bare except: — don't swallow KeyboardInterrupt
                    # or SystemExit.
                    except Exception:  # TODO: Add record to database about failed notification?
                        logger.error("ERROR: Could not notify user about " +
                                     str(image_page.title()) +
                                     " being corrupt.")
                else:  # image not corrupt
                    store_image(image_page.title(), False,
                                img_hash=img_hash)  # store in database
                    logger.info(image_page.title() + " :Not corrupt. Stored")

            except Exception:
                # One bad file must not abort the full scan.
                traceback.print_exc()
            finally:
                if os.path.exists(path):
                    os.remove(path)
                count_have_seen += 1
                # Persist progress after every file so a restart can resume.
                with open("./corrupt_have_seen_count.txt", 'w+') as f:
                    f.write('{}'.format(count_have_seen))

        logger.critical("Exit - THIS SHOULD NOT HAPPEN")
    finally:
        # Guard: tmpdir is still None if mkdtemp() itself failed; the
        # original rmtree(None) would mask that exception with a TypeError.
        if tmpdir:
            shutil.rmtree(tmpdir)
예제 #5
0
    def process_file(self):
        """Scan all Commons files (optionally in reverse), tagging corrupt ones.

        Honours ``self.run`` as a stop flag and ``self.skip`` /
        ``self.determine_file_count()`` for resuming; the running count of
        files seen is persisted to ``self.file_count`` after every file so a
        restart can pick up where it left off.
        """
        tmpdir = None
        try:
            tmpdir = tempfile.mkdtemp()
            site = pywikibot.Site(user="******")
            site._throttle = Throttle(site, multiplydelay=False)

            # Multi-workers are enough to cause problems, no need for internal
            # locking to cause even more problems
            site.lock_page = lambda *args, **kwargs: None  # noop
            site.unlock_page = lambda *args, **kwargs: None  # noop

            count_have_seen = self.determine_file_count()
            # ints are immutable; the original copy.deepcopy was unnecessary.
            tmp_count = count_have_seen
            for image_page in pwb_wrappers.allimages(reverse=self.reverse):
                if not self.run:
                    break  # external shutdown request

                if not image_page.exists():
                    self.logger.warning('File page does not exist:: ' +
                                        image_page.title())
                    continue

                if self.skip and tmp_count > 0:
                    tmp_count -= 1
                    self.logger.debug("Skipping check on " +
                                      image_page.title())
                    continue

                # T125: redirect pages have no file content of their own.
                # Was logger.debug(pywikibot.warning(...)) — that used an
                # undefined module-level ``logger`` and logged the None that
                # pywikibot.warning() returns.
                if image_page.isRedirectPage():
                    pywikibot.warning('File page is redirect' +
                                      image_page.title())
                    continue

                if have_seen_image(site, image_page.title()):
                    self.logger.debug("Have seen:: " + image_page.title())
                    count_have_seen += 1
                    continue

                if not allow_bots(image_page.text, "TheSandBot"):
                    self.logger.critical("Not to edit " + image_page.title())
                    continue

                path = os.path.join(tmpdir, str(uuid.uuid1()))
                # Download image
                try:
                    for i in range(
                            8
                    ):  # Attempt to download 8 times. If it fails after this many, move on
                        download_result = self.download_image(image_page, path)
                        if download_result:
                            break
                        # (Original re-tested ``if not download_result`` here,
                        # which is always true after the break above.)
                        self.logger.warning(
                            'Possibly corrupted download on attempt %d' %
                            i)
                        site.throttle(write=True)
                    else:
                        self.logger.warning(
                            'FIXME: Download attempt exhausted')
                        self.logger.warning('FIXME: Download of ' +
                                            str(image_page.title() +
                                                ' failed. Aborting...'))
                        continue  # move on to the next file

                    del download_result
                    img_hash = get_local_hash(path)
                    try:
                        corrupt_result = image_is_corrupt(path)
                    except UnidentifiedImageError:
                        self.logger.debug(
                            image_page.title() +
                            " ::: is not an image (or at very least not currently supported by PIL)"
                        )
                        os.remove(path)  # file not an image
                        store_image(image_page.title(),
                                    False,
                                    img_hash=img_hash,
                                    not_image=True)  # store in database
                        continue  # move onto next file
                    except FileNotFoundError:
                        # Downloaded file vanished before the check; skip it.
                        continue

                    if corrupt_result:
                        pwb_wrappers.tag_page(
                            image_page, "{{TSB image identified corrupt|" +
                            datetime.now(timezone.utc).strftime("%m/%d/%Y") +
                            "|day=" + gen_nom_date()[1] + "|month=" +
                            gen_nom_date()[0] + "|year=" + gen_nom_date()[2] +
                            "}}", "Image detected as corrupt, tagging.")
                        store_image(image_page.title(),
                                    True,
                                    img_hash=img_hash,
                                    day_count=30)  # store in database

                        try:  # TODO: Add record to database about successful notification?
                            notify_user(site,
                                        image_page,
                                        EDayCount.DAYS_30,
                                        EJobType.FULL_SCAN,
                                        minor=False)
                        # Was a bare except: — don't swallow KeyboardInterrupt
                        # or SystemExit.
                        except Exception:  # TODO: Add record to database about failed notification?
                            self.logger.error(
                                "ERROR: Could not notify user about " +
                                str(image_page.title()) + " being corrupt.")
                    else:  # image not corrupt
                        store_image(image_page.title(),
                                    False,
                                    img_hash=img_hash)  # store in database
                        self.logger.info(image_page.title() +
                                         " :Not corrupt. Stored")

                except Exception:
                    # One bad file must not abort the full scan.
                    traceback.print_exc()
                finally:
                    if os.path.exists(path):
                        os.remove(path)
                    count_have_seen += 1
                    # Persist progress after every file so a restart can resume.
                    with open(self.file_count, 'w+') as f:
                        f.write('{}'.format(count_have_seen))
            if self.run:
                self.logger.critical("Exit - THIS SHOULD NOT HAPPEN")
            else:
                self.logger.critical("Exit - SHUTTING DOWN")
        finally:
            # Guard: tmpdir is still None if mkdtemp() itself failed; the
            # original rmtree(None) would mask that exception with a TypeError.
            if tmpdir:
                shutil.rmtree(tmpdir)