def throttle(self):
    """Return this Site's throttle. Initialize a new one if needed."""
    if not hasattr(self, '_throttle'):
        self._throttle = Throttle(self, multiplydelay=True)
    return self._throttle
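# For orientation: the property above mirrors pywikibot's own Site.throttle.
# Because it lazily creates self._throttle, the scripts below can pre-seed
# that attribute with a non-multiplying Throttle, and because Throttle
# instances are callable, site.throttle(write=True) in the retry loops
# blocks until a write slot is free. A minimal illustration (assuming a
# configured pywikibot install; not part of this repo):

import pywikibot
from pywikibot.throttle import Throttle

site = pywikibot.Site('commons', 'commons')
# Pre-seed the attribute that the property checks with hasattr(),
# disabling the per-worker delay multiplication.
site._throttle = Throttle(site, multiplydelay=False)
site.throttle(write=True)  # wait for a write slot via Throttle.__call__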
import json
import pickle
import re
from datetime import date

import mysql.connector
import pywikibot
from pywikibot.data.api import APIError
from pywikibot.throttle import Throttle
from redis import Redis

from common import REDIS_KEY

site = pywikibot.Site('commons', 'commons')
site._throttle = Throttle(site, multiplydelay=False)
# Multi-workers are enough to cause problems, no need for internal
# locking to cause even more problems
site.lock_page = lambda *args, **kwargs: None  # noop
site.unlock_page = lambda *args, **kwargs: None  # noop

# search_generator = site.search('"El Paso Daily Times. (El Paso, Tex.), Vol"')
# igen = iter(search_generator)
# counter = 0

mydb = mysql.connector.connect(host="localhost",
                               user="******",
                               password="******",
                               database="commons_task3_run")
mycursor = mydb.cursor()
# Use an unquoted %s placeholder: mysql.connector quotes and escapes the
# value itself when the query is executed with a parameter tuple.
search_sql = "SELECT title FROM runs WHERE title = %s"
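# A minimal usage sketch for the query above (the title value is
# hypothetical): passing parameters to cursor.execute lets mysql.connector
# do the quoting and escaping, instead of formatting them into the SQL
# string by hand.

title = "File:Example.jpg"  # illustrative value, not from this repo
mycursor.execute(search_sql, (title,))
already_run = mycursor.fetchone() is not None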
import os
import pickle
import shutil
import tempfile
import traceback
import uuid

import pywikibot
from PIL import UnidentifiedImageError
from pywikibot.throttle import Throttle
from redis import Redis

from common import REDIS_KEY

# Project helpers (allow_bots, image_is_corrupt, store_image,
# handle_result) are provided elsewhere in this repo.


def run_worker():
    global logger
    tmpdir = None  # Gets rid of IDE complaint/warning about access before assignment
    try:
        tmpdir = tempfile.mkdtemp()
        site = pywikibot.Site(user="******")
        site._throttle = Throttle(site, multiplydelay=False)
        # Multi-workers are enough to cause problems, no need for internal
        # locking to cause even more problems
        site.lock_page = lambda *args, **kwargs: None  # noop
        site.unlock_page = lambda *args, **kwargs: None  # noop
        redis = Redis(host="localhost")
        while True:
            _, picklemsg = redis.blpop(REDIS_KEY)
            # Need to unpickle and build the object once more - T99
            change = pickle.loads(picklemsg)
            file_page = pywikibot.FilePage(site, change.title)
            logger.info(change.title)
            if not allow_bots(file_page.text, "TheSandBot"):
                logger.critical("Not allowed to edit " + file_page.title())
                continue
            if not file_page.exists():
                logger.warning('File page does not exist: ' + change.title)
                continue
            for i in range(8):
                try:
                    file_page.get_file_history()
                except pywikibot.exceptions.PageRelatedError as e:
                    # pywikibot.exceptions.PageRelatedError:
                    # loadimageinfo: Query on ... returned no imageinfo
                    pywikibot.exception(e)
                    last_error = e
                    site.throttle(write=True)
                else:
                    break
            else:
                # All eight attempts failed: give up on this worker.
                raise last_error
            # try:
            #     revision = file_page.get_file_history()[
            #         pywikibot.Timestamp.fromtimestampformat(
            #             change['log_params']['img_timestamp'])]
            # except KeyError:
            #     try:
            #         # From rcbacklog
            #         revision = file_page.get_file_history()[
            #             pywikibot.Timestamp.fromISOformat(
            #                 change['params']['img_timestamp'])]
            #     except KeyError:
            #         try:
            #             revision = file_page.get_file_history()[
            #                 pywikibot.Timestamp.fromtimestamp(
            #                     change['timestamp'])]
            #         except KeyError:
            #             revision = file_page.latest_file_info
            #             pywikibot.warning(
            #                 'Cannot fetch specified revision, falling back '
            #                 'to latest revision.')
            revision = change.getRevision(file_page)
            pywikibot.output('Working on: %s at %s'
                             % (change.title, revision.timestamp))
            path = os.path.join(tmpdir, str(uuid.uuid1()))
            # Download image
            try:
                # Attempt to download 8 times. If it fails after this many,
                # move on
                for i in range(8):
                    try:
                        # returns download success result (True or False)
                        success = file_page.download(path, revision=revision)
                    except Exception as e:
                        pywikibot.exception(e)
                        success = False
                    if success:
                        # if we have a success, no point continuing to try
                        # and download
                        break
                    pywikibot.warning(
                        'Possibly corrupted download on attempt %d' % i)
                    site.throttle(write=True)
                else:
                    pywikibot.warning('FIXME: Download attempt exhausted')
                    pywikibot.warning('FIXME: Download of %s failed.'
                                      ' Aborting...' % file_page.title())
                    continue  # move on to the next file
                del success
                try:
                    corrupt_result = image_is_corrupt(path)
                except UnidentifiedImageError:
                    logger.debug(change.title + " ::: is not an image (or at"
                                 " very least not currently supported by PIL)")
                    os.remove(path)  # file not an image
                    store_image(change.title, False, img_hash=change.hash,
                                not_image=True)  # store in database
                    # Previously the idea was to just raise the error,
                    # but since this is a constant running loop, just move on
                    # to the next file (once local removed)
                    continue
                if corrupt_result:
                    handle_result(site, file_page, change, logger)
                    # nom_date = str(get_next_month(7)).split('/')
                    # pwb_wrappers.tag_page(
                    #     file_page,
                    #     "{{TSB image identified corrupt|"
                    #     + datetime.now(timezone.utc).strftime("%m/%d/%Y")
                    #     + "|day=" + nom_date[1] + "|month=" + nom_date[0]
                    #     + "|year=" + nom_date[2] + "}}",
                    #     "Image detected as corrupt, tagging.")
                    # store_image(file_page.title(), True,
                    #             img_hash=change.hash,
                    #             day_count=7)  # store in database
                    # logger.info("Saved page and logged in database")
                    # number_saved += 1  # FIXME: This MUST be removed once
                    #                    # trials done and approved
                    # # Notify the user that the file needs updating
                    # try:  # TODO: Add record to database about successful
                    #       # notification?
                    #     notify_user(site, file_page, EDayCount.DAYS_7,
                    #                 EJobType.MONITOR, minor=False)
                    # except Exception:  # TODO: Add record to database
                    #                    # about failed notification?
                    #     logger.error("ERROR: Could not notify user about "
                    #                  + file_page.title()
                    #                  + " being corrupt.")
                else:  # image not corrupt
                    store_image(file_page.title(), False,
                                img_hash=change.hash)  # store in database
                    logger.info(file_page.title() + " :Not corrupt. Stored")
            except Exception:
                traceback.print_exc()
            finally:
                if os.path.exists(path):
                    os.remove(path)
        pywikibot.output("Exit - THIS SHOULD NOT HAPPEN")
    finally:
        shutil.rmtree(tmpdir)
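# For context, a hypothetical sketch of the producer side that run_worker()
# pairs with. It assumes a change object carrying .title, .hash and a
# getRevision() helper (defined elsewhere in this repo, see T99); the
# function name here is illustrative.

import pickle

from redis import Redis

from common import REDIS_KEY


def enqueue_change(change):
    """Hand one change object to the worker pool (hypothetical helper)."""
    redis = Redis(host="localhost")
    # run_worker() pops from the head with blpop, so rpush keeps FIFO order
    redis.rpush(REDIS_KEY, pickle.dumps(change))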
def process_file2():
    global logger
    global skip  # T111
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp()
        site = pywikibot.Site(user="******")
        site._throttle = Throttle(site, multiplydelay=False)
        # Multi-workers are enough to cause problems, no need for internal
        # locking to cause even more problems
        site.lock_page = lambda *args, **kwargs: None  # noop
        site.unlock_page = lambda *args, **kwargs: None  # noop
        if (os.path.exists("./corrupt_have_seen_count.txt")
                and not file_is_empty("./corrupt_have_seen_count.txt")):
            with open("./corrupt_have_seen_count.txt", 'r') as f:
                try:
                    count_have_seen = int(f.readline())
                except (TypeError, ValueError):
                    logger.critical(
                        "Cannot cast string to int. Check"
                        " corrupt_have_seen_count.txt format.")
                    raise
        else:
            count_have_seen = 0
        tmp_count = copy.deepcopy(count_have_seen)
        for image_page in pwb_wrappers.allimages():
            if skip and tmp_count > 0:
                tmp_count -= 1
                logger.debug("Skipping check on " + image_page.title())
                continue
            if have_seen_image(site, image_page.title()):
                logger.debug("Have seen:: " + image_page.title())
                count_have_seen += 1
                continue
            if not allow_bots(image_page.text, "TheSandBot"):
                logger.critical("Not allowed to edit " + image_page.title())
                continue
            # with open("./corrupt_have_seen_count.txt", 'w+') as f:
            #     f.write('{}'.format(count_have_seen))
            if not image_page.exists():
                logger.warning('File page does not exist:: '
                               + image_page.title())
                continue
            for i in range(8):
                try:
                    image_page.get_file_history()
                except pywikibot.exceptions.PageRelatedError as e:
                    # pywikibot.exceptions.PageRelatedError:
                    # loadimageinfo: Query on ... returned no imageinfo
                    pywikibot.exception(e)
                    last_error = e
                    site.throttle(write=True)
                else:
                    break
            else:
                # All eight attempts failed: give up on the scan.
                raise last_error
            path = os.path.join(tmpdir, str(uuid.uuid1()))
            revision = image_page.latest_file_info
            # Download image
            try:
                # Attempt to download 8 times. If it fails after this many,
                # move on
                for i in range(8):
                    try:
                        # returns download success result (True or False)
                        success = image_page.download(path, revision=revision)
                    except Exception as e:
                        logger.exception(e)
                        success = False
                    if success:
                        # if we have a success, no point continuing to try
                        # and download
                        break
                    logger.warning(
                        'Possibly corrupted download on attempt %d' % i)
                    site.throttle(write=True)
                else:
                    logger.warning('FIXME: Download attempt exhausted')
                    logger.warning('FIXME: Download of %s failed. Aborting...'
                                   % image_page.title())
                    continue  # move on to the next file
                del success
                img_hash = get_local_hash(path)
                try:
                    corrupt_result = image_is_corrupt(path)
                except UnidentifiedImageError:
                    logger.debug(image_page.title()
                                 + " ::: is not an image (or at very least"
                                   " not currently supported by PIL)")
                    os.remove(path)  # file not an image
                    store_image(image_page.title(), False, img_hash=img_hash,
                                not_image=True)  # store in database
                    # Previously the idea was to just raise the error,
                    # but since this is a constant running loop, just move on
                    # to the next file (once local removed)
                    continue
                if corrupt_result:
                    nom_date = gen_nom_date()
                    pwb_wrappers.tag_page(
                        image_page,
                        "{{TSB image identified corrupt|"
                        + datetime.now(timezone.utc).strftime("%m/%d/%Y")
                        + "|day=" + nom_date[1] + "|month=" + nom_date[0]
                        + "|year=" + nom_date[2] + "}}",
                        "Image detected as corrupt, tagging.")
                    store_image(image_page.title(), True, img_hash=img_hash,
                                day_count=30)  # store in database
                    try:
                        # TODO: Add record to database about successful
                        # notification?
                        notify_user(site, image_page, EDayCount.DAYS_30,
                                    EJobType.FULL_SCAN, minor=False)
                    except Exception:
                        # TODO: Add record to database about failed
                        # notification?
                        logger.error("ERROR: Could not notify user about "
                                     + image_page.title()
                                     + " being corrupt.")
                else:  # image not corrupt
                    # store_image(file_page.title(), False,
                    #             img_hash=img_hash)  # store in database
                    store_image(image_page.title(), False,
                                img_hash=img_hash)  # store in database
                    logger.info(image_page.title() + " :Not corrupt. Stored")
            except Exception:
                traceback.print_exc()
            finally:
                if os.path.exists(path):
                    os.remove(path)
                count_have_seen += 1
                with open("./corrupt_have_seen_count.txt", 'w+') as f:
                    f.write('{}'.format(count_have_seen))
        logger.critical("Exit - THIS SHOULD NOT HAPPEN")
    finally:
        shutil.rmtree(tmpdir)
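# allow_bots() is called before every edit above but defined elsewhere.
# A sketch of the conventional {{bots}}/{{nobots}} exclusion-compliance
# check it presumably implements (the regex variant circulated on
# mediawiki.org); the real helper in this repo may differ.

import re


def allow_bots(text, user):
    """Return False if the page text opts out of edits by ``user``."""
    user = user.lower().strip()
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)  # drop comments
    return not re.search(
        r'\{\{(nobots|bots\|(allow=none|deny=(?!none).*?'
        + re.escape(user) + r'.*?|optout=all|deny=all))\}\}',
        text, flags=re.IGNORECASE)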
def process_file(self):
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp()
        site = pywikibot.Site(user="******")
        site._throttle = Throttle(site, multiplydelay=False)
        # Multi-workers are enough to cause problems, no need for internal
        # locking to cause even more problems
        site.lock_page = lambda *args, **kwargs: None  # noop
        site.unlock_page = lambda *args, **kwargs: None  # noop
        count_have_seen = self.determine_file_count()
        tmp_count = copy.deepcopy(count_have_seen)
        for image_page in pwb_wrappers.allimages(reverse=self.reverse):
            if not self.run:
                break
            if not image_page.exists():
                self.logger.warning('File page does not exist:: '
                                    + image_page.title())
                continue
            if self.skip and tmp_count > 0:
                tmp_count -= 1
                self.logger.debug("Skipping check on " + image_page.title())
                continue
            # T125
            if image_page.isRedirectPage():
                self.logger.debug('File page is a redirect: '
                                  + image_page.title())
                continue
            if have_seen_image(site, image_page.title()):
                self.logger.debug("Have seen:: " + image_page.title())
                count_have_seen += 1
                continue
            if not allow_bots(image_page.text, "TheSandBot"):
                self.logger.critical("Not allowed to edit "
                                     + image_page.title())
                continue
            # for i in range(8):
            #     try:
            #         image_page.get_file_history()
            #     except pywikibot.exceptions.PageRelatedError as e:
            #         # pywikibot.exceptions.PageRelatedError:
            #         # loadimageinfo: Query on ... returned no imageinfo
            #         pywikibot.exception(e)
            #         site.throttle(write=True)
            #     else:
            #         break
            # else:
            #     raise
            path = os.path.join(tmpdir, str(uuid.uuid1()))
            # revision = image_page.latest_file_info
            # Download image
            try:
                # Attempt to download 8 times. If it fails after this many,
                # move on
                for i in range(8):
                    download_result = self.download_image(image_page, path)
                    if download_result:
                        break
                    self.logger.warning(
                        'Possibly corrupted download on attempt %d' % i)
                    site.throttle(write=True)
                else:
                    self.logger.warning('FIXME: Download attempt exhausted')
                    self.logger.warning(
                        'FIXME: Download of %s failed. Aborting...'
                        % image_page.title())
                    continue  # move on to the next file
                del download_result
                img_hash = get_local_hash(path)
                try:
                    corrupt_result = image_is_corrupt(path)
                except UnidentifiedImageError:
                    self.logger.debug(
                        image_page.title()
                        + " ::: is not an image (or at very least not"
                          " currently supported by PIL)")
                    os.remove(path)  # file not an image
                    store_image(image_page.title(), False, img_hash=img_hash,
                                not_image=True)  # store in database
                    continue  # move on to the next file
                except FileNotFoundError:
                    continue
                if corrupt_result:
                    nom_date = gen_nom_date()
                    pwb_wrappers.tag_page(
                        image_page,
                        "{{TSB image identified corrupt|"
                        + datetime.now(timezone.utc).strftime("%m/%d/%Y")
                        + "|day=" + nom_date[1] + "|month=" + nom_date[0]
                        + "|year=" + nom_date[2] + "}}",
                        "Image detected as corrupt, tagging.")
                    store_image(image_page.title(), True, img_hash=img_hash,
                                day_count=30)  # store in database
                    try:
                        # TODO: Add record to database about successful
                        # notification?
                        notify_user(site, image_page, EDayCount.DAYS_30,
                                    EJobType.FULL_SCAN, minor=False)
                    except Exception:
                        # TODO: Add record to database about failed
                        # notification?
                        self.logger.error(
                            "ERROR: Could not notify user about "
                            + image_page.title() + " being corrupt.")
                else:  # image not corrupt
                    # store_image(file_page.title(), False,
                    #             img_hash=img_hash)  # store in database
                    store_image(image_page.title(), False,
                                img_hash=img_hash)  # store in database
                    self.logger.info(image_page.title()
                                     + " :Not corrupt. Stored")
            except Exception:
                traceback.print_exc()
            finally:
                if os.path.exists(path):
                    os.remove(path)
                count_have_seen += 1
                with open(self.file_count, 'w+') as f:
                    f.write('{}'.format(count_have_seen))
        if self.run:
            self.logger.critical("Exit - THIS SHOULD NOT HAPPEN")
        else:
            self.logger.critical("Exit - SHUTTING DOWN")
    finally:
        shutil.rmtree(tmpdir)
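# image_is_corrupt() drives all three loops but is defined elsewhere.
# A minimal sketch of one plausible implementation, using Pillow's
# verify-then-decode pattern; the real helper may use different
# heuristics. UnidentifiedImageError (an OSError subclass) is re-raised
# so callers can treat "not an image" separately from "corrupt image".

from PIL import Image, UnidentifiedImageError


def image_is_corrupt(path):
    """Return True if the file decodes as a broken or truncated image."""
    try:
        with Image.open(path) as im:  # raises UnidentifiedImageError
            im.verify()               # for non-image files
        with Image.open(path) as im:  # verify() exhausts the object, so
            im.load()                 # reopen for the full decode
    except UnidentifiedImageError:
        raise  # callers handle "not an image" as its own case
    except (OSError, SyntaxError):
        return True  # structural damage or truncated data
    return False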