def _firstGroup(pattern, source):
    """Return the first captured group of ``pattern`` in ``source``, or None.

    None is returned both when the pattern does not match and when
    ``source`` is not something ``re.search`` can scan (for example None).
    """
    try:
        match = re.search(pattern, source)
    except TypeError:
        return None
    return match.group(1) if match else None


def generateLinks(listOfSiteObjects, tag):
    """Build the list of result-page URLs for ``tag`` on every given site.

    For each site object the number of matching images is determined first,
    then one URL per result page is generated (pages are numbered from 1).

    * ``queryType == 'JSON'``: the regular listing page is fetched to find
      the number of its last page, then that last page is fetched and its
      ``Post.register({`` occurrences are counted. Pages are assumed to
      hold 20 images each.
    * ``queryType == 'XML'``: a ``&limit=1`` query is fetched and the total
      is read from the ``count`` attribute of the ``<posts>`` element.
      Pages are assumed to hold 100 images each.

    Sites with any other ``queryType``, and sites whose response does not
    contain the expected marker, are skipped.

    Args:
        listOfSiteObjects: iterable of objects exposing ``queryType``,
            ``siteRoot``, ``siteQuery`` and ``pageFlag``.
        tag: tag to search for; inserted verbatim into the URLs.

    Returns:
        A tuple ``(pageLinkList, totalNumberOfImages)``.
    """
    jsonImagesPerPage = 20
    xmlImagesPerPage = 100
    # The tag is matched literally: tags routinely contain regex
    # metacharacters such as parentheses.
    lastPagePattern = (
        r'<link href="/post\?page=(\d+)&tags=%s" rel="last" title="Last Page"'
        % re.escape(tag)
    )

    pageLinkList = []
    totalNumberOfImages = 0
    for obj in listOfSiteObjects:
        if obj.queryType == 'JSON':
            # Retrieve source of the regular page to learn the last page number
            listingUrl = obj.siteRoot + '?tags=%s' % tag
            lastPage = _firstGroup(
                lastPagePattern, sourceRequester.getSource(listingUrl))
            if lastPage is None:
                continue
            lastPageSource = sourceRequester.getSource(
                listingUrl + obj.pageFlag + lastPage)
            # Every page before the last one is full; count the last one.
            numberOfImages = (
                jsonImagesPerPage * (int(lastPage) - 1)
                + len(re.findall(r'Post\.register\(\{', lastPageSource))
            )
            imagesPerPage = jsonImagesPerPage
        elif obj.queryType == 'XML':
            countSource = sourceRequester.getSource(
                obj.siteRoot + obj.siteQuery + tag + '&limit=1')
            count = _firstGroup(
                r'<posts count="(\d+)" offset="0">', countSource)
            if count is None:
                continue
            numberOfImages = int(count)
            imagesPerPage = xmlImagesPerPage
        else:
            continue

        totalNumberOfImages += numberOfImages
        # Ceiling division with integer arithmetic only, so range() gets an
        # int on Python 3 and no empty trailing page is requested when the
        # count is an exact multiple of the page size.
        pageCount = -(-numberOfImages // imagesPerPage)
        pagePrefix = obj.siteRoot + obj.siteQuery + tag + obj.pageFlag
        pageLinkList.extend(
            pagePrefix + str(pageNumber)
            for pageNumber in range(1, pageCount + 1)
        )
    return (pageLinkList, totalNumberOfImages)
def work(appObject):
    """Gather image links for the configured sites/tags, then download them.

    Steps:
      1. Disable the UI via ``appObject.enabler('DISABLED')``.
      2. Build the list of result pages with ``linkGenerator.generateLinks``.
      3. Fetch and parse every page, keeping the first link seen for each
         key, and report progress through ``printToLabel``. If
         ``appObject.is_running`` becomes false, stop and return early.
      4. Start ``appObject.num_threads`` threads running ``downloadWorker``
         on one shared list of links and wait for all of them.
      5. Re-enable the UI and report how many links are left in the list.

    The UI is re-enabled even when a step raises, so an error (for example
    a failed page request) does not leave the application stuck disabled;
    the exception itself still propagates to the caller.

    Args:
        appObject: application object providing ``enabler``, ``cacheSites``,
            ``cacheTags``, ``is_running`` and ``num_threads``.

    Returns:
        None.
    """
    appObject.enabler('DISABLED')
    try:
        printToLabel(appObject, 'Gathering links to all the pages...')
        objectList = booruInitializer.initialize(appObject.cacheSites)
        pageLinks = linkGenerator.generateLinks(objectList, appObject.cacheTags)
        pageLinkList = pageLinks[0]

        # gathering links to the images we want
        imageLinkDictionary = {}
        visited = 0
        for page in pageLinkList:
            if not appObject.is_running:
                # cancel button was hit
                printToLabel(appObject, "Link gathering cancelled")
                return None
            pageSource = sourceRequester.getSource(page)
            currentPageDictionary = sourceParser.parse(pageSource)
            for key, link in currentPageDictionary.items():
                # keep the first link seen for a key; later duplicates are ignored
                imageLinkDictionary.setdefault(key, link)
            visited += 1
            printToLabel(
                appObject,
                'Visited %d out of %d pages so far.\nGot %d links to unique images so far.'
                % (visited, len(pageLinkList), len(imageLinkDictionary)),
                freeEndLine=0,
            )

        # download images
        # One list object is handed to every worker; presumably the workers
        # remove links from it as they go (verify against downloadWorker).
        llink = list(imageLinkDictionary.values())
        l_thread = []
        # spawn multiple threads
        for _ in range(appObject.num_threads):
            dl_thread = threading.Thread(
                target=downloadWorker, args=(appObject, llink))
            l_thread.append(dl_thread)
            dl_thread.start()
        # wait for all threads to finish
        for dl_thread in l_thread:
            dl_thread.join()
    finally:
        # Runs on normal completion, on cancellation and on error.
        appObject.enabler('NORMAL')
    printToLabel(appObject, "Downloading terminated, %d links remaining" % (len(llink)))