def run_thread(thread):
    """Crawl a single forum thread and tally author floors and images.

    args:
        thread: thread record exposing get_href(), get_crawled_status(),
            get_image_count() and mark_as_crawled().

    returns:
        dict with "author_floor_count" and "image_count" totals; on any
        failure the thread is marked crawled with -1 images and the
        (partial) totals are returned.
    """
    general_result = {
        "author_floor_count": 0,
        "image_count": 0,
    }
    if thread.get_crawled_status():
        print("Thread already crawled, passing")
        general_result["image_count"] = thread.get_image_count()
        return general_result

    thread_url = MAIN_URL + thread.get_href()
    ff = Selenium.get_instance()
    retry_count = 0
    # Attempt to get results; give up after 5 tries.
    while True:
        try:
            retry_count += 1
            soup = ff.crawl(thread_url, 20, AUTHOR_FLAG)
            break
        except KeyboardInterrupt:
            raise
        except Exception:  # narrowed from a bare except; Ctrl-C still escapes
            if retry_count < 5:
                print("RETRYING for {} time".format(retry_count))
                continue
            print("Can't access thread: {}".format(thread.get_href()))
            _mark_thread_failed(thread)
            return general_result

    # Attempt to read results
    try:
        # Sanity probe: IndexError here means the main block is absent,
        # which routes us into the failure branch below.
        soup.select(MAIN_BLOCK_SELECTOR)[0]
        postlist = soup.select(FLOOR_SELECTOR)
        for post in postlist:
            post_result = run_post(post, thread)
            general_result["author_floor_count"] += int(post_result["authfl"])
            general_result["image_count"] += post_result["images"]
    except KeyboardInterrupt:
        raise
    except Exception:
        print("Cannot parse soup: {}".format(thread.get_href()))
        _mark_thread_failed(thread)
        return general_result

    dao = create.get_dao()
    crawled_thread = thread.mark_as_crawled(general_result["image_count"])
    dao.update_item(crawled_thread)
    return general_result


def _mark_thread_failed(thread):
    # Record the thread as crawled with -1 images so it is not retried.
    dao = create.get_dao()
    crawled_thread = thread.mark_as_crawled(-1)
    dao.update_item(crawled_thread)
def judge_author_id():
    """Store or update an author judgement from the request JSON payload.

    Requires every field in mandatory_fields; l_judge/x_judge must be
    strict ints in [0, 10]. Returns a jsonify'd success or failure result.
    """
    request_data = request.get_json()
    dao = create.get_dao()
    mandatory_fields = ["author", "author_id", "update_count", "thread_count",
                        "image_count", "l_judge", "x_judge", "avg_replies"]
    if not all(f in request_data for f in mandatory_fields):
        # The old message named only author/author_id, which was misleading.
        return jsonify(util.fail_with_reason(
            "Need to pass in all of: {}".format(", ".join(mandatory_fields))))
    author = request_data["author"]
    author_id = request_data["author_id"]
    thread_count = request_data["thread_count"]
    image_count = request_data["image_count"]
    update_count = request_data["update_count"]
    avg_replies = request_data["avg_replies"]
    l_judge = request_data["l_judge"]
    x_judge = request_data["x_judge"]

    # Sanity check (strict int: bools and floats are rejected, as before).
    if not (type(l_judge) is int and type(x_judge) is int):
        return jsonify(util.fail_with_reason(
            "Judgements need to be delivered in integers"))
    if l_judge > 10 or x_judge > 10:
        return jsonify(util.fail_with_reason("Judgements should not exceed 10"))
    if l_judge < 0 or x_judge < 0:
        return jsonify(util.fail_with_reason(
            "Judgement should not be less than 0"))

    # Lookup hoisted after validation so invalid input costs no DB call.
    existing_judgement = dao.find_item(Judgement(author_id=author_id))
    if existing_judgement:
        modified_judgement = existing_judgement.update(
            thread_count, image_count, avg_replies,
            update_count, l_judge, x_judge)
        dao.update_item(modified_judgement)
    else:
        new_judgement = Judgement(author_id=author_id,
                                  author=author,
                                  l_judge=l_judge,
                                  x_judge=x_judge,
                                  thread_count=thread_count,
                                  image_count=image_count,
                                  avg_replies=avg_replies,
                                  update_count=update_count)
        dao.insert_item(new_judgement)
    return jsonify(util.SUCCESS_RESULT)
def sample(author):
    """Open one random image by this author in a visible browser and
    print the author's id."""
    dao = create.get_dao()
    candidates = dao.get_items(ForumImage, {"author": author})
    picked = random.choice(candidates)
    browser = Selenium.get_instance(False)
    browser.crawl(picked.get_src(), manual_wait=10)
    print("AUTHOR_ID: {}".format(picked.get_author_id()))
def get_random_images(limit=500, amount=5, threshold=10):
    """ Deposit info of images found in threads of authors given that they
    passed the scoring threshold.

    args:
        limit: int - only look in the top $limit authors
        amount: int - only take ingestion from $amount authors
        threshold: float - only clear an author for ingestion after they
            score higher than $threshold
    """
    if type(limit) is not int:
        limit = int(limit)
    if type(threshold) is not float:
        threshold = float(threshold)
    if type(amount) is not int:
        amount = int(amount)
    dao = create.get_dao()
    top_authors = dao.search_table("threads", {},
                                   group_by=["author_id", "author"],
                                   limit=limit)
    random.shuffle(top_authors)  # Add randomness
    ingested = 0
    for candidate in top_authors:
        spoils = get_author_aggregate_stat(dao, candidate["author_id"], False)
        score = score_agg_stats(spoils["data"])
        if score["total"] <= threshold:
            continue
        print("RUNNING INGEST FOR AUTHOR: {}".format(candidate["author"]))
        print(spoils["data"])
        summary = run_threads(spoils["threads"])
        if summary["author_floor_count"] > 0:
            print("NEW RESULTS FOR {}: {}".format(candidate["author"], summary))
            ingested += 1
            if ingested >= amount:
                break
def score_id(author_id, vis=True):
    """ Determine the score of an author based on his/her thread popularity
    and picture count

    args:
        author_id: str - id of the author
        vis: bool - print the result to stdout

    returns:
        float score; float("-inf") when the author has no threads at all
        (previously this raised ZeroDivisionError on the popularity average).
    """
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    images = dao.get_items(ForumImage, {"author_id": author_id})
    if not threads:
        # An author without threads cannot score above any threshold.
        if vis:
            print("No threads found for author {}".format(author_id))
        return float("-inf")
    update_counts = sum(t.get_update_count() for t in threads)
    replies = sum(t.get_replies() for t in threads)
    avg_popularity = replies / len(threads)
    score = math.log(len(images) + 1) + (math.log(avg_popularity + 1) - 2) * 2
    # Update activity only contributes once it clears a minimum of 10.
    score += 0 if (update_counts < 10) else math.log(update_counts + 1) - 2
    # Penalties for thin evidence / low engagement.
    if len(threads) < 3 and avg_popularity < 150:
        score -= 5
    if len(images) < 20:
        score -= 5
    if avg_popularity < 15:
        score -= 5
    if vis:
        print("AVG_POPULARITY: {}, ASSET_COUNT: {}".format(
            avg_popularity, len(images)))
        print("SCORE: {}".format(score))
    return score
def load_random_images(limit=1, threshold=10):
    """ Load images from random author who score pass threshold

    args:
        limit: int - number of authors to load from
        threshold: float - score threshold
    """
    if type(limit) is not int:
        limit = int(limit)
    if type(threshold) is not float:
        threshold = float(threshold)
    dao = create.get_dao()
    candidates = dao.search_table("images", {},
                                  group_by=["author", "author_id"])
    random.shuffle(candidates)
    loaded_authors = 0
    for entry in candidates:
        if score_id(entry["author_id"], vis=False) <= threshold:
            continue
        if entry["author"] in BLACKLIST:
            print("AUTHOR {} IS IN BLACKLIST, SKIPPING".format(entry["author"]))
            continue
        loaded = load_id_images(entry["author_id"], entry["author"],
                                verbose=False)
        if loaded > 0:
            loaded_authors += 1
        if loaded_authors >= limit:
            break
def get_author_from_id(author_id):
    """Return the distinct author display names seen on this id's threads."""
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    return list({t.get_author() for t in threads})
def get_threads(limit=500, top=False):
    """ Get a list of thread and its info from the forum.

    args:
        limit: int - the amount of newest thread to ingest based on forum
            ranking
        top: bool - when truthy (or the string "true"), ingest top-ranked
            threads instead of the newest ones
    """
    if type(limit) is not int:
        limit = int(limit)
    if type(top) is not bool:
        top = top == "true"
    if top:
        print("getting top threads instead")
    dao = create.get_dao()
    create.create_tables()
    for batch in runthreads(limit, top):
        # Per-batch tally of thread_helper outcomes.
        tally = {
            "NEW_ITEM": 0,
            "NOT_UPDATED": 0,
            "UPDATED": 0,
            "CANNOT_UPDATE": 0,
            "SHIT": 0,
        }
        for item in batch:
            outcome = thread_helper(dao, item)
            tally[outcome] = tally[outcome] + 1
        print("Batch record: {}".format(tally))
    print("Total of {} threads stored".format(dao.get_row_count("threads")))
def handle_request(data):
    """Pick a random author whose judgements fall near the requested values.

    The l/x search windows widen step by step until more than 40 rows match,
    or a probabilistic early exit (chance = count/40) accepts a smaller pool.
    """
    dao = create.get_dao()
    l_arg = int(data['l_arg'])
    x_arg = int(data['x_arg'])
    spread = 0
    candidates = []
    while True:
        # Each window edge grows by one every 4 spread steps, staggered.
        window = SearchDict()
        window.add_between('l_judge',
                           l_arg - (spread + 1) // 4,
                           l_arg + (spread + 2) // 4)
        window.add_between('x_judge',
                           x_arg - spread // 4,
                           x_arg + (spread + 3) // 4)
        candidates = dao.search_table(TABLE_NAME, window)
        print("Found {} matching".format(len(candidates)))
        count = len(candidates)
        if count > 40:
            break
        if random.random() < count / 40:
            break
        spread += 1

    chosen = random.choice(candidates)
    reply = {
        'author': chosen['author'],
        'author_id': chosen['author_id'],
        'l_judge': chosen['l_judge'],
        'x_judge': chosen['x_judge'],
    }
    reply.update(util.SUCCESS_RESULT)
    return jsonify(reply)
def induct_history(author_id):
    """Interactively convert stray image files on disk into historic records.

    Looks in the directory of the author's crawled images for .jpg files
    whose uuid is not tracked by any crawled, non-duplicate DB record, and
    after a y/n prompt hands them to add_historic_thread.
    """
    dao = create.get_dao()
    # Find all inductees.
    images = dao.get_items(ForumImage, {"author_id": author_id})

    # Move forward only if every image agrees on the author name.
    names = set()
    for image in images:
        names.add(image.get_author())
    if len(names) > 1:
        print("Conflicting author information, exiting")
        return
    author = names.pop()

    crawled = [im for im in images
               if im.get_crawled_status() and not im.is_duplicate()]
    known_uuids = {im.get_uuid() for im in crawled}
    if not crawled:
        print("No crawled images belong to this author")
        return

    ops_dir = os.path.dirname(crawled[0].get_file_path())
    historics = [f for f in os.listdir(ops_dir)
                 if f.endswith("jpg") and f.split(".jpg")[0] not in known_uuids]
    print("Found {} historic files to induct".format(len(historics)))
    if not historics:
        return
    for name in historics:
        print("FILEPATH: {}".format(os.path.join(ops_dir, name)))
    if not str(input("Convert? y/n")) == "y":
        print("Not converting history today")
        return
    full_paths = [os.path.join(ops_dir, name) for name in historics]
    add_historic_thread(dao, author, author_id, full_paths)
def get_auth_information(author_id):
    """Assemble the author info block plus a random image block."""
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    images = dao.get_items(ForumImage, {"author_id": author_id})
    util.populate_auth_image(images)
    payload = {"auth_info": auth_info_block(threads, images)}
    payload.update(random_image_block(images))
    return payload
def get_author_id(author, vis=True):
    """Look up an author id by display name.

    Returns the id from the first matching thread, or the sentinel string
    "NO_SUCH_AUTHOR" when no thread matches.
    """
    dao = create.get_dao()
    matches = dao.get_items(ForumThread, {"author": author})
    if not matches:
        return "NO_SUCH_AUTHOR"
    found_id = matches[0].get_author_id()
    if vis:
        print("AUTHOR_ID: {}".format(found_id))
    return found_id
def get_id_images(author_id, vis=True):
    """ Get pictures info from an author

    args:
        author_id: str - id of author
        vis: bool - print the aggregate score to stdout
    """
    dao = create.get_dao()
    spoils = get_author_aggregate_stat(dao, author_id, True)
    # Score is computed unconditionally (as before); vis only gates printing.
    score = score_agg_stats(spoils["data"])
    if vis:
        print(score)
    outcome = run_threads(spoils["threads"])
    print(outcome)
def recon_id(author_id, remove_duplicate=False, remove_deleted=False):
    """Reconcile DB image state with the files on disk for one author.

    Detects missing files, files identical to the known failure payload,
    and unreadable images; flips their DB state accordingly, then runs
    duplicate reconciliation.

    args:
        author_id: str - id of the author
        remove_duplicate: bool - also delete redundant duplicate files
        remove_deleted: bool - mark missing/failed images as duplicates
            instead of merely uncrawled

    returns:
        dict of counters describing what was found.
    """
    # Coerce truthy values explicitly (was a no-op `if x: x = True`).
    remove_duplicate = bool(remove_duplicate)
    dao = create.get_dao()
    images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled_images = [
        im for im in images
        if im.get_crawled_status() and not im.is_duplicate()
    ]
    print("Found {} images to recon".format(len(crawled_images)))
    failed_images = []
    fubar_images = []
    for image in crawled_images:
        real_path = image.get_file_path()
        if not os.path.isfile(real_path):
            print("{} is missing".format(real_path))
            failed_images.append(image)
            continue
        if filecmp.cmp(FAIL_FILE_PATH, real_path):
            # File is byte-identical to the known "download failed" payload.
            print("{} has failed".format(real_path))
            os.remove(real_path)
            failed_images.append(image)
            continue
        try:
            # Context manager closes the handle; a bare Image.open leaked it.
            with Image.open(real_path):
                pass
        except IOError:
            print("{} is fubar".format(real_path))
            fubar_images.append(image)
    for each in failed_images:
        if remove_deleted:
            each.mark_as_duplicate()
        else:
            each.mark_as_uncrawled()
    print("Found {} images in wrong state".format(len(failed_images)))
    if failed_images:
        dao.update_items(failed_images)
    for each in fubar_images:
        each.mark_as_duplicate()
    print("Found {} images in fubar".format(len(fubar_images)))
    if fubar_images:
        dao.update_items(fubar_images)
    dup_im_count = reconcile_duplicates(author_id, remove_duplicate)
    return {
        'total_image_count': len(crawled_images),
        'total_failed_count': len(failed_images),
        'total_fubar_count': len(fubar_images),
        'total_duplicate_found': dup_im_count,
    }
def reconcile_multiple_auth_name(limit=500):
    """Report authors whose images carry more than one display name."""
    if type(limit) is not int:
        limit = int(limit)
    dao = create.get_dao()
    id_rows = dao.search_table("threads", {},
                               group_by=["author_id"],
                               limit=limit)
    for row in id_rows:
        images = dao.get_items(ForumImage, {"author_id": row["author_id"]})
        name_counts = {}
        for image in images:
            name = image.get_author()
            name_counts[name] = name_counts.get(name, 0) + 1
        if len(name_counts) > 1:
            print("Differences in auth_names identified: {}".format(name_counts))
def remove_id_images(author_id):
    """Delete crawled image files for an author and mark records uncrawled.

    Empty parent directories are removed afterwards. A file already gone
    from disk no longer aborts the batch (previously os.remove raised and
    left earlier deletions unrecorded in the DB).
    """
    dao = create.get_dao()
    images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled_images = [im for im in images if im.get_crawled_status()]
    print("Found {} images to remove".format(len(crawled_images)))
    pardirs = set()
    for image in crawled_images:
        if image.is_duplicate():
            # Duplicates share a file with the kept copy; nothing to delete.
            continue
        real_path = image.get_file_path()
        pardirs.add(os.path.dirname(real_path))
        try:
            os.remove(real_path)
        except FileNotFoundError:
            # DB says crawled but the file is already gone; still flip state.
            print("{} already missing, marking uncrawled anyway".format(real_path))
        image.mark_as_uncrawled()
    dao.update_items(crawled_images)
    print("Removing {}".format(" ".join(pardirs)))
    for directory in pardirs:
        os.rmdir(directory)
def auth_find():
    """Resolve an author / author_id pair from query params.

    Accepts either parameter; the missing one is resolved via authrunner.
    Returns a jsonify'd author result or a failure reason.
    """
    author = request.args.get('author', '')
    author_id = request.args.get('author_id', '')
    if not author and not author_id:
        return jsonify(
            util.fail_with_reason("Did not pass in author or author_id"))
    # (removed an unused `dao = create.get_dao()` local)
    if not author:
        author_list = authrunner.get_author_from_id(author_id)
        if not author_list:
            return jsonify(util.fail_with_reason("Did not find author by id"))
        return jsonify(build_author_result(author_list[0], author_id))
    # There is author but no author_id
    author_id = authrunner.get_author_id(author)
    if author_id == "NO_SUCH_AUTHOR":
        return jsonify(util.fail_with_reason("Did not find author"))
    return jsonify(build_author_result(author, author_id))
def get_judgement():
    """Return the stored judgement for the author_id query parameter."""
    author_id = request.args.get("author_id", "")
    if not author_id:
        return jsonify(util.fail_with_reason("author_id not passed in"))
    dao = create.get_dao()
    judgement = dao.find_item(Judgement(author_id=author_id))
    if not judgement:
        return jsonify(util.fail_with_reason("No judgement found"))
    payload = {
        "l_judge": judgement.get_l_judge(),
        "x_judge": judgement.get_x_judge(),
        "times_judged": judgement.get_judge_count(),
    }
    payload.update(util.SUCCESS_RESULT)
    return jsonify(payload)
def authrandom():
    """Pick a random author from the top 1000 image posters, filtered.

    Query params:
        crawled: require at least one crawled image.
        tojudge: require an existing judgement above the thresholds.
        above_l / above_x: minimum judgement scores (only with tojudge).
    """
    # NOTE(review): bool("false") is True — any non-empty string enables
    # these flags; confirm callers pass "" to disable them.
    crawled = bool(request.args.get('crawled', ''))
    tojudge = bool(request.args.get('tojudge', ''))
    above_l = int(request.args.get('above_l', '0'))
    above_x = int(request.args.get('above_x', '0'))
    dao = create.get_dao()
    pool = dao.search_table("images", {},
                            group_by=["author_id", "author"],
                            limit=1000)
    while True:
        pick = random.choice(pool)
        # Each flag starts satisfied when its filter is disabled.
        c_flag = not crawled
        t_flag = not tojudge
        l_flag = not tojudge
        x_flag = not tojudge
        judgement = dao.find_item(Judgement(author_id=pick["author_id"]))
        if crawled:
            images = dao.get_items(ForumImage,
                                   {"author_id": pick["author_id"]})
            if any(im.get_crawled_status() for im in images):
                c_flag = True
        if tojudge:
            if not judgement:
                continue
            t_flag = True
            if judgement.get_l_judge() > above_l:
                l_flag = True
            if judgement.get_x_judge() > above_x:
                x_flag = True
        if c_flag and t_flag and l_flag and x_flag:
            break
    return jsonify(build_author_result(pick["author"], pick["author_id"]))
def load_id_images(author_id, convenient_name_for_logging=None, verbose=True):
    """ Load the pictures from author to file based on info stored.

    args:
        author_id: id of the author
        convenient_name_for_logging: display name for log lines; looked up
            from the author's threads when omitted.
        verbose: forwarded to the image runner.
    """
    dao = create.get_dao()
    if convenient_name_for_logging is None:
        threads = dao.get_items(ForumThread, {"author_id": author_id})
        if threads:
            convenient_name_for_logging = threads[0].get_author()
        else:
            # Fall back to the raw id when no thread carries a name.
            convenient_name_for_logging = author_id
    log_lines = ["CRAWLING FOR AUTHOR: {}".format(convenient_name_for_logging)]
    images = dao.get_items(ForumImage, {"author_id": author_id})
    log_lines.append("Got {} assets".format(len(images)))
    return imrun(dao, images, log_lines, verbose)
def get_most_updated(limit=100):
    """Print the top authors ordered by total update count (ascending)."""
    if type(limit) is not int:
        limit = int(limit)
    dao = create.get_dao()
    top_rows = dao.search_table("threads", {},
                                group_by=["author_id", "author"],
                                limit=limit)
    summaries = []
    for row in top_rows:
        stats = get_author_aggregate_stat(dao, row["author_id"], False)
        summaries.append({
            "update_count": stats['data']['update_count'],
            "name": row['author'],
            "id": row['author_id'],
        })
    summaries.sort(key=lambda entry: entry['update_count'])
    for summary in summaries:
        print(summary)
def reconcile_duplicates(author_id, remove=False):
    """Find (and optionally delete) byte-identical image files for an author.

    Returns the number of redundant copies discovered.
    """
    dao = create.get_dao()
    images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled_images = [im for im in images
                      if im.get_crawled_status() and not im.is_duplicate()]
    print("Found {} images to recon".format(len(crawled_images)))

    # Bucket images by content hash.
    by_hash = {}
    for image in crawled_images:
        digest = hashstr(image.get_file_path())
        by_hash.setdefault(digest, []).append(image)

    duplicate_image_count = 0
    for digest, bucket in by_hash.items():
        if len(bucket) <= 1:
            continue
        duplicate_image_count += len(bucket) - 1
        print("Found duplicate imgs: {}".format(digest))
        for image in bucket:
            print("{} belongs to {}".format(image.get_file_path(),
                                            image.get_href()))
        if remove:
            # Prefer keeping a copy whose href starts with "/p".
            sourced = [im for im in bucket
                       if im.get_href().startswith("/p")]
            kept = sourced.pop() if sourced else bucket[0]
            bucket.remove(kept)
            print("Keeping: {} from {}".format(kept.get_file_path(),
                                               kept.get_href()))
            for extra in bucket:
                os.remove(extra.get_file_path())
                extra.mark_as_duplicate()
            dao.update_items(bucket)
    return duplicate_image_count
def store_info(image, thread, floor_num, create_time):
    """Build a ForumImage record from an <img> tag and its thread context.

    args:
        image: parsed tag exposing .attrs with src/width/height and
            optionally size.
        thread: thread record providing href, author and author_id.
        floor_num: floor index of the post containing the image.
        create_time: post creation timestamp string.

    returns:
        an unsaved ForumImage instance (caller inserts it).
    """
    # (removed an unused `dao = create.get_dao()` local)
    attrs = image.attrs
    return ForumImage(src=attrs["src"],
                      href=thread.get_href(),
                      image_size=attrs.get("size", -1),  # -1 = size unknown
                      image_width=attrs["width"],
                      image_height=attrs["height"],
                      author=thread.get_author(),
                      author_id=thread.get_author_id(),
                      floor_num=floor_num,
                      create_time=create_time)
def run_post(post, thread):
    """Extract and store images from one post if the thread author wrote it.

    returns:
        dict with "authfl" (bool: post is by the author) and "images"
        (int: number of images stored).
    """
    post_result = {
        "authfl": False,
        "images": 0,
    }
    auth_flag = post.select(AUTHOR_FLAG)
    if not auth_flag:
        return post_result
    post_result["authfl"] = True
    data_field = json.loads(post.attrs["data-field"])
    floor_num = data_field["content"]["post_index"]
    create_time = post.select(CREATE_TIME_SELECTOR)[-1].text
    content_matches = post.select(CONTENT_SELECTOR)
    if not content_matches:
        # Previously the code indexed [0] first (raising IndexError before
        # this check could run) and then continued past the warning; now a
        # bad page load bails out gracefully with zero images.
        print("NO CONTENT FOUND, LIKELY BAD PAGE LOAD")
        return post_result
    content = content_matches[0]
    images = content.select(IMAGE_SELECTOR)
    post_result["images"] = len(images)
    imagelist = [store_info(image, thread, floor_num, create_time)
                 for image in images]
    if imagelist:
        dao = create.get_dao()
        dao.insert_items(imagelist)
    return post_result
def sample_id(author_id):
    """Open one random image by this author id in a visible browser."""
    dao = create.get_dao()
    candidates = dao.get_items(ForumImage, {"author_id": author_id})
    chosen = random.choice(candidates)
    viewer = Selenium.get_instance(False)
    viewer.crawl(chosen.get_src(), manual_wait=5)