Exemplo n.º 1
0
def _finalize_thread(thread, image_count):
    """Persist crawl completion for *thread*; image_count=-1 marks a failure."""
    dao = create.get_dao()
    dao.update_item(thread.mark_as_crawled(image_count))


def run_thread(thread):
    """
    Crawl one thread page and tally author-written floors and images found.

    args:
        thread: thread record exposing get_crawled_status/get_href/
                get_image_count/mark_as_crawled

    returns:
        dict with "author_floor_count" and "image_count" totals; already
        crawled threads short-circuit with their stored image count.
    """
    general_result = {
        "author_floor_count": 0,
        "image_count": 0,
    }
    if thread.get_crawled_status():
        print("Thread already crawled, passing")
        general_result["image_count"] = thread.get_image_count()
        return general_result
    thread_url = MAIN_URL + thread.get_href()
    ff = Selenium.get_instance()

    # Fetch the page, retrying up to 5 times before marking the thread failed.
    soup = None
    for attempt in range(1, 6):
        try:
            soup = ff.crawl(thread_url, 20, AUTHOR_FLAG)
            break
        except KeyboardInterrupt:
            raise
        # Narrowed from a bare `except:` so SystemExit/GeneratorExit are not
        # swallowed; any crawl error still triggers a best-effort retry.
        except Exception:
            if attempt < 5:
                print("RETRYING for {} time".format(attempt))
                continue
            print("Can't access thread: {}".format(thread.get_href()))
            _finalize_thread(thread, -1)
            return general_result

    # Parse the page; a malformed page marks the thread failed as well.
    try:
        # Sanity check: raises IndexError when the main block is absent
        # (previously assigned to an unused `posts` variable).
        soup.select(MAIN_BLOCK_SELECTOR)[0]
        postlist = soup.select(FLOOR_SELECTOR)
        for post in postlist:
            post_result = run_post(post, thread)
            # "authfl" is a bool; int() folds it into the floor count.
            general_result["author_floor_count"] += int(post_result["authfl"])
            general_result["image_count"] += post_result["images"]
    except KeyboardInterrupt:
        raise
    except Exception:
        print("Cannot parse soup: {}".format(thread.get_href()))
        _finalize_thread(thread, -1)
        return general_result
    _finalize_thread(thread, general_result["image_count"])
    return general_result
Exemplo n.º 2
0
def judge_author_id():
    """
    Store or update an author judgement from the JSON request body.

    Requires author, author_id, update_count, thread_count, image_count,
    l_judge, x_judge and avg_replies; judgements must be ints within [0, 10].
    """
    request_data = request.get_json()
    dao = create.get_dao()
    mandatory_fields = ["author", "author_id", "update_count", "thread_count", "image_count", "l_judge", "x_judge", "avg_replies"]
    missing = [f for f in mandatory_fields if f not in request_data]
    if missing:
        # The old message claimed only author/author_id were required even
        # though eight fields are mandatory; name the absent fields instead.
        return jsonify(util.fail_with_reason(
            "Missing mandatory fields: {}".format(", ".join(missing))))
    author = request_data["author"]
    author_id = request_data["author_id"]
    thread_count = request_data["thread_count"]
    image_count = request_data["image_count"]
    update_count = request_data["update_count"]
    avg_replies = request_data["avg_replies"]
    l_judge = request_data["l_judge"]
    x_judge = request_data["x_judge"]
    existing_judgement = dao.find_item(Judgement(author_id=author_id))

    # Sanity check: strict int type (bool is deliberately rejected, matching
    # the original `type(...) == int` comparison).
    if not (type(l_judge) is int and type(x_judge) is int):
        return jsonify(util.fail_with_reason("Judgements need to be delivered in integers"))
    if l_judge > 10 or x_judge > 10:
        return jsonify(util.fail_with_reason("Judgements should not exceed 10"))
    if l_judge < 0 or x_judge < 0:
        return jsonify(util.fail_with_reason("Judgement should not be less than 0"))

    if existing_judgement:
        # Fold the new vote into the stored judgement.
        modified_judgement = existing_judgement.update(thread_count, image_count, avg_replies, update_count, l_judge, x_judge)
        dao.update_item(modified_judgement)
    else:
        new_judgement = Judgement(author_id=author_id, author=author, l_judge=l_judge, x_judge=x_judge, thread_count=thread_count, image_count=image_count, avg_replies=avg_replies, update_count=update_count)
        dao.insert_item(new_judgement)
    return jsonify(util.SUCCESS_RESULT)
Exemplo n.º 3
0
def sample(author):
    """Open a random stored image from *author* in a visible browser."""
    dao = create.get_dao()
    candidates = dao.get_items(ForumImage, {"author": author})
    # Renamed local: the original shadowed the function name with `sample`.
    picked = random.choice(candidates)
    browser = Selenium.get_instance(False)
    browser.crawl(picked.get_src(), manual_wait=10)
    print("AUTHOR_ID: {}".format(picked.get_author_id()))
Exemplo n.º 4
0
def get_random_images(limit=500, amount=5, threshold=10):
    """
    Deposit info of images found in threads of authors given that they passed the scoring threshold.

    args:
        limit: int - only look in the top $limit authors
        amount: int - only take ingestion from $amount authors
        threshold: float - only clear an author for ingestion after they score higher than $threshold
    """
    # Coerce loosely-typed (e.g. CLI string) arguments.
    if type(limit) is not int:
        limit = int(limit)
    if type(threshold) is not float:
        threshold = float(threshold)
    if type(amount) is not int:
        amount = int(amount)
    dao = create.get_dao()
    candidates = dao.search_table("threads", {},
                                  group_by=["author_id", "author"],
                                  limit=limit)
    random.shuffle(candidates)  # randomise which authors get picked
    ingested = 0
    for candidate in candidates:
        stats = get_author_aggregate_stat(dao, candidate["author_id"], False)
        score = score_agg_stats(stats["data"])
        if score["total"] > threshold:
            print("RUNNING INGEST FOR AUTHOR: {}".format(candidate["author"]))
            print(stats["data"])
            summary = run_threads(stats["threads"])
            if summary["author_floor_count"] > 0:
                print("NEW RESULTS FOR {}: {}".format(candidate["author"], summary))
                ingested += 1
        if ingested >= amount:
            break
Exemplo n.º 5
0
def score_id(author_id, vis=True):
    """
    Determine the score of an author based on his/her thread popularity and picture count.

    args:
        author_id: str - id of the author
        vis: bool - print the result to stdout

    returns:
        float score; float("-inf") when the author has no threads at all
        (previously this path raised ZeroDivisionError on the average).
    """
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    images = dao.get_items(ForumImage, {"author_id": author_id})
    if not threads:
        # No threads -> no popularity data; rank such authors last instead
        # of crashing (callers only compare score > threshold).
        if vis:
            print("NO THREADS FOUND FOR AUTHOR_ID: {}".format(author_id))
        return float("-inf")
    update_counts = sum(t.get_update_count() for t in threads)
    replies = sum(t.get_replies() for t in threads)
    avg_popularity = replies / len(threads)
    score = math.log(len(images) + 1) + (math.log(avg_popularity + 1) - 2) * 2
    # Reward frequently-updated authors once they cross 10 updates.
    score += 0 if (update_counts < 10) else math.log(update_counts + 1) - 2
    # Penalties for thin or unpopular evidence.
    if len(threads) < 3 and avg_popularity < 150:
        score -= 5
    if len(images) < 20:
        score -= 5
    if avg_popularity < 15:
        score -= 5
    if vis:
        print("AVG_POPULARITY: {}, ASSET_COUNT: {}".format(avg_popularity, len(images)))
        print("SCORE: {}".format(score))
    return score
Exemplo n.º 6
0
def load_random_images(limit=1, threshold=10):
    """
    Load images from random authors whose score passes the threshold.

    args:
        limit: int - number of authors to load from
        threshold: float - score threshold
    """
    if type(limit) is not int:
        limit = int(limit)
    if type(threshold) is not float:
        threshold = float(threshold)
    dao = create.get_dao()
    candidates = dao.search_table("images", {}, group_by=["author", "author_id"])
    random.shuffle(candidates)
    loaded_authors = 0
    for candidate in candidates:
        score = score_id(candidate["author_id"], vis=False)
        if score > threshold:
            # Blacklisted authors are skipped without counting toward limit.
            if candidate["author"] in BLACKLIST:
                print("AUTHOR {} IS IN BLACKLIST, SKIPPING".format(candidate["author"]))
                continue
            if load_id_images(candidate["author_id"], candidate["author"], verbose=False) > 0:
                loaded_authors += 1
        if loaded_authors >= limit:
            break
Exemplo n.º 7
0
def get_author_from_id(author_id):
    """Return the distinct author names seen on threads with this author_id."""
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    return list({t.get_author() for t in threads})
Exemplo n.º 8
0
def get_threads(limit=500, top=False):
    """
    Get a list of thread and its info from the forum.

    args:
        limit: int - the amount of newest thread to ingest based on forum ranking
        top: bool (or "true"/"false" string) - ingest top-ranked threads instead
    """
    if type(limit) is not int:
        limit = int(limit)
    # CLI callers pass `top` as a string; anything but "true" means False.
    if type(top) is not bool:
        top = top == "true"
        if top:
            print("getting top threads instead")
    dao = create.get_dao()
    create.create_tables()
    for batch in runthreads(limit, top):
        # Tally the per-thread outcomes for this batch.
        record = {
            "NEW_ITEM": 0,
            "NOT_UPDATED": 0,
            "UPDATED": 0,
            "CANNOT_UPDATE": 0,
            "SHIT": 0,
        }
        for thread in batch:
            outcome = thread_helper(dao, thread)
            record[outcome] += 1
        print("Batch record: {}".format(record))
    print("Total of {} threads stored".format(dao.get_row_count("threads")))
Exemplo n.º 9
0
def handle_request(data):
    """
    Pick a random author whose judgements sit near the requested l/x values,
    widening the search window until enough candidates are found.
    """
    dao = create.get_dao()
    l_arg = int(data['l_arg'])
    x_arg = int(data['x_arg'])
    spread = 0
    matches = []
    while True:
        # Widen asymmetrically: the window grows by one bound per step.
        terms = SearchDict()
        terms.add_between('l_judge', l_arg - (spread + 1) // 4, l_arg + (spread + 2) // 4)
        terms.add_between('x_judge', x_arg - spread // 4, x_arg + (spread + 3) // 4)
        matches = dao.search_table(TABLE_NAME, terms)
        print("Found {} matching".format(len(matches)))
        count = len(matches)
        # Stop on a large pool, or probabilistically on smaller pools so
        # narrow matches still have a chance of being served as-is.
        if count > 40 or random.random() < count / 40:
            break
        spread += 1
    chosen = random.choice(matches)
    result = {
        'author': chosen['author'],
        'author_id': chosen['author_id'],
        'l_judge': chosen['l_judge'],
        'x_judge': chosen['x_judge'],
    }
    result.update(util.SUCCESS_RESULT)
    return jsonify(result)
Exemplo n.º 10
0
def induct_history(author_id):
    """
    Interactively adopt orphaned image files found in an author's directory.

    Scans the directory holding the author's crawled images for .jpg files
    whose names do not match any known image uuid, then — after a y/n
    confirmation on stdin — hands them to add_historic_thread for ingestion.
    """
    dao = create.get_dao()
    # Find all inductee
    images = dao.get_items(ForumImage, {"author_id": author_id})
    # Move forward if all author agree:
    author_set = set()
    for image in images:
        author_set.add(image.get_author())
    if len(author_set) > 1:
        print("Conflicting author information, exiting")
        return
    # NOTE(review): pop() raises KeyError when no images exist for this id
    # (empty set passes the >1 check) — confirm callers guard against it.
    author = author_set.pop()
    # Only crawled, non-duplicate records have files on disk we can trust.
    crawled = [
        im for im in images
        if im.get_crawled_status() and not im.is_duplicate()
    ]
    uuid_set = {im.get_uuid() for im in crawled}
    if not crawled:
        print("No crawled images belong to this author")
        return
    # Presumably all of an author's files live in one directory; the first
    # crawled image anchors the scan — TODO confirm.
    ops_dir = os.path.dirname(crawled[0].get_file_path())
    img_files = [f for f in os.listdir(ops_dir) if f.endswith("jpg")]
    # "Historic" files are on disk but unknown to the database (filename
    # stem does not match any stored uuid).
    historics = list(
        filter(lambda f: f.split(".jpg")[0] not in uuid_set, img_files))
    print("Found {} historic files to induct".format(len(historics)))
    if not historics:
        return
    for f in historics:
        fname = os.path.join(ops_dir, f)
        print("FILEPATH: {}".format(fname))
    # Require explicit confirmation before mutating the database.
    if not str(input("Convert? y/n")) == "y":
        print("Not converting history today")
        return
    flist = [os.path.join(ops_dir, f) for f in historics]
    add_historic_thread(dao, author, author_id, flist)
Exemplo n.º 11
0
def get_auth_information(author_id):
    """Assemble the author-info payload: stats block plus a random image block."""
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author_id": author_id})
    images = dao.get_items(ForumImage, {"author_id": author_id})
    util.populate_auth_image(images)
    payload = {"auth_info": auth_info_block(threads, images)}
    payload.update(random_image_block(images))
    return payload
Exemplo n.º 12
0
def get_author_id(author, vis=True):
    """Look up the author_id for a given author name; "NO_SUCH_AUTHOR" if absent."""
    dao = create.get_dao()
    threads = dao.get_items(ForumThread, {"author": author})
    if not threads:
        return "NO_SUCH_AUTHOR"
    author_id = threads[0].get_author_id()
    if vis:
        print("AUTHOR_ID: {}".format(author_id))
    return author_id
Exemplo n.º 13
0
def get_id_images(author_id, vis=True):
    """
    Run ingestion over an author's threads and print the aggregate result.

    args:
        author_id: str - id of author
        vis: bool - print the author's aggregate score first
    """
    dao = create.get_dao()
    stats = get_author_aggregate_stat(dao, author_id, True)
    score = score_agg_stats(stats["data"])
    if vis:
        print(score)
    summary = run_threads(stats["threads"])
    print(summary)
Exemplo n.º 14
0
def recon_id(author_id, remove_duplicate=False, remove_deleted=False):
    """
    Reconcile the on-disk files of an author's crawled images with the DB.

    Images whose file is missing or matches the known failure placeholder
    are reset (or flagged duplicate when remove_deleted=True); unreadable
    files are flagged duplicate. Finally duplicate detection runs via
    reconcile_duplicates.

    args:
        author_id: str - id of the author
        remove_duplicate: bool - delete surplus duplicate files on disk
        remove_deleted: bool - flag missing/failed images as duplicate
            instead of marking them uncrawled for a re-fetch

    returns:
        dict summary of image/failed/fubar/duplicate counts
    """
    # Normalise any truthy value to a proper bool
    # (was the no-op-ish `if remove_duplicate: remove_duplicate = True`).
    remove_duplicate = bool(remove_duplicate)
    dao = create.get_dao()
    images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled_images = [
        im for im in images
        if im.get_crawled_status() and not im.is_duplicate()
    ]
    print("Found {} images to recon".format(len(crawled_images)))
    failed_images = []
    fubar_images = []
    for image in crawled_images:
        real_path = image.get_file_path()
        if not os.path.isfile(real_path):
            print("{} is missing".format(real_path))
            failed_images.append(image)
            continue
        # A file identical to the failure placeholder is a failed download.
        if filecmp.cmp(FAIL_FILE_PATH, real_path):
            print("{} has failed".format(real_path))
            os.remove(real_path)
            failed_images.append(image)
            continue
        try:
            # Context manager closes the handle immediately; the previous
            # bare Image.open leaked the file descriptor until GC.
            with Image.open(real_path):
                pass
        except IOError:
            print("{} is fubar".format(real_path))
            fubar_images.append(image)
    for each in failed_images:
        if remove_deleted:
            each.mark_as_duplicate()
        else:
            each.mark_as_uncrawled()
    print("Found {} images in wrong state".format(len(failed_images)))
    if failed_images:
        dao.update_items(failed_images)
    for each in fubar_images:
        each.mark_as_duplicate()
    print("Found {} images in fubar".format(len(fubar_images)))
    if fubar_images:
        dao.update_items(fubar_images)
    dup_im_count = reconcile_duplicates(author_id, remove_duplicate)
    return {
        'total_image_count': len(crawled_images),
        'total_failed_count': len(failed_images),
        'total_fubar_count': len(fubar_images),
        'total_duplicate_found': dup_im_count,
    }
Exemplo n.º 15
0
def reconcile_multiple_auth_name(limit=500):
    """Report author_ids whose stored images carry more than one author name."""
    if type(limit) is not int:
        limit = int(limit)
    dao = create.get_dao()
    auth_ids = dao.search_table("threads", {},
                                group_by=["author_id"],
                                limit=limit)
    for auth_id in auth_ids:
        images = dao.get_items(ForumImage, {"author_id": auth_id["author_id"]})
        # Count occurrences of each author name among this id's images.
        name_counts = {}
        for image in images:
            name = image.get_author()
            name_counts[name] = name_counts.get(name, 0) + 1
        if len(name_counts) > 1:
            print("Differences in auth_names identified: {}".format(name_counts))
Exemplo n.º 16
0
def remove_id_images(author_id):
    """
    Delete an author's crawled image files from disk and mark them uncrawled.

    Duplicate-flagged images are left on disk untouched; the (now empty)
    parent directories are removed afterwards.
    """
    dao = create.get_dao()
    images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled_images = [im for im in images if im.get_crawled_status()]
    print("Found {} images to remove".format(len(crawled_images)))
    parent_dirs = set()
    for image in crawled_images:
        if image.is_duplicate():
            continue
        path = image.get_file_path()
        parent_dirs.add(os.path.dirname(path))
        os.remove(path)
        image.mark_as_uncrawled()
    dao.update_items(crawled_images)
    print("Removing {}".format(" ".join(parent_dirs)))
    for directory in parent_dirs:
        os.rmdir(directory)
Exemplo n.º 17
0
def auth_find():
    """
    Resolve an author/author_id pair from query parameters.

    Accepts either `author` or `author_id`; fills in the missing half via
    authrunner look-ups and returns the combined author result, or a
    failure payload when neither is given or the lookup misses.
    """
    author = request.args.get('author', '')
    author_id = request.args.get('author_id', '')
    if not author and not author_id:
        return jsonify(
            util.fail_with_reason("Did not pass in author or author_id"))
    # (Removed an unused `dao = create.get_dao()` local — nothing here
    # touches the DAO directly.)
    if not author:
        # Only the id was supplied: recover a display name for it.
        author_list = authrunner.get_author_from_id(author_id)
        if not author_list:
            return jsonify(util.fail_with_reason("Did not find author by id"))
        return jsonify(build_author_result(author_list[0], author_id))
    # There is author but no author_id
    author_id = authrunner.get_author_id(author)
    if author_id == "NO_SUCH_AUTHOR":
        return jsonify(util.fail_with_reason("Did not find author"))
    return jsonify(build_author_result(author, author_id))
Exemplo n.º 18
0
def get_judgement():
    """Fetch the stored judgement for the author_id passed as a query arg."""
    author_id = request.args.get("author_id", "")
    if not author_id:
        return jsonify(util.fail_with_reason("author_id not passed in"))
    dao = create.get_dao()
    judgement = dao.find_item(Judgement(author_id=author_id))
    if not judgement:
        return jsonify(util.fail_with_reason("No judgement found"))
    result = {
        "l_judge": judgement.get_l_judge(),
        "x_judge": judgement.get_x_judge(),
        "times_judged": judgement.get_judge_count(),
    }
    result.update(util.SUCCESS_RESULT)
    return jsonify(result)
Exemplo n.º 19
0
def authrandom():
    """
    Pick a random author from the top-1000 image authors, optionally
    constrained to crawled authors and/or judged authors above given scores.

    Query args:
        crawled: require at least one crawled image for the author
        tojudge: require an existing judgement record
        above_l / above_x: minimum l/x judgement scores (only with tojudge)
    """
    crawled = request.args.get('crawled', '')
    # NOTE(review): bool("") is False but bool("false") is True — any
    # non-empty string enables the filter; confirm callers expect this
    # (get_threads elsewhere compares against the literal "true").
    crawled = bool(crawled)
    tojudge = request.args.get('tojudge', '')
    tojudge = bool(tojudge)
    above_l = request.args.get('above_l', '0')
    above_l = int(above_l)
    above_x = request.args.get('above_x', '0')
    above_x = int(above_x)
    dao = create.get_dao()
    top = dao.search_table("images", {},
                           group_by=["author_id", "author"],
                           limit=1000)

    auth_select = {}
    # Rejection-sample authors until one satisfies every requested filter.
    # NOTE(review): loops forever if no author in `top` can satisfy the
    # filters — consider a retry cap.
    while True:
        auth_select = random.choice(top)
        # Each flag defaults to True when its filter is disabled, so only
        # enabled filters must be proven below.
        c_flag = False if crawled else True
        t_flag = False if tojudge else True
        l_flag = False if tojudge else True
        x_flag = False if tojudge else True
        judgement = dao.find_item(
            Judgement(author_id=auth_select["author_id"]))
        if crawled:
            images = dao.get_items(ForumImage,
                                   {"author_id": auth_select["author_id"]})
            if any(im.get_crawled_status() for im in images):
                c_flag = True
        if tojudge:
            if judgement:
                t_flag = True
            else:
                # No judgement at all: resample immediately.
                continue
            if judgement.get_l_judge() > above_l:
                l_flag = True
            if judgement.get_x_judge() > above_x:
                x_flag = True
        if all([c_flag, t_flag, l_flag, x_flag]):
            break

    result = build_author_result(auth_select["author"],
                                 auth_select["author_id"])
    return jsonify(result)
Exemplo n.º 20
0
def load_id_images(author_id, convenient_name_for_logging=None, verbose=True):
    """
    Load the pictures from an author to file based on stored info.

    args:
        author_id: id of the author
        convenient_name_for_logging: display name; looked up from the
            author's threads (falling back to the id) when omitted
        verbose: passed through to the image runner

    returns:
        the result of imrun over the author's images
    """
    log_lines = []
    dao = create.get_dao()
    if convenient_name_for_logging is None:
        threads = dao.get_items(ForumThread, {"author_id": author_id})
        convenient_name_for_logging = (
            threads[0].get_author() if threads else author_id)
    log_lines.append("CRAWLING FOR AUTHOR: {}".format(convenient_name_for_logging))
    images = dao.get_items(ForumImage, {"author_id": author_id})
    log_lines.append("Got {} assets".format(len(images)))
    return imrun(dao, images, log_lines, verbose)
Exemplo n.º 21
0
def get_most_updated(limit=100):
    """Print authors with their aggregate update counts, least-updated first."""
    if type(limit) is not int:
        limit = int(limit)
    dao = create.get_dao()
    top_authors = dao.search_table("threads", {},
                                   group_by=["author_id", "author"],
                                   limit=limit)
    updated = []
    for author in top_authors:
        stats = get_author_aggregate_stat(dao, author["author_id"], False)
        updated.append({
            "update_count": stats['data']['update_count'],
            "name": author['author'],
            "id": author['author_id'],
        })
    updated.sort(key=lambda entry: entry['update_count'])
    for entry in updated:
        print(entry)
Exemplo n.º 22
0
def reconcile_duplicates(author_id, remove=False):
    """
    Group an author's crawled images by file hash and report duplicates.

    When remove=True, keep one copy per hash (preferring images whose href
    starts with "/p"), delete the rest from disk and mark them duplicate.

    returns:
        int - number of surplus (duplicate) images found
    """
    dao = create.get_dao()
    all_images = dao.get_items(ForumImage, {"author_id": author_id})
    crawled = [
        im for im in all_images
        if im.get_crawled_status() and not im.is_duplicate()
    ]
    print("Found {} images to recon".format(len(crawled)))
    by_hash = {}
    for image in crawled:
        digest = hashstr(image.get_file_path())
        by_hash.setdefault(digest, []).append(image)
    duplicate_image_count = 0
    for digest, group in by_hash.items():
        if len(group) <= 1:
            continue
        duplicate_image_count += len(group) - 1
        print("Found duplicate imgs: {}".format(digest))
        for image in group:
            print("{} belongs to {}".format(image.get_file_path(),
                                            image.get_href()))
        if remove:
            # Prefer keeping a copy whose href points at the source ("/p").
            sourced = [im for im in group if im.get_href().startswith("/p")]
            kept = sourced.pop() if sourced else group[0]
            group.remove(kept)
            print("Keeping: {} from {}".format(kept.get_file_path(),
                                               kept.get_href()))
            for dup in group:
                os.remove(dup.get_file_path())
                dup.mark_as_duplicate()
            dao.update_items(group)
    return duplicate_image_count
Exemplo n.º 23
0
def store_info(image, thread, floor_num, create_time):
    """
    Build (but do not persist) a ForumImage record from an <img> tag.

    args:
        image: tag carrying src/width/height (and optionally size) attrs
        thread: the owning thread record
        floor_num: post index within the thread
        create_time: post creation timestamp text

    returns:
        ForumImage ready for insertion
    """
    # (Removed an unused `dao = create.get_dao()` local — this function
    # only constructs the record; callers persist it.)
    # "size" is not always present in the tag; -1 marks it as unknown.
    size = image.attrs.get("size", -1)
    imageria = ForumImage(src=image.attrs["src"],
                          href=thread.get_href(),
                          image_size=size,
                          image_width=image.attrs["width"],
                          image_height=image.attrs["height"],
                          author=thread.get_author(),
                          author_id=thread.get_author_id(),
                          floor_num=floor_num,
                          create_time=create_time)
    return imageria
Exemplo n.º 24
0
def run_post(post, thread):
    """
    Extract image records from one post when it belongs to the thread author.

    returns:
        dict with "authfl" (author-floor flag) and "images" (image count)
    """
    result = {
        "authfl": False,
        "images": 0,
    }
    # Only posts flagged as written by the thread author are processed.
    if not post.select(AUTHOR_FLAG):
        return result
    result["authfl"] = True
    data_field = json.loads(post.attrs["data-field"])
    floor_num = data_field["content"]["post_index"]
    create_time = post.select(CREATE_TIME_SELECTOR)[-1].text
    content = post.select(CONTENT_SELECTOR)[0]
    if not content:
        print("NO CONTENT FOUND, LIKELY BAD PAGE LOAD")
    images = content.select(IMAGE_SELECTOR)
    result["images"] = len(images)
    records = [store_info(image, thread, floor_num, create_time)
               for image in images]
    if records:
        dao = create.get_dao()
        dao.insert_items(records)
    return result
Exemplo n.º 25
0
def sample_id(author_id):
    """Open a random stored image from this author_id in a visible browser."""
    dao = create.get_dao()
    candidates = dao.get_items(ForumImage, {"author_id": author_id})
    chosen = random.choice(candidates)
    browser = Selenium.get_instance(False)
    browser.crawl(chosen.get_src(), manual_wait=5)