def add_to_database(subforum, link, post):
    # post is a defaultdict that defaults to ""
    # home, con, and cur are module-level globals set up elsewhere in this module
    post['home'] = home
    post['subname'] = subforum
    post['thread'] = link
    if not post['plink']:
        post['plink'] = link  # no permalink scraped: fall back to the thread link
    dblib.insert_data(con, cur, post)
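# Example usage (a sketch, never called from this module): build a post record
# under the defaultdict contract noted above and hand it off. The field values
# are made up for illustration; `home`, `con`, and `cur` must already be bound
# at module level, as add_to_database assumes.
def _example_add_to_database():
    from collections import defaultdict
    post = defaultdict(str)
    post['message'] = "example post body"
    post['plink'] = ""  # empty, so add_to_database falls back to the thread link
    add_to_database("Example Subforum", "/showthread.php?t=1", post)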
def scrape_thread(browser, home, con, cur):
    """This function handles the main loop and parses the data obtained.
    INPUTS: Selenium Browser object, string (home url),
    MySQLdb Connection object, MySQLdb Cursor object.
    RETURNS: None"""
    forum_id = dblib.get_forum_id(con, cur, home)
    logger.info("got forum id %d", forum_id)
    restart.restore_state(forum_id)
    try:
        browser.get(home)
    except TimeoutException:
        logger.info("Timeout: %s", home)
        sys.stderr.write("TIMEOUT")
        keypress(browser)
    if type_flag:
        scraper = mybb
        logger.info("using mybb backend")
    else:
        scraper = vbulletin
        logger.info("using vbulletin backend")
    main_src = browser.page_source
    main_soup = bs(main_src)
    subforums = scraper.get_subforums(main_soup)
    logger.debug("got subforums: %s", str(subforums))
    subs = 0
    restart.get_cookies(forum_id, browser)
    sys.stderr.write("REFRESH")
    restart.dump_cookies(forum_id, browser)
    ##SUBFORUMS##
    for sub in subforums[::-1]:
        print "subforums %f%% DONE" % (float(subs) / len(subforums))
        print "subforum %d of %d DONE" % (subs, len(subforums))
        logger.info("subforums %f%% DONE", float(subs) / len(subforums))
        logger.info("subforum %d of %d DONE", subs, len(subforums))
        subs += 1
        subforum_id = dblib.get_sub_id(con, cur, sub['name'], forum_id)
        logger.debug("scraping subforum %s, id #%d", sub['name'], subforum_id)
        t_done = 0
        sub_page = 0
        sub_page_count = 1
        while sub_page < sub_page_count:  # iterate through subforum pages
            sys.stderr.write("REFRESH")
            sub_page += 1
            sub_link = scraper.get_page(home + sub['link'], sub_page)
            print "sub link %s DONE" % sub_link
            logger.info("sub link %s DONE", sub_link)
            try:
                browser.get(sub_link)
            except TimeoutException:
                logger.info("Timeout: %s", sub_link)
                sys.stderr.write("TIMEOUT")
                keypress(browser)
            sub_src = browser.page_source
            sub_soup = bs(sub_src)
            threads, (sub_page, sub_page_count) = scraper.get_threads(sub_soup)
            print "got threads"
            logger.debug(
                "got threads\nsubforum page: %d\nsubforum page count: %d\nThreads: %s",
                sub_page, sub_page_count, str(threads))
            ##THREADS##
            for thread in threads:  # iterate through threads on page
                print "threads %f%% DONE" % (float(t_done) / (len(threads) * sub_page_count))
                print "thread %d of %d DONE" % (t_done, len(threads) * sub_page_count)
                logger.info("threads %f%% DONE", float(t_done) / (len(threads) * sub_page_count))
                logger.info("thread %d of %d DONE", t_done, len(threads) * sub_page_count)
                t_done += 1
                sys.stderr.write("REFRESH")
                thread_id = dblib.get_thread_id(con, cur, thread['name'], subforum_id)
                logger.debug("scraping thread %s, id %d", thread['name'], thread_id)
                tc = dblib.get_thread_count(thread['name'], cur)
                logger.debug("posts in thread: %d\ndownloaded posts from thread: %d", tc, thread['count'])
                if (thread['count'] == tc) and (tc != 0):
                    continue  # if we have all of the posts, skip this thread
                if thread_id in restart.threads:
                    logger.debug("in thread_keys: starting thread %d scrape at %d",
                                 thread_id, restart.threads[thread_id])
                    # restart.threads[thread_id][1] += 1
                    # else: restart.threads[thread_id] = (0, 1)
                else:
                    restart.threads[thread_id] = 1
                    logger.debug("not in thread keys: starting thread %d scrape at page 1", thread_id)
                thread_page = restart.threads[thread_id] - 1
                thread_page_count = thread_page + 1
                print "thread %d: page %d of %d" % (thread_id, thread_page, thread_page_count)
                print "%f%% done" % (float(thread_page) / thread_page_count)
                while thread_page < thread_page_count:  # iterate through thread pages
                    sys.stderr.write("REFRESH")
                    thread_page += 1
                    thread_link = scraper.get_page(home + thread['link'], thread_page)
                    logger.info("thread %d: page %d of %d\nLink: %s",
                                thread_id, thread_page, thread_page_count, thread_link)
                    try:
                        browser.get(thread_link)
                    except TimeoutException:
                        logger.info("Timeout: %s", thread_link)
                        sys.stderr.write("TIMEOUT")
                        keypress(browser)
                    page_src = browser.page_source
                    posts, (thread_page, thread_page_count) = scraper.get_posts(page_src)
                    print "got posts"
                    for post in posts:
                        print "iterate post"
                        user = post['user']
                        P = dblib.post(home, sub['name'], sub['link'], sub_page,
                                       thread['name'], post['date'], post['link'],
                                       post['message'], user['name'], user['title'],
                                       user['join'], user['link'], user['sig'],
                                       post['edit'], post['message images'])
                        (post_id, user_id) = dblib.insert_data(con, cur, P)
                        print post_id
                        print user['image']
                        print type(user['image'])
                        if user['image']:
                            if user['image'].find('http') == -1:
                                # relative avatar URL: prepend the forum's home URL
                                user['image'] = P.home + user['image']
                            imaget.get_user_image(user_id, user['image'])
                        imaget.get_post_images(P, post['message images'], cur)
                    # restart.threads[thread_id][0] = thread_page
                    restart.threads[thread_id] = thread_page
                    restart.save_state(forum_id)
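# For reference, a minimal sketch of the interface this loop assumes from the
# `restart` module. Only the call signatures are taken from this file; the
# actual implementation may differ. The idea: `threads` maps thread_id -> last
# page reached, persisted per forum so a crashed run can resume where it left off.
#
#   import pickle
#   threads = {}
#
#   def save_state(forum_id):
#       with open("state_%d.pkl" % forum_id, "wb") as f:
#           pickle.dump(threads, f)
#
#   def restore_state(forum_id):
#       global threads
#       try:
#           with open("state_%d.pkl" % forum_id, "rb") as f:
#               threads = pickle.load(f)
#       except IOError:
#           threads = {}  # no saved state yet: start fresh
#
#   def dump_cookies(forum_id, browser):
#       with open("cookies_%d.pkl" % forum_id, "wb") as f:
#           pickle.dump(browser.get_cookies(), f)
#
#   def get_cookies(forum_id, browser):
#       try:
#           with open("cookies_%d.pkl" % forum_id, "rb") as f:
#               for c in pickle.load(f):
#                   browser.add_cookie(c)
#       except IOError:
#           pass  # no saved cookies for this forum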
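# Example driver (a sketch; this module's real entry point is not shown here).
# It assumes the module-level globals scrape_thread relies on (`logger`,
# `type_flag`, `con`, `cur`, etc.) are configured at import time, and shows
# only the browser/database plumbing. Connection parameters are placeholders.
#
#   import MySQLdb
#   from selenium import webdriver
#
#   if __name__ == "__main__":
#       browser = webdriver.Firefox()
#       browser.set_page_load_timeout(30)
#       con = MySQLdb.connect(host="localhost", user="scraper",
#                             passwd="...", db="forums")
#       cur = con.cursor()
#       try:
#           scrape_thread(browser, "http://forum.example.com/", con, cur)
#       finally:
#           browser.quit()
#           con.close()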