def main():
    """Run the multi-threaded Douban Top250 crawl and persist the results.

    Spawns THREAD_NUM daemon workers that consume page indices from the
    shared queue, waits for the queue to drain, then writes the collected
    movies (ordered by rank) to JSON and to the database.
    """
    print("""
    ###############################
    Douban Top250 Movies (Multi-Threads Version)
    Author: Ke Yi
    ###############################
    """)
    print("Douban Movie Crawler Begins...")
    # Start the worker pool; daemon threads die with the main thread.
    for _ in range(THREAD_NUM):
        worker = Workers(Q_SHARE)
        worker.daemon = True
        worker.start()
    # One task per result page; join() blocks until every page is processed.
    for page_index in range(PAGE_SIZE):
        Q_SHARE.put(page_index)
    Q_SHARE.join()
    print("Douban Movie Crawler Ends.")
    # Keys are rank numbers stored as strings — sort numerically.
    ranked = sorted(MY_DIC.items(), key=lambda kv: int(kv[0]))  # ordered list
    movies = [entry for _, entry in ranked]
    writer = mjson.RWfile(OUTPUT)
    writer.write_in(movies)
    # writer.read_out()
    database = mdatabase.DB(DB_NAME, TB_NAME)
    database.db_insert(movies)
    # database.db_retrieval()
    database.db_close()
def main():
    """Crawl a single Weibo user's page and dump the parsed entries to JSON.

    Fetches the page for the module-level UID, parses it into the shared
    DIC mapping, then writes the entries ordered by their numeric key.
    """
    my_spider = WeiBoSpider(UID)
    soup = my_spider.retrieve_page()
    my_spider.retrieve_content(soup)
    # DIC keys are numeric strings — order them as integers.  # ordered list
    ranked = sorted(DIC.items(), key=lambda kv: int(kv[0]))
    entries = [value for _, value in ranked]
    writer = mjson.RWfile(OUTPUT)
    writer.write_in(entries)
def Workers(item):
    """Crawl every page of one genre and persist it to JSON and the database.

    item is a dict describing the genre: 'name', 'order', and 'page_size'
    (number of result pages to fetch).
    """
    collected = collections.OrderedDict()
    # Fetch each page of this genre sequentially; the spider fills `collected`.
    for page in range(item['page_size']):
        spider = model.GenreSpider()
        soup = spider.retrieve_page(item['name'], item['order'], page)
        spider.retrieve_content(soup, collected)
    # Keys are numeric strings — sort by their integer value.
    ranked = sorted(collected.items(), key=lambda kv: int(kv[0]))
    entries = [value for _, value in ranked]
    writer = mjson.RWfile(item['name'].lower() + '.json')
    writer.write_in(entries)
    # writer.read_out()
    database = mdatabase.DB(DB_NAME, item['name'])
    database.db_insert(entries)
    # database.db_retrieval()
    database.db_close()
def Workers(item):
    """Crawl one genre concurrently with gevent and persist the results.

    item is a dict describing the genre: 'name', 'order', and 'page_size'.
    One greenlet per page is spawned via Subworker; all share the same
    ordered dict, which is written out once every greenlet has finished.
    """
    collected = collections.OrderedDict()
    # Fan out one greenlet per page; Subworker fills `collected` in place.
    greenlets = [
        gevent.spawn(Subworker, page, item, collected)
        for page in range(item['page_size'])
    ]
    gevent.joinall(greenlets)
    # Keys are numeric strings — sort by their integer value.
    ranked = sorted(collected.items(), key=lambda kv: int(kv[0]))
    entries = [value for _, value in ranked]
    writer = mjson.RWfile(item['name'].lower() + '.json')
    writer.write_in(entries)
    # writer.read_out()
    database = mdatabase.DB(DB_NAME, item['name'])
    database.db_insert(entries)
    # database.db_retrieval()
    database.db_close()
def main():
    """Crawl the IMDB Top250 list and persist it to JSON and the database.

    The page fits in a single request; the spider fills the module-level
    MY_DIC, which is then written out ordered by numeric rank key.
    """
    print("""
    ###############################
    IMDB Top250 Movies
    Author: Ke Yi
    ###############################
    """)
    print("IMDB Movie Crawler Begins...")
    my_spider = IMDBSpider()
    soup = my_spider.retrieve_page(0)
    my_spider.retrieve_content(soup)
    print("IMDB Movie Crawler Ends...")
    # MY_DIC keys are rank numbers as strings — sort numerically.  # ordered list
    ranked = sorted(MY_DIC.items(), key=lambda kv: int(kv[0]))
    movies = [entry for _, entry in ranked]
    writer = mjson.RWfile(OUTPUT)
    writer.write_in(movies)
    # writer.read_out()
    database = mdatabase.DB(DB_NAME, TB_NAME)
    database.db_insert(movies)
    # database.db_retrieval()
    database.db_close()
follower = []
weibo = []
img = []
# The scraped counters come in repeating triples: following, follower, weibo.
# enumerate() replaces the range(len(...)) anti-pattern of the original.
for i, counter in enumerate(numbers):
    text = counter.text
    if i % 3 == 0:
        following.append(text)
    elif i % 3 == 1:
        follower.append(text)
    else:
        weibo.append(text)
# Each avatar image carries the account name (alt) and picture URL (src).
for image in images:
    name.append(image.get_attribute('alt'))
    img.append(image.get_attribute('src'))
DIC = {}
OUTPUT = 'output_selphan.json'
# Pair the parallel lists positionally; zip stops at the shortest list, so a
# ragged scrape produces fewer records instead of an IndexError as before.
records = zip(name, following, follower, weibo, img)
for i, (who, ing, fer, wb, src) in enumerate(records):
    DIC[str(i)] = collections.OrderedDict([
        ("Name", who),
        ("Following", ing),
        ("Follower", fer),
        ("Weibo", wb),
        ("Image", src),
    ])
# Keys are "0".."n-1" — sorting numerically restores insertion order.
ol = sorted(DIC.items(), key=lambda kv: int(kv[0]))  # ordered list
ol = [entry for _, entry in ol]
my_file = mjson.RWfile(OUTPUT)
my_file.write_in(ol)
driver.quit()