def main():
    # project dir
    create_dir(ROOT)
    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # read the URL list
    file = open('msglist.json')
    text = file.read()
    file.close()
    urls = json.loads(text)

    urls_visited = []
    if os.path.exists('visited.txt'):
        file = open('visited.txt', 'r')
        for line in file:
            urls_visited.append(line.rstrip())

    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print 'visited', url
            continue
        urlmap[url] = title
        queue.put(url)

    # start
    file = open('visited.txt', 'a')
    while queue.empty() == False:
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print "analyse ", url
        logging.info('now analyse %s', url)
        images = Spider.analyse()
        queue.task_done()
        visited.add(url)
        save(images, urlmap[url])
        file.write(url + '\n')
        file.flush()
    file.close()

    print 'finished'
    logging.info('finished')
def get_history():
    """Download historical vessel schedules."""
    src_dir = get_src_dir_path(__file__)
    data_dir = src_dir.parent / "data"

    start_date = datetime.date(year=2020, month=1, day=1)
    end_date = datetime.date(year=2020, month=1, day=10)
    time_delta = datetime.timedelta(days=1)

    file_paths = []
    jocasta = Spider()
    while start_date <= end_date:
        time.sleep(random.randrange(0, 5, 1))
        file_path = jocasta.crawl(data_dir, date=start_date)
        file_paths.append(file_path)
        start_date += time_delta

    for file_path in file_paths:
        if file_path.endswith('.xlsx'):
            print("unhandled xlsx file!")
        elif file_path.endswith('.pdf'):
            tables = Wrangler.parse_pdf(file_path)
        else:
            raise ValueError(f"Unexpected file type: {file_path}")

    return None
def main():
    url = "https://www.readmorejoy.com"
    args = sys.argv[0:]
    # Execute function depending on arguments
    # print(len(args))
    if len(args) == 1:
        print_help()
    elif len(args) == 2:
        url = args[1]
        # print(url)
        print("start a web crawling")
        spider = Spider(url)
        spider.crawl(url)
        print("web crawling done")
    else:
        print_help()
def update(data_dir):
    print("Crawling...")
    url = "https://civilization.fandom.com/wiki/Leaders_(Civ6)"
    response = Spider.crawl(url)
    print("Processing html...")
    records = Wrangler.process_html(response, data_dir)
    return records
def run():
    req_data = request.get_json()
    email = None
    if 'email' in req_data:
        email = req_data['email']
    password = None
    if 'password' in req_data:
        password = req_data['password']
    url_list = get_urls()
    if email and password and url_list:
        spidy = Spider()
        url_list = spidy.modify_urls(url_list)
        spidy.crawl(url_list=url_list, email=email, password=password)
        return '''
            Done
        '''
    else:
        return '''
            ERROR in email or password or while fetching urls
        '''
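# The handler above reads the crawl credentials from a JSON body via
# request.get_json(), i.e. a Flask-style POST endpoint. A minimal sketch of a
# payload that would satisfy it; the values are placeholders, not taken from
# the original snippet:
#
#   {"email": "user@example.com", "password": "secret"}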
def main():
    # Fetch the arguments; the first element in sys.argv is this Python file itself, so ignore it
    args = sys.argv[1:]
    # Execute function depending on arguments
    if len(args) == 1:
        if args[0] == "-test":
            # test
            test = Test()
            test.test_all()
        elif args[0] == "-help":
            # help
            print_help()
        else:
            print_help(True)
    elif len(args) == 2:
        if args[0] == "-c":
            # crawl
            url = args[1]
            print("[crawler.py] start crawling")
            spider = Spider()
            spider.crawl(url)
            print("[crawler.py] crawling done")
        else:
            print_help(True)
    else:
        print_help(True)
def main():
    # project dir
    create_dir(ROOT)
    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)
    queue.put(URL)

    # start
    while queue.empty() == False:
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        html = Spider.crawl(url)
        images = Spider.analyse(html)
        links = Spider.analyse_links(html)
        queue.task_done()
        visited.add(url)
        save(images)

        # new urls
        for link in links:
            if (link not in visited) and link[0:18] == 'http://pp.163.com/':
                exist = False
                for ignore in IGNORES:
                    match = re.search(re.compile(ignore), link)
                    if match:
                        # logging.info("exclude %s", link)
                        exist = True
                        break
                if exist == False:
                    queue.put(link)

    print 'done'
class Helper(object):
    def __init__(self, url, cook):
        self.spider = Spider(url, cook)
        self.content = ''
        self.mail_helper = Mailhelper()

    def refresh(self):
        """Re-crawl the page content."""
        self.content = self.spider.crawl()

    def is_new_msg(self):
        """Check whether the crawled content is a new post."""
        if os.path.exists('weibo.txt'):
            with open('weibo.txt', 'r') as fi:
                # The split drops the timestamp line of the post, so an old
                # message is not mistaken for a new one.
                txt_content = '\n'.join(fi.read().split('\n')[:-1])
            new_content = '\n'.join(self.content.encode('utf8').split('\n')[:-1])
            # Compare values, not identities ('is' would almost always be False here).
            if new_content == txt_content:
                return False
            else:
                return True
        else:
            return True

    def send_mail(self):
        """Send the content to the configured mailbox."""
        to_list = ['*****@*****.**']  # recipient addresses
        sub = u'微博更新'  # mail subject ("Weibo update")
        if self.mail_helper.send_mail(to_list, sub, self.content):
            print u'发送成功!'  # "sent successfully"
        else:
            print u'发送失败!'  # "sending failed"
shortcut = Student(2013217413, '123456789012', 'XC')
c = pymongo.MongoClient()
db = c['hfut']
init_db(db)

# Initialize the job pool.
# A suitable database job pool size and buffer size make better use of the bandwidth.
# The maximum number of database records in flight is db_pool_size * batch_size
# (20 * 80 = 1600 here).
# A request pool larger than 20 easily triggers server errors and empty results.
job_manager = JobManager(pool_size=20)
db_manager = DatabaseManager(db, batch_size=80)
j = Spider(shortcut, job_manager, db_manager)
j.crawl()

# def patch():
#     for i in range(21, 31):
#         term = '%03d' % i
#         yield term, None, '_'
#         # for args in j.iter_teaching_class(term, course_name='_'):
#         #     yield args
#
# jobs = (patch, j.iter_teaching_class, j.sync_students)
# job_manager.jobs = jobs
#
# logger.info('Crawl start!'.center(72, '='))
# job_manager.start()
# logger.info('Jobs are all dispatched. Waiting for database requests handling.')
def startCrawl(self):
    spider = Spider(self.userList)
    spider.crawl(self.hasProcessed)
log_it("TEMP DIR",temp_dire) if os.path.exists(temp_dire): shutil.rmtree(temp_dire) distutils.dir_util.copy_tree(src_dir,temp_dire) owd = os.getcwd() log_it("LOG","Crawling started") spider = Spider(temp_dire) log_it("LOG","Crawling done") # spider.crawl() log_it("LOG","Compileing pages started") posts_data=[] for post_folder in spider.crawl(): config = json.load(open(os.path.join(post_folder,"__pub.lish"))) t_date = time.strptime(config['date'],"%Y-%m-%d") posts_data.append({ 'title': config['name'].replace('-', ' '), 'url' : post_folder[len(temp_dire)+1:], 'year' : time.strftime("%Y",t_date), 'day' : time.strftime("%d",t_date), 'month': time.strftime("%b",t_date), 'date' : t_date }) compiler = Compilers[config['type']] owd = os.getcwd() os.chdir(post_folder) compiler.compile(config['file']) os.chdir(owd)
def work():
    while True:
        url = queue.get()
        Spider.crawl(threading.current_thread().name, url)
        queue.task_done()
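# A minimal sketch of how a worker like work() is typically wired up with the
# standard library queue and threading modules. The global name `queue` matches
# the one used above; the worker count and the way URLs are enqueued are
# assumptions for illustration, not part of the original snippet.
import threading
import queue as _queue

queue = _queue.Queue()

def start_workers(urls, num_workers=4):
    for url in urls:
        queue.put(url)
    for _ in range(num_workers):
        # Daemon threads exit with the main program once queue.join() returns.
        threading.Thread(target=work, daemon=True).start()
    queue.join()  # block until every queued URL has been marked task_done() in work()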
conn = MySQLdb.connect(host=config.db_host,
                       user=config.db_user,
                       passwd=config.db_password,
                       db=config.db_database,
                       charset='utf8')
cursor = conn.cursor()
cursor.execute('select configValue from t_spider_config where configKey=%s',
               (arg_config.get(sys.argv[1]),))
config_values = [row[0] for row in cursor.fetchall()]

if sys.argv[1] == 'paper':
    spider_paper = Spider('paper')
    for search_exp in config_values:
        reqs = parser.paper_page_parser(search_exp)[:500]
        for req in reqs:
            spider_paper.add_request(req)
    spider_paper.crawl()

if sys.argv[1] == 'news':
    spider_news = Spider('news')
    for seed_url in config_values:
        spider_news.add_request(Request(arg=seed_url, parser=parser.news_parser))
    spider_news.crawl()

if sys.argv[1] == 'patent':
    spider_patent = Spider('patent')
    for search_exp in config_values:
        spider_patent.add_request(Request(arg=search_exp, parser=parser.patent_parser))
    spider_patent.crawl()
def test2():
    sp = Spider('reddit.com')
    sp.crawl_robots()
    sp.test()
    sp.crawl()
def crawl_t():
    waiting_list_count = len(Spider.waiting_list)
    while len(Spider.waiting_list) > 0:
        # print("No Of Links Waiting To be Further Crawled: " + str(len(Spider.waiting_list)))
        url = Spider.waiting_list.pop()
        Spider.crawl("Spider", url)
def search(username):
    print(username)
    spider = Spider()
    spider.crawl(username)
    return render_template('detail.html')