def __init__(self):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    self._use_log()
    try:
        # sys.argv[1:] is a list; evaluate the first argument string instead
        self.args_dict = eval(sys.argv[1])
        if not isinstance(self.args_dict, dict):
            raise ValueError('args must be like key-value')
    except Exception as e:
        self.args_dict = {}
        logging.warning('get args failed:{}'.format(e))
    self.proxies = self.args_dict.get('proxies')  # proxy settings
    self.hdfs = self.args_dict.get('hdfs', {})  # HDFS settings
    # Abort immediately if either of these two arguments is missing
    if not self.hdfs or not self.proxies:
        raise ValueError('args not have hdfs or proxies')
    self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
    self.service_args = self.args_dict.get('service_args', {})  # PhantomJS proxy settings
    self.aliyun_log = self.args_dict.get('aliyun_log', {})
    # Aliyun log settings; still needs a check for whether missing parameters raise an error
    self.alilog = AliyunLog(
        '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
        endp=self.aliyun_log.get('endpoint', endpoint),
        accid=self.aliyun_log.get('accessKeyId', accessKeyId),
        acckey=self.aliyun_log.get('accessKey', accessKey),
        proj=self.aliyun_log.get('project', project),
        logst=self.aliyun_log.get('logstore', logstore))
    try:
        self.HDFS = HDFileSystem(host=self.hdfs.get('ip', '192.168.100.178'),
                                 port=self.hdfs.get('port', 8020))
    except Exception as e:
        logging.warning('HDFS connection failed:{}'.format(e))
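# The constructor above expects a dict literal on the command line and parses it with
# eval(). A minimal alternative sketch, assuming the caller can pass the same settings
# as a JSON string instead; the function and argument names here are illustrative only:
import json
import logging
import sys

def load_args_from_argv(argv):
    """Parse a JSON settings dict from the first CLI argument, e.g.
    python engine.py '{"proxies": {"http": "..."}, "hdfs": {"ip": "...", "port": 8020}}'"""
    try:
        args = json.loads(argv[1])
        if not isinstance(args, dict):
            raise ValueError('args must be a JSON object')
        return args
    except (IndexError, ValueError) as e:
        logging.warning('get args failed: %s', e)
        return {}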
def cmd_crawl(args, options):
    if len(args) != 1:
        logging.error("Missing build URL")
        return 1
    if options.to_file and not os.path.exists(options.to_file):
        os.mkdir(options.to_file)
    if options.from_file and not os.path.exists(options.from_file):
        os.mkdir(options.from_file)
    db = open_db(options)
    crawl = Crawl(db, options)
    if options.reverse:
        roots = crawl.reverse_crawl(args[0])
    else:
        roots = crawl.crawl(args[0])
    close_db(db)
    stat = roots[0].extra
    logging.info("Started: %s\n\tend: %s\n\telapsed: %s\n\tduration: %ss\n\tNb builds: %s\n\tthroughput: %s\n" % (
        stat['start'], stat['stop'], stat['elapsed'], stat['duration'], stat['count'], stat['throughput']))
    if not options.output:
        svg_file = roots[0].getId() + ".svg"
    else:
        svg_file = options.output
    graphviz(roots, svg_file)
    logging.info("%s generated." % svg_file)
    return 0
def thread_func(filename, cur):
    c = Crawl()
    # Read the uploaded file line by line
    f = open('uploads/' + filename, 'r')
    i = 1
    while 1:
        print(cur, i)
        line = f.readline().strip('\n')
        if i <= cur:
            i = i + 1
            continue
        rs = Setting.query.filter_by(name='is_crawl').first()
        if rs.value == '0':
            break
        if not line:
            break
        time.sleep(1)
        flag = c.crawl(line)
        if flag:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line), status="2", opt_time=int(time.time())))
            db.session.commit()
        else:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line), status="1", opt_time=int(time.time())))
            db.session.commit()
    pass  # do something
    f.close()
def __init__(self, goal, time):
    '''goal = today's subscription target (increase); time = refresh interval (minutes)'''
    self.goal = goal
    self.time_in_seconds = time * 60
    self.c = Crawl(goal)  # initialize the crawler
    # Set up the GUI
    self.root = Tk()
    # Initial window position: width x height + right offset + down offset
    self.root.geometry('220x45+40+560')
    self.root.title('就是要莽')
    top_frame = Frame(self.root)  # upper frame shows the counters
    top_frame.pack(fill=BOTH)
    self.label_text1 = StringVar()
    self.label_text1.set('今日订阅:')
    text_label = Label(top_frame, textvariable=self.label_text1, font="32")
    text_label.grid(row=0, sticky='w')
    self.cur_num = StringVar()  # current subscriber count
    num_label = Label(top_frame, textvariable=self.cur_num, fg="red", font="28")
    num_label.grid(row=0, column=1, sticky='e')
    self.label_text2 = StringVar()
    self.label_text2.set('/' + str(self.goal))
    objective_label = Label(top_frame, textvariable=self.label_text2, font="28")
    objective_label.grid(row=0, column=2, sticky='w')
    top_frame.columnconfigure(0, weight=4)  # adjust widget layout
    top_frame.columnconfigure(1, weight=2)
    top_frame.columnconfigure(2, weight=2)
    bottom_frame = Frame(self.root)  # lower frame holds the manual refresh buttons
    bottom_frame.pack(fill=BOTH, side=BOTTOM)
    refresh_button = Button(bottom_frame, text='手动刷新', font="25")
    refresh_button.bind('<Button-1>', self.refresh)
    refresh_button.grid(row=0, column=0, sticky=("N", "S", "E", "W"))
    fans_button = Button(bottom_frame, text='当前订阅', font="25")
    fans_button.bind('<Button-1>', self.refresh_total_fans)
    fans_button.grid(row=0, column=1, sticky=("N", "S", "E", "W"))
    bottom_frame.columnconfigure(0, weight=1)
    bottom_frame.columnconfigure(1, weight=1)
    self.root.rowconfigure(0, weight=3)  # adjust widget layout
    self.root.rowconfigure(1, weight=1)
    t = threading.Thread(target=self.start_crawl)  # start crawling
    t.daemon = True
    t.start()
    self.root.mainloop()
def wrap_crawl(url, threads, user_agent, proxy, timeout, obey_robots, max_urls, data_format):
    freeze_support()
    seo = Crawl(url, threads=threads, user_agent=user_agent, proxy=proxy, timeout=timeout,
                obey_robots=obey_robots, max_urls=max_urls, data_format=data_format)
    seo.run_crawler()
def crawl(self):
    crawl = Crawl()
    proxies = []
    self.logger.info('crawl beginning -------')
    for parser in PARSER_LIST:
        for url in parser['urls']:
            self.logger.info('crawling {0}'.format(url))
            result = crawl.run(url, parser)
            proxies.extend(result)
    self.logger.info('crawl end -------\n'
                     'crawl {0} ips'.format(len(proxies)))
    return proxies
class Environment(Base):
    def __init__(self, root="."):
        self.search_path = Crawl(root)
        self.version = ''
        self.cache = None
        self.engines = copy.deepcopy(engine_registry)
        self.mimetypes = copy.deepcopy(mimetype_registry)
        self.processors = copy.deepcopy(processor_registry)

        class ctx(Context):
            pass

        self.context_class = ctx
        for path in path_registry.paths:
            self.search_path.append_path(path)
        for extension in self.mimetypes.mimetypes.keys():
            self.search_path.append_extension(extension)
        for ext, engine in self.engines.engines.iteritems():
            self.add_engine_to_search_path(ext, engine)

    @property
    def index(self):
        return Index(self)

    def find_asset(self, path, **options):
        if not options:
            options = {}
        if not options.has_key('bundle'):
            options['bundle'] = True
        key = self.cache_key_for(path, **options)
        asset = self.assets[key] if self.assets.has_key(key) else None
        if asset and asset.is_fresh(self):
            return asset
        else:
            asset = self.index.find_asset(path, **options)
            if asset:
                return asset
        return None

    def expire_index(self):
        self._digest = None
        self.assets = {}
def __init__(self, name):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    self.options = webdriver.ChromeOptions()
    # Set the default download directory
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': os.path.abspath('DATA')
    }
    self.options.add_experimental_option('prefs', prefs)
    self.driver = webdriver.Chrome(chrome_options=self.options)
    self.realname = name
    self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper()
def __init__(self):
    init_params = {
        'site_name': SITE_NAME,
        'init_url': INIT_URL,
        'skip_url': SKIP_URL,
        'redis_crawling_urls': REDIS_CRAWLING_URLS,
        'redis_crawled_urls': REDIS_CRAWLED_URLS,
        'redis_product_urls': REDIS_PRODUCT_URLS,
        'product_pattern': PRODUCT_PATTERN,
        'process_num': PROCESS_NUM,
        'use_tor': USE_TOR
    }
    Crawl.__init__(self, **init_params)
    # select collection
    self.mongo_collection = self.mongo_conn['nguyenkim_product']
def __init__(self):
    self.count = {
        'count': 0,                     # total crawled
        'failed_count': 0,              # total failed
        'sucess_count': 0,              # total successfully crawled
        'start_time': time.asctime(),   # start time
        'end_time': 0,                  # end time
    }
    self.endtime = time.localtime().tm_min + 1
    self.proxy = next(proxies)
    self.Crawl = Crawl()
    self.Crawl.proxy = self.proxy
    self.Taskqueue = Queue()
    self.Urlqueue = Queue()
def start_crawl(self):
    # Validation for the inputs. (if i got time)
    # Start the crawl
    Crawl(self.website_url_input.get(), self.crawl_depth_input.get(), self.user_defined_regex_input.get())
    print("Crawl finished")
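# start_crawl() above notes that input validation was left out. A small sketch of the
# kind of check that could run before constructing Crawl(); the helper name is
# illustrative and not part of the original project:
from urllib.parse import urlparse

def looks_like_url(candidate):
    """Return True when the string has an http(s) scheme and a host part."""
    parsed = urlparse(candidate)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

# Example: looks_like_url('https://example.com/page') -> True; looks_like_url('not a url') -> False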
def __init__(self):
    init_params = {
        'site_name': SITE_NAME,
        'init_url': INIT_URL,
        'skip_url': SKIP_URL,
        'redis_crawling_urls': REDIS_CRAWLING_URLS,
        'redis_crawled_urls': REDIS_CRAWLED_URLS,
        'redis_product_urls': REDIS_PRODUCT_URLS,
        'product_pattern': PRODUCT_PATTERN,
        'process_num': PROCESS_NUM,
        'use_tor': USE_TOR
    }
    Crawl.__init__(self, **init_params)
    # select collection
    self.mongo_collection = self.mongo_conn['tiki_product']
    self.page_link_format = re.compile(r'(.*)\?.*(p=\d+).*', re.MULTILINE | re.DOTALL)
def __init__(self):
    init_params = {
        'site_name': SITE_NAME,
        'init_url': INIT_URL,
        'skip_url': SKIP_URL,
        'redis_crawling_urls': REDIS_CRAWLING_URLS,
        'redis_crawled_urls': REDIS_CRAWLED_URLS,
        'redis_product_urls': REDIS_PRODUCT_URLS,
        'product_pattern': PRODUCT_PATTERN,
        'process_num': PROCESS_NUM,
        'use_tor': USE_TOR
    }
    Crawl.__init__(self, **init_params)
    # select collection
    self.mongo_collection = self.mongo_conn['lazada_product']
    self.page_link_format = re.compile(r"(.*)\?.*(page=\d+).*", re.MULTILINE | re.DOTALL)
def crawl_price(self, item_id_inner, proxy_inner, mall_id_inner):
    if mall_id_inner == '1':
        crawl = Crawl()
        item_price_inner = crawl.get_price_jd(item_id_inner, proxy_inner)
        return item_price_inner
    elif mall_id_inner == '2':
        # crawl = Crawl()
        # item_price_inner = crawl.get_price_tm(item_id_inner, proxy_inner)
        # return item_price_inner
        temp_item_price = '-1'
        return temp_item_price
    elif mall_id_inner == '3':
        # crawl = Crawl()
        # item_price_inner = crawl.get_price_tb(item_id_inner, proxy_inner)
        # return item_price_inner
        temp_item_price = '-1'
        return temp_item_price
    else:
        return '-1'
def evaluation_chart(self):
    # Table name for the sales ranking data
    sales_volume_rankings_table_name = 'sales_volume_rankings'
    # Table name for the hot-review ranking data
    heat_rankings_table_name = 'heat_rankings'
    # Create the custom database object
    mysql = MySQL()
    # Create the crawler object
    mycrawl = Crawl()
    # Connect to the database
    sql = mysql.connection_sql()
    # Create a cursor
    cur = sql.cursor()
    good_rate_list = []  # list of positive-review rates
    # Query the followed books for their JD ids
    attention_message = mysql.query_attention(cur, 'jd_id,book_name', sales_volume_rankings_table_name, "attention = '1'")
    for i in range(len(attention_message)):
        # Get the positive-review rate and the review time
        good_rate, time = mycrawl.get_evaluation(0, attention_message[i][0])
        # Append the followed product's name and positive-review rate to the list
        good_rate_list.append((attention_message[i][1], good_rate))
        # First followed product
        if i == 0:
            plt1 = PlotCanvas()  # create the chart canvas object
            # Show the review analysis chart
            plt1.pie_chart(good_rate_list[0][1], (100 - good_rate_list[0][1]), good_rate_list[0][0])
            # Add the chart to the layout
            self.horizontalLayout_0.addWidget(plt1)
        # Second followed product
        if i == 1:
            plt2 = PlotCanvas()
            plt2.pie_chart(good_rate_list[1][1], (100 - good_rate_list[1][1]), good_rate_list[1][0])
            self.horizontalLayout_1.addWidget(plt2)
        # Third followed product
        if i == 2:
            plt3 = PlotCanvas()
            plt3.pie_chart(good_rate_list[2][1], (100 - good_rate_list[2][1]), good_rate_list[2][0])
            self.horizontalLayout_2.addWidget(plt3)
    mysql.close_sql()  # close the database connection
def crawl_name(self, item_id_inner, proxy_inner, mall_id_inner):
    if mall_id_inner == '1':  # jd
        crawl = Crawl()
        item_name_inner = crawl.get_name_jd(item_id_inner, proxy_inner)
        return item_name_inner
    elif mall_id_inner == '2':  # tm
        # crawl = Crawl()
        # item_name_inner = crawl.get_name_tm(item_id_inner, proxy_inner)
        # return item_name_inner
        temp_item_name = '天猫价格抓取正在攻克中,名称暂不显示'
        return temp_item_name
    elif mall_id_inner == '3':  # tb
        # crawl = Crawl()
        # item_name_inner = crawl.get_name_tb(item_id_inner, proxy_inner)
        # return item_name_inner
        temp_item_name = '淘宝价格抓取正在攻克中,名称暂不显示'
        return temp_item_name
    else:
        return '该商品未设定商城名'
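# crawl_price() and crawl_name() above repeat the same if/elif dispatch on mall_id_inner.
# A sketch of how a lookup table could express that routing once the Tmall/Taobao fetchers
# are ready; the structure and function name are illustrative only, not the project's code:
from crawl import Crawl  # import path assumed, following the other snippets in this file

def crawl_price_dispatch(item_id, proxy, mall_id):
    crawl = Crawl()
    fetchers = {
        '1': crawl.get_price_jd,    # JD, implemented above
        # '2': crawl.get_price_tm,  # Tmall, still commented out in the original
        # '3': crawl.get_price_tb,  # Taobao, still commented out in the original
    }
    fetcher = fetchers.get(mall_id)
    return fetcher(item_id, proxy) if fetcher else '-1'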
def main():
    try:
        name = prompt()
        # create authenticated twitter api object
        auth = authenticate.Authenticate(creds_file='twitter_creds.BU')
        # crawl the given twitter profile for reciprocal friends
        crawl = Crawl(twitter_api=auth.twitter_api, screen_name=name, node_max=100)
        # crawl = Crawl(twitter_api=auth.twitter_api, screen_name='smerconish', node_max=100)
        crawl.crawl_followers()
        crawl.file_output.close()  # close file
        # create a graph object using networkx and visualize it using graphviz
        g = Graph(use_name=True, twitter_api=auth.twitter_api, screen_name=name)
    except Exception as e:
        print(traceback.format_exc())
def __init__(self, root="."):
    self.search_path = Crawl(root)
    self.version = ''
    self.cache = None
    self.engines = copy.deepcopy(engine_registry)
    self.mimetypes = copy.deepcopy(mimetype_registry)
    self.processors = copy.deepcopy(processor_registry)

    class ctx(Context):
        pass

    self.context_class = ctx
    for path in path_registry.paths:
        self.search_path.append_path(path)
    for extension in self.mimetypes.mimetypes.keys():
        self.search_path.append_extension(extension)
    for ext, engine in self.engines.engines.iteritems():
        self.add_engine_to_search_path(ext, engine)
def __init__(self, email='', rate=60, note=60 * 60):
    config = 'config.cfg'
    cfg = configparser.ConfigParser()
    parentDirPath = os.path.dirname(os.path.abspath(__file__))
    path = parentDirPath + '/config/' + config
    cfg.read(path)
    self.option = cfg.get('select', 'option')
    self.scheduler = sched.scheduler(time.time, time.sleep)
    self.goods_dict = {}
    self.db = DB()
    self.crawl = Crawl()
    self.mail = Mail()
    self.ding = Dingding()
    self.email = [email]  # e-mail address
    self.rate = rate      # refresh interval
    self.note = note      # notification interval
    # Load saved data
    result = self.db.query()
    print('----------加载数据----------')
    for id, item in result.items():
        self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
        print(self.goods_dict[id].__dict__)
    print('----------加载完成----------')
def get_data(usr_id, token):
    sys.path.append('../')
    from crawl import Crawl
    import time
    c = Crawl()
    print 'Start web crawl.'
    c.update([usr_id], token_list=[token])
    c.update_img([usr_id], token_list=[token])
    c.update_voice([usr_id], token_list=[token])
    print 'Crawl is finished.'
    print 'Start analysis.'
    # os.system('java -Djava.ext.dirs=../../predict/lib -jar ../../predict/predictor.jar ../../analysis/data_json/'+usr_id)
    os.system('java -Djava.ext.dirs=./lib -jar predictor.jar ../../analysis/data_json/' + usr_id)
    print 'Analysis is finished.'
    global five_result
    # with open('../../predict/predict_result/'+usr_id+'.txt') as ifile:
    with open('predict_result/' + usr_id + '.txt') as ifile:
        five_result = eval(ifile.read())
    global finished
    finished = True
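# get_data() above loads the prediction result with eval(). When that file only ever
# contains a Python literal (dict/list/number), ast.literal_eval is a drop-in that refuses
# arbitrary code; a small sketch under that assumption, with an illustrative helper name:
import ast

def load_predict_result(path):
    with open(path) as ifile:
        return ast.literal_eval(ifile.read())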
class Work:
    def __init__(self):
        self.c = Crawl()
        self.e = Excel()

    def thread_it(self, func):
        # Create a thread
        t = threading.Thread(target=func)
        # Make it a daemon thread
        t.setDaemon(True)
        # Start it
        t.start()

    def setUp(self):
        # pb.start()
        self.c.setUp()
        # pb.stop()

    def crawl(self):
        var.set('')
        start_row = int(start.get())
        end_row = int(end.get())
        list = self.e.get_title_list(start_row, end_row)  # title_list
        print(list, flush=True)
        self.c.crawl(list)
        time.sleep(2)
        start.delete(0, tk.END)
        end.delete(0, tk.END)
        time.sleep(1)
        start.insert(0, end_row + 1)
        end.insert(0, end_row + 4)
        num = end_row - start_row + 1
        var.set('请输入' + str(num) + '个结果 ')
        # num_list = c.insert()
        # self.e.write_num(num_list)

    def insert(self):
        num = e.get()
        num_list = [int(i) for i in re.split('[,,]', num)]
        print(num_list, flush=True)
        self.e.write_num(num_list)
        e.delete(0, tk.END)
        var.set('数据已导入 ')

    def tearDown(self):
        self.c.tearDown()
class Monitor:
    def __init__(self, email='', rate=60, note=60 * 60):
        config = 'config.cfg'
        cfg = configparser.ConfigParser()
        parentDirPath = os.path.dirname(os.path.abspath(__file__))
        path = parentDirPath + '/config/' + config
        cfg.read(path)
        self.option = cfg.get('select', 'option')
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.goods_dict = {}
        self.db = DB()
        self.crawl = Crawl()
        self.mail = Mail()
        self.ding = Dingding()
        self.email = [email]  # e-mail address
        self.rate = rate      # refresh interval
        self.note = note      # notification interval
        # Load saved data
        result = self.db.query()
        print('----------加载数据----------')
        for id, item in result.items():
            self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
            print(self.goods_dict[id].__dict__)
        print('----------加载完成----------')

    # Add a product
    def add(self, id, want, status=True, dname=''):
        if id not in self.goods_dict.keys():
            self.db.add(id, want, status, dname)
            goods = Goods(id, want, status, dname)
            name, price, date = self.crawl.get(id)
            goods.update(name, price, date)
            self.goods_dict[id] = goods
            print(self.goods_dict[id].__dict__)
            return True
        else:
            return False

    # Remove a product
    def remove(self, id):
        if id in self.goods_dict.keys():
            self.goods_dict.pop(id)
            self.db.delete(id)
            return True
        else:
            return False

    # Update the target price
    def update_want(self, id, want):
        if id in self.goods_dict.keys():
            self.goods_dict[id].update_want(want)
            self.goods_dict[id].update_note(0)  # reset the notification time
            self.db.update_want(id, want)
            return True
        else:
            return False

    # Update the running status
    def update_status(self, id, status):
        if id in self.goods_dict.keys():
            self.goods_dict[id].update_status(status)
            self.goods_dict[id].update_note(0)  # reset the notification time
            self.db.update_status(id, status)
            return True
        else:
            return False

    # Get the price history
    def history(self, id):
        if id in self.goods_dict.keys():
            return self.crawl.get_history(id)
        else:
            return ''

    # Scheduled task
    def task(self):
        ids = list(self.goods_dict.keys())
        for id in ids:
            goods = self.goods_dict[id]
            if goods.status:
                name, price, date = self.crawl.get(id)
                if id not in self.goods_dict.keys():
                    continue  # guard against products removed in the meantime
                goods.update(name, price, date)
                # Check whether a notification should be sent:
                # the notification interval has elapsed and the current price is at or below the target
                if (date - goods.note >= self.note) and (price <= goods.want):
                    if self.option == 'mail':
                        print('邮件发送')
                        self.mail.send(self.email, name, price, goods.want, goods.url)
                    else:
                        print('钉钉发送')
                        self.ding.send(name, price, goods.want, goods.url)
                    goods.update_note(date)
        print('----------刷新数据----------')
        for goods in self.goods_dict.values():
            print(goods.__dict__)
        print('----------刷新完成----------')

    # Timer loop
    def _run(self):
        self.scheduler.enter(self.rate, 0, self._run, ())  # delay, priority, action, argument=()
        self.task()

    # Start the timer
    def run(self):
        self.scheduler.enter(0, 0, self._run, ())  # delay, priority, action, argument=()
        self.scheduler.run()
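# Monitor above keeps itself running by re-arming the sched event inside _run() before
# doing the work. A minimal standalone sketch of that pattern, with illustrative names
# that are not part of the project:
import sched
import time

def make_periodic(scheduler, interval, action):
    """Re-run `action` every `interval` seconds, mirroring Monitor._run()."""
    def _tick():
        scheduler.enter(interval, 0, _tick)  # schedule the next run first
        action()                             # then do the work
    scheduler.enter(0, 0, _tick)             # kick off immediately

if __name__ == '__main__':
    s = sched.scheduler(time.time, time.sleep)
    make_periodic(s, 5, lambda: print('checking prices...'))
    s.run()  # blocks, like Monitor.run()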
from crawl import Crawl
from crawley import Crawley

crawley = Crawley()
crawley.welcome()
url, levels, user_defined_regex = crawley.user_input()
crawl = Crawl(url, levels, user_defined_regex)
crawl.perform_crawl()
# crawl.test_variables()
crawl.save_report()
crawley.report()
while True:
    if crawley.crawl_another() == True:
        url, levels, user_defined_regex = crawley.user_input()
        crawl = Crawl(url, levels, user_defined_regex)
        crawl.perform_crawl()
        # crawl.test_variables()
        crawl.save_report()
        crawley.report()
    else:
        crawley.goodbye()
        break
def __init__(self):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy()) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath( each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_amuse_link(self): """ 获取每个城市中所有的娱乐场所的链接 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) for each_city in city_list: try: url = each_city.strip().split('\u0001')[1] + '-wanle' name = each_city.strip().split('\u0001')[0] params_city = {'page': 0} maxpage = 200 # 默认最大页数 while True: save_list = [] params_city['page'] += 1 content = self.crawl.crawl_by_get( url, headers=setting.HEADERS, params=params_city, proxies=self._engine_use_proxy(), retry=2, timeout=15) if not content: break # 获取总页数 if params_city['page'] == 1: # 找到最大页数,使用map函数 pagecount = map( lambda x: int(x) if x != '下一页' else -1, self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_NEXTPAGE)) try: maxpage = max(pagecount) except: break element_li = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_LI) if not element_li: break for each_ele in element_li: amuse_name = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_NAME) amuse_type = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_TYPE) amuse_url = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_URL) try: save_info = '{}\u0001{}\u0001{}\u0001{}'.format( name, ''.join(amuse_name), ''.join(amuse_type), ''.join(amuse_url)) except: continue save_list.append(save_info) self.pipe.pipe_txt_save(save_list, filename=setting.FILE_AMUSE_LIST, savetype='a') if params_city['page'] >= maxpage: break time.sleep(0.2) except: continue def _engine_amuse_info(self): """ 获取所有娱乐场所详细数据 :return: """ amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST) for each_amuse in amuse_list: try: # 娱乐场所数据 amuse_info = each_amuse.strip().split('\u0001') city_name = amuse_info[0] amuse_name = amuse_info[1] amuse_type = amuse_info[2] amuse_url = amuse_info[3] find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url) if find_id: amuse_id = find_id.group(1) else: amuse_id = 0 # 获取娱乐场所详细信息 content = self.crawl.crawl_by_get( amuse_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=5, timeout=10) detail = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_AMUSE_DETAIL) detail['city_name'] = city_name detail['amuse_name'] = amuse_name detail['amuse_type'] = amuse_type detail['amuse_url'] = amuse_url detail['amuse_id'] = amuse_id detail['get_time'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') # 存储数据 # 字段顺序 # city_name, amuse_name, amuse_type, amuse_id, # score, ranking, describe, address, tel, open_time, arrive, intro, web, get_time, amuse_url save_data = '{0[city_name]}\u0001{0[amuse_name]}\u0001{0[amuse_type]}\u0001' \ '{0[amuse_id]}\u0001{0[score]}\u0001{0[ranking]}\u0001' \ '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001' \ '{0[open_time]}\u0001{0[arrive]}\u0001{0[intro]}\u0001' \ 
'{0[web]}\u0001{0[get_time]}\u0001{0[amuse_url]}\u0001'.format(detail) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_AMUSE_INFO, savetype='a') # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info') time.sleep(0.1) except Exception as e: print('crawl error', e) continue def _engine_amuse_comments(self): """ 获取所有购物店评论数据 :return: """ amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST) # 每个店铺最新评论时间表 check_dict = self.pipe.pipe_pickle_load( filename=setting.FILE_COMMENTS_CHECK) if not check_dict: check_dict = {} for each_amuse in amuse_list: try: # 店铺数据 city = each_amuse.strip().split('\u0001')[0] amuse = each_amuse.strip().split('\u0001')[1] type = each_amuse.strip().split('\u0001')[2] amuse_url = each_amuse.strip().split('\u0001')[3] find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url) if not find_id: break amuse_id = find_id.group(1) api = setting.COMMENTS_API.format(amuse_id) setting.HEADERS_COMMENTS['Referer'] = amuse_url params = { 'page': 0, 'pageSize': '10', 'poiList': 'true', 'rank': 0, # 全部评论 'sortField': 0 # 按照时间排序 } comments_time = set([]) current_time = check_dict.get(amuse_id, '0') max_page = 1 while True: params['page'] += 1 content = self.crawl.crawl_by_get( api, headers=setting.HEADERS_COMMENTS, proxies=self._engine_use_proxy(), params=params, retry=2, timeout=15) try: content_dict = json.loads(content) except: break if not content_dict.get('data'): break content_comments = content_dict.get('data') # 第一遍抓取要确定评论页数 if params['page'] == 1: page = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_PAGE) if page: max_page = int(''.join(page)) elements_com = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: title = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_TITLE) start = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_START) nick = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_NICK) more = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_MORE) if more: content_more = self.crawl.crawl_by_get( more[0], headers=setting.HEADERS, proxies=self._engine_use_proxy()) content = self.analysis.analysis_by_xpath( content_more, xpahter=setting.XPATH_COMMENTS_DETAIL) else: content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_CONTENT) date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_DATE) deal_content = ''.join( list( map( lambda x: x.replace('\n', '').replace( '\r', '').replace('\t', '').replace( ' ', ''), content))) if ''.join(date) > current_time: commetents_info = { 'city': city, 'amuse': amuse, 'amuse_id': amuse_id, 'type': type, 'title': ''.join(title), 'nick': ''.join(nick), 'start': ''.join(start), 'content': deal_content, 'date': ''.join(date), 'get_time': datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'url': amuse_url } for eachkey in commetents_info.keys(): commetents_info[eachkey] = commetents_info[ eachkey].replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序 # city, amuse, amuse_id, type, title, nick, start, content, date, get_time, url save_data = '{0[city]}\u0001{0[amuse]}\u0001{0[amuse_id]}\u0001' \ '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \ '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \ '{0[get_time]}\u0001{0[url]}'.format(commetents_info) self.pipe.pipe_txt_save( save_data, 
filename=setting.FILE_AMUSE_COMMENTS, savetype='a') # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments') comments_time.add(''.join(date)) # 超过评论最大页数则切换 if params['page'] >= max_page: break # 当前页面没有新增评论也切换至下一店铺 if not len(comments_time): break # 每个店铺最新的评论时间 if comments_time: check_dict[amuse_id] = max(comments_time) # 抓取到的评论数据 self.pipe.pipe_pickle_save( check_dict, filename=setting.FILE_COMMENTS_CHECK) except: continue def _temp_city_info(self, cityname): """ 做22项数据处理时临时用 :return: """ citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt') city_params = { '国别': '&', '省自治区全称': '&', '省自治区简称': '&', '市州全称': '&', '市州简称': '&', '区县全称': '&', '区县简称': '&', '地区编码': '&', '等级': '&' } spec_city = { '北京': '110000', '天津': '120000', '上海': '310000', '重庆': '500000' } for each in citylist: cityinfo = each.split('\u0001') if cityname in cityinfo: site = cityinfo.index(cityname) if site == 4 or site == 5: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['区县全称'] = cityinfo[4].strip() city_params['区县简称'] = cityinfo[5].strip() city_params['地区编码'] = cityinfo[-1].strip() city_params['等级'] = '区县级' elif site == 2 or site == 3: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00' city_params['等级'] = '地市级' elif cityname in ['北京', '重庆', '上海', '天津']: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityname + '市' city_params['省自治区简称'] = cityname city_params['市州全称'] = cityname + '市' city_params['市州简称'] = cityname city_params['地区编码'] = spec_city[cityname] city_params['等级'] = '直辖' break return city_params @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass } proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def start_engine(self): # self._engine_city_link() # self._engine_amuse_link() # 店铺信息和店铺评论可以同时抓取的,用多进程实现,后期可根据需求添加该功能,目前未开发循环抓取功能 # self._engine_amuse_info() self._engine_amuse_comments()
# About-window initialization class
class About_Window(QMainWindow, About_MainWindow):
    def __init__(self):
        super(About_Window, self).__init__()
        self.setupUi(self)

    # Open the window
    def open(self):
        self.show()


if __name__ == "__main__":
    # Create the custom database object
    mysql = MySQL()
    # Create the crawler object
    mycrawl = Crawl()
    # Connect to the database
    sql = mysql.connection_sql()
    # Create a cursor
    cur = sql.cursor()
    app = QApplication(sys.argv)
    # Main window object
    main = Main()
    # Show the main window
    main.show()
    # Sales ranking window object
    sales = Sales()
    # Hot-review ranking window object
    heat = Heat()
""" This script just for test and learn how to crawl web pages using python """ from parser import Parser from crawl import Crawl c = Crawl() c.fetch('http://www.blogfa.com/') p = Parser() p.set_html(c.content) p.get_title() p.get_links() print "count of links: %s" % len(p.links) print "title of current url: %s" % p.title
class Engine: def __init__(self): self.crawl = Crawl() self.pipe = Pipeline() self.analysis = Analysis() # def _engine_residential_area_by_json(self): # """ # 获取小区数据,output为json, # 但是高德那边返回的json数据小区更位置对应不上,只能使用xml数据,故不用该模块,使用xml # """ # citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID) # types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID) # current_params = deepcopy(setting.PARAMS) # current_params['key'] = setting.KEY # # 每种类型 # for each_type in types: # typeinfo = each_type.strip().split('\u0001') # type_id = typeinfo[0] # 类型id # type_large = typeinfo[1] # 类型大分类 # type_middle = typeinfo[2] # 类型中分类 # type_small = typeinfo[3] # 类型小分类 # current_params['types'] = type_id # save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small) # # 每个城市 # for each_city in citys: # cityinfo = each_city.strip().split('\u0001') # province = cityinfo[0] # 省名 # city_name = cityinfo[1] # 城市名 # city_id = cityinfo[2] # 城市id # current_params['city'] = city_id # current_params['page'] = 0 # save_data = [] # while True: # current_params['page'] += 1 # content_json = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params, # retry=2, timeout=30) # try: # data_json = json.loads(content_json) # except: # continue # pois_list = data_json.get('pois') # if not pois_list: # break # for each_poi in pois_list: # """ # 字段说明: # id: 唯一ID, name: 名称, pcode: poi所在省份编码, pname: poi所在省份名称,citycode: 城市编码, # cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址, alias: 别名, # biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目, # distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度, # exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, # indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息, # postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, # website: 该POI的网址 # """ # save_dict = {} # save_dict['id'] = each_poi.get('id', '') # id: 唯一ID # save_dict['name'] = each_poi.get('name', '') # name: 名称 # save_dict['pcode'] = each_poi.get('pcode', '') # pcode: poi所在省份编码 # save_dict['pname'] = each_poi.get('pname', '') # pname: poi所在省份名称 # save_dict['citycode'] = each_poi.get('citycode', '') # citycode: 城市编码 # save_dict['cityname'] = each_poi.get('cityname', '') # cityname: 城市名 # save_dict['adcode'] = each_poi.get('adcode', '') # adcode: 区域编码 # save_dict['adname'] = each_poi.get('adname', '') # adname: 区域名称 # save_dict['address'] = each_poi.get('address', '') # address: 地址 # save_dict['alias'] = each_poi.get('alias', '') # alias: 别名 # save_dict['biz_ext'] = each_poi.get('biz_ext', '') # biz_ext: 深度信息 # save_dict['biz_type'] = each_poi.get('biz_type', '') # biz_type: 行业类型 # save_dict['business_area'] = each_poi.get('business_area', '') # business_area: 所在商圈 # save_dict['discount_num'] = each_poi.get('discount_num', '') # discount_num: 优惠信息数目 # save_dict['email'] = each_poi.get('email', '') # email: 该POI的电子邮箱 # save_dict['entr_location'] = each_poi.get('entr_location', '') # entr_location: 入口经纬度 # save_dict['exit_location'] = each_poi.get('exit_location', '') # exit_location: 出口经纬度 # save_dict['gridcode'] = each_poi.get('gridcode', '') # gridcode: 地理格ID # save_dict['groupbuy_num'] = each_poi.get('groupbuy_num', '') # groupbuy_num: 团购数据 # save_dict['indoor_data'] = each_poi.get('indoor_data', '') # indoor_data: 室内地图相关数据 # save_dict['indoor_map'] = each_poi.get('indoor_map', '') # indoor_map: 是否有室内地图标志 # save_dict['location'] = each_poi.get('location', '') # location: 经纬度 # 
save_dict['navi_poiid'] = each_poi.get('navi_poiid', '') # navi_poiid: 地图编号 # photos = each_poi.get('photos', []) # photos: 照片相关信息 # save_dict['photo_info'] = '' # for each_photo in photos: # if isinstance(each_photo.get('title', {}), dict): # each_photo['title'] = 'notitle' # save_dict['photo_info'] += '{0[title]}-{0[url]},'.format(each_photo) # save_dict['postcode'] = each_poi.get('postcode', '') # postcode: 邮编 # save_dict['tag'] = each_poi.get('tag', '') # tag: 该POI的特色内容 # save_dict['tel'] = each_poi.get('tel', '') # tel: 该POI的电话 # save_dict['type'] = each_poi.get('type', '') # type: 兴趣点类型 # save_dict['typecode'] = each_poi.get('typecode', '') # typecode: 兴趣点类型编码 # save_dict['website'] = each_poi.get('website', '') # website: 该POI的网址 # for each_key in save_dict.keys(): # save_dict[each_key] = \ # save_dict[each_key] if not isinstance(save_dict[each_key], dict) else '' # # 存储字段类型 # # id, name, pcode, pname, citycode, cityname, adcode, adname, # # address, alias, biz_type, business_area, discount_num, email, # # entr_location, exit_location, gridcode, groupbuy_num, indoor_data, # # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website, # save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \ # '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \ # '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \ # '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \ # '{0[exit_location]}\u0001' \ # '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \ # '{0[indoor_map]}\u0001' \ # '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \ # '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \ # '{0[website]}'.format(save_dict) # save_data.append(save_info) # time.sleep(0.1) # self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a') def _engine_residential_area(self): """获取小区数据""" citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID) types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID) current_params = deepcopy(setting.PARAMS) current_params['key'] = setting.KEY # 每种类型 for each_type in types: typeinfo = each_type.strip().split('\u0001') type_id = typeinfo[0] # 类型id type_large = typeinfo[1] # 类型大分类 type_middle = typeinfo[2] # 类型中分类 type_small = typeinfo[3] # 类型小分类 current_params['types'] = type_id save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small) # 每个城市 for each_city in citys: cityinfo = each_city.strip().split('\u0001') province = cityinfo[0] # 省名 city_name = cityinfo[1] # 城市名 city_id = cityinfo[2] # 城市id current_params['city'] = city_id current_params['page'] = 0 save_data = [] while True: current_params['page'] += 1 content = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params, retry=2, timeout=30) try: con = re.search(re.compile(r'<response>(.*?)</response>', re.S), content).group(1) pois_list = self.analysis.analysis_by_xpath(con, xpahter=setting.XPATH_POIS) except: continue if not pois_list: break for each_poi in pois_list: """ 字段说明: id: 唯一ID, name: 名称, pcode: poi所在省份编码, pname: poi所在省份名称,citycode: 城市编码, cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址, alias: 别名, biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目, distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度, exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, indoor_map: 是否有室内地图标志, location: 经纬度, 
navi_poiid: 地图编号, photos: 照片相关信息, postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, website: 该POI的网址 """ save_dict = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_DETAIL) photos = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_PHOTOS) photo_info = '' for each_photo in photos: photo_dict = self.analysis.analysis_by_xpath(each_photo, xpahter=setting.XPATH_PHOTO_DETAIL) photo_dict['title'] = photo_dict['title'] if photo_dict['title'] else 'no_title' photo_info += '{0[title]}-{0[url]},'.format(photo_dict) save_dict['photo_info'] = photo_info # 存储字段类型 # id, name, pcode, pname, citycode, cityname, adcode, adname, # address, alias, biz_type, business_area, discount_num, email, # entr_location, exit_location, gridcode, groupbuy_num, indoor_data, # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website, save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \ '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \ '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \ '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \ '{0[exit_location]}\u0001' \ '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \ '{0[indoor_map]}\u0001' \ '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \ '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \ '{0[website]}'.format(save_dict) save_data.append(save_info) time.sleep(5) self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a') @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass} proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def run_engine(self): self._engine_residential_area()
# encoding=utf-8
from crawl import Crawl

# Read the user id, then reset the result file
useridFile = open("userid.txt", 'r')
userid = useridFile.read().strip()
useridFile.close()
open("result.txt", 'w').close()
c = Crawl()
print "Job Started ...\n"
page = 1
url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)
while c.run(url):
    print "fans in page " + str(page) + "\n"
    page += 1
    url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)
print "Done!\n"
class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy()) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath( each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_tactic_link(self): """ 获取每个城市中所有的攻略的链接 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) tactic_check = self.pipe.pipe_pickle_load( filename=setting.FILE_TACTIC_CHECK) if not tactic_check: tactic_check = set([]) for each_city in city_list: """ http://travel.qunar.com/travelbook/list/22-城市拼音-城市id/ hot(hot为热门游记,elite为精华游记,start为行程计划)_ctime(ctime为按最新发表排序,heat为热度排序)/页码.htm """ try: url = each_city.strip().split('\u0001')[1] name = each_city.strip().split('\u0001')[0] pattern = re.compile(r'p-cs(\d+)-(\w+)') city_pname = re.search(pattern, url).group(2) city_id = re.search(pattern, url).group(1) # 拼接攻略所在url(1.城市拼音名称:city_pname, 2.城市id:city_id, 3.分类) tactic_type = ['hot', 'elite', 'start'] # 攻略分类,目前脚本先抓取hot类 tactic_url = setting.TACTIC_URL.format(city_pname, city_id, tactic_type[0]) current_page = 0 maxpage = 200 # 默认最大页数 while True: save_list = [] current_page += 1 content = self.crawl.crawl_by_get( tactic_url + '{}.htm'.format(current_page), headers=setting.HEADERS, retry=2, timeout=15, proxies=self._engine_use_proxy()) if not content: break # 获取总页数 if current_page == 1: # 找到最大页数,使用map函数 pagecount = map( lambda x: int(x) if x != '下一页>' else -1, self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_NEXTPAGE)) try: maxpage = max(pagecount) except: break tactic_ids = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_ID) for each_id in tactic_ids: each_url = 'http://travel.qunar.com/youji/{}'.format( each_id) save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format( name, city_pname, city_id, each_id, each_url) if each_id not in tactic_check: save_list.append(save_info) tactic_check.add(each_id) if save_list: self.pipe.pipe_txt_save( save_list, filename=setting.FILE_TACTIC_LIST, savetype='a') if current_page >= maxpage: break time.sleep(0.2) except: continue def _engine_tactic_info(self): """ 获取所有攻略详细数据 :return: """ tactic_list = self.pipe.pipe_txt_load( filename=setting.FILE_TACTIC_LIST) for each_tactic in tactic_list: try: # 攻略数据 tactic_info = each_tactic.strip().split('\u0001') city_name = tactic_info[0] city_pname = tactic_info[1] city_id = tactic_info[2] tactic_id = tactic_info[3] tactic_url = tactic_info[4] # 获取娱乐场所详细信息 content = self.crawl.crawl_by_get( tactic_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=3, timeout=15) detail = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_TACTIC_DETAIL) detail['city_name'] = city_name detail['city_pname'] = city_pname detail['city_id'] = city_id detail['tactic_id'] = tactic_id detail['tactic_url'] = tactic_url detail['get_time'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') # 存储数据 # 字段顺序 # city_name, city_pname, city_id, # tactic_id,title,author, # create_date,start_date,days, # avgs_price,person,play_type, # 
content,get_time, tactic_url save_data = '{0[city_name]}\u0001{0[city_pname]}\u0001{0[city_id]}\u0001' \ '{0[tactic_id]}\u0001{0[title]}\u0001{0[author]}\u0001' \ '{0[create_date]}\u0001{0[start_date]}\u0001{0[days]}\u0001' \ '{0[avgs_price]}\u0001{0[person]}\u0001{0[play_type]}\u0001' \ '{0[content]}\u0001{0[get_time]}\u0001{0[tactic_url]}\u0001'.format(detail) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_TACTIC_INFO, savetype='a') # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info') time.sleep(0.1) except Exception as e: print('crawl error', e) continue def _engine_tactic_comments(self): """ 获取所有攻略评论数据 :return: """ tactic_list = self.pipe.pipe_txt_load( filename=setting.FILE_TACTIC_LIST) # 每个店铺最新评论时间表 for each_tactic in tactic_list: try: # 店铺数据 each_info = each_tactic.strip().split('\u0001') city_name = each_info[0] city_pname = each_info[1] city_id = each_info[2] tactic_id = each_info[3] tactic_url = each_info[4] setting.HEADERS_COMMENTS['Referer'] = tactic_url params = { 'bookId': tactic_id, # 攻略id 'csrfToken': 'o7mGNaK63wbEaYFJTnDue14WX7sPlyXB', # 暂时固定token 'page': 0, # 页码 'pageSize': 30, # 每页数量 } while True: params['page'] += 1 content = self.crawl.crawl_by_get( setting.COMMENTS_API, headers=setting.HEADERS_COMMENTS, proxies=self._engine_use_proxy(), params=params, retry=2, timeout=15) try: content_dict = json.loads(content) except: break if not content_dict.get('data', {}).get('html'): break content_comments = content_dict.get('data', {}).get('html') # 第一遍抓取要确定评论页数 elements_com = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: ask_content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ASK_CONTENT) answer_content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ANSWER_CONTENT) ask_date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ASK_DATE) answer_date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ANSWER_DATE) commetents_info = { 'city_name': city_name, 'city_id': city_id, 'tactic_id': tactic_id, 'ask_content': ask_content, 'answer_content': answer_content, 'ask_date': ask_date, 'answer_date': answer_date, 'get_time': datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'tactic_url': tactic_url } for eachkey in commetents_info.keys(): if isinstance(commetents_info[eachkey], str): commetents_info[eachkey] = commetents_info[eachkey]\ .replace('\n', '').replace('\r', '').replace('\xa0', '') elif isinstance(commetents_info[eachkey], list): commetents_info[eachkey] = ''.join(commetents_info[eachkey])\ .replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序 # city_name, city_id, tactic_id, # ask_content, answer_content, ask_date, # answer_date, get_time, tactic_url, save_data = '{0[city_name]}\u0001{0[city_id]}\u0001{0[tactic_id]}\u0001' \ '{0[ask_content]}\u0001{0[answer_content]}\u0001{0[ask_date]}\u0001' \ '{0[answer_date]}\u0001{0[get_time]}\u0001' \ '{0[tactic_url]}\u0001'.format(commetents_info) self.pipe.pipe_txt_save( save_data, filename=setting.FILE_TACTIC_COMMENTS, savetype='a') # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments') except: continue def _temp_city_info(self, cityname): """ 做22项数据处理时临时用 :return: """ citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt') city_params = { '国别': '&', '省自治区全称': '&', '省自治区简称': '&', '市州全称': '&', '市州简称': '&', '区县全称': 
'&', '区县简称': '&', '地区编码': '&', '等级': '&' } spec_city = { '北京': '110000', '天津': '120000', '上海': '310000', '重庆': '500000' } for each in citylist: cityinfo = each.split('\u0001') if cityname in cityinfo: site = cityinfo.index(cityname) if site == 4 or site == 5: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['区县全称'] = cityinfo[4].strip() city_params['区县简称'] = cityinfo[5].strip() city_params['地区编码'] = cityinfo[-1].strip() city_params['等级'] = '区县级' elif site == 2 or site == 3: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00' city_params['等级'] = '地市级' elif cityname in ['北京', '重庆', '上海', '天津']: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityname + '市' city_params['省自治区简称'] = cityname city_params['市州全称'] = cityname + '市' city_params['市州简称'] = cityname city_params['地区编码'] = spec_city[cityname] city_params['等级'] = '直辖' break return city_params @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass } proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def start_engine(self): self._engine_city_link() # 本版块循环策略为循环抓取攻略,然后评论每次抓取一次攻略列表之后,抓取一遍所有攻略所有评论,并入存入新的文本 self._engine_tactic_link() self._engine_tactic_info() self._engine_tactic_comments()
class EngineSelenium: def __init__(self, name): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() self.options = webdriver.ChromeOptions() # 指定下载位置 prefs = { 'profile.default_content_settings.popups': 0, 'download.default_directory': os.path.abspath('DATA') } self.options.add_experimental_option('prefs', prefs) self.driver = webdriver.Chrome(chrome_options=self.options) self.realname = name self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper() def _engine_get_trend(self): """ 趋势研究板块数据 :return: """ # 获取数据对应的标题 url = 'http://index.baidu.com/?tpl=trend&word={}'.format(self.name) self.driver.get(url) # 等待页面跳转 time.sleep(5) content_page = self.driver.page_source page_str_date = self.analysis.analysis_by_xpath( content_page, xpahter= "substring-after(.//*[text()='搜索指数概况']/parent::div//*[@class='compInfo'][2], '至')" ) end_date = page_str_date.strip() element_div_all = self.analysis.analysis_by_xpath( content_page, xpahter=setting.XPATH_DIV) element_dict = { "近7天": element_div_all[0:6], "近30天": element_div_all[6::] } for element_name, element_div in element_dict.items(): ele_title = element_div[0:3] # 前3个element为标题 ele_content = element_div[3:6] # 后3个element为图片数据 for i in range(3): if i == 0: value_pic = {} title = self.analysis.analysis_by_xpath( ele_title[i], xpahter=setting.XPATH_TITLE) element_pic = self.analysis.analysis_by_xpath( ele_content[i], xpahter=".//span[starts-with(@class, 'ftlwhf')]") # ===========图片操作 pic_url = self.analysis.analysis_by_xpath( ele_content[i], xpahter=setting.XPATH_PIC) if pic_url: downurl = ''.join(pic_url) try: url = 'http://index.baidu.com' + re.search( re.compile(r'url\("(.*?)"\)', re.S), downurl).group(1) except: url = '' # 去访问图片的下载链接 url_real = url.replace('amp;', '') self.driver.get(url_real) time.sleep(1) # 读取下载的图片并用java那边提供的接口识别内容 pic_code = self.pipe.pipe_pic_load(filename='下载') # 删除该图片 self.pipe.pipe_pic_del(filename='下载') # ===========图片操作 n = 1 titles = list( map(lambda x: x.replace(' ', ''), title.split('|'))) for each in element_pic: pic_info = self.analysis.analysis_by_xpath( each, xpahter=".//span[@class='imgval']") res_pic = [] for each_info in pic_info: imgval = self.analysis.analysis_by_xpath( each_info, xpahter="@style") imgtxt = self.analysis.analysis_by_xpath( each_info, xpahter=".//*[@class='imgtxt']/@style") pic_px = '{},{}'.format( self._engine_tool_regex(imgval), self._engine_tool_regex(imgtxt)) res_pic.append( pic_px.replace('px', '').replace('-', '')) value_pic[titles[n - 1]] = ';'.join(res_pic) n += 1 # 图片识别完输出数据,此处图片二进制文件进行base64处理 for pic_name, pic_px in value_pic.items(): data = { 'data': base64.b64encode(pic_code), 'num1': pic_px, 'type': 'm' } pic_value = self.crawl.crawl_by_post( url=setting.RECOGNITION_URL, data=data) print(end_date, element_name, pic_name, pic_value) save_data = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format( self.realname, end_date, element_name, pic_name, pic_value) self.pipe.pipe_txt_save( save_data, filename=setting.FILE_TREND_ZSGK.format( self.realname, end_date), savetype='a') else: title = self.analysis.analysis_by_xpath( ele_title[i], xpahter=setting.XPATH_TITLE) titles = title.replace(' ', '').split('|') element_pic = self.analysis.analysis_by_xpath( ele_content[i], xpahter=".//span[starts-with(@class, 'ftlwhf')]") pics = [] n = 1 for each in element_pic: syboml = self.analysis.analysis_by_xpath( each, xpahter=".//*[starts-with(@class,'rat')]/text()") pic_info = list( map( self._engine_tool_regex, self.analysis.analysis_by_xpath( each, xpahter=".//*/i/@style"))) pic_px 
= list( map( lambda x: int( x.replace('-', '').replace('px', '')), pic_info)) pic_value = ''.join( list( map( lambda x: '{:.0f}'.format(x / 8) if x != 80 else '%', pic_px))) value = ''.join(syboml) + pic_value pics.append(value) n += 1 # 可以直接输出的数据 current_pic = dict(zip(titles, pics)) for pic_name, pic_value in current_pic.items(): print(end_date, element_name, pic_name, pic_value) save_data = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format( self.realname, end_date, element_name, pic_name, pic_value) self.pipe.pipe_txt_save( save_data, filename=setting.FILE_TREND_ZSGK.format( self.realname, end_date), savetype='a') # 搜索指数趋势 content_page = self.driver.page_source # 获取res 和 res2 pattern_res = re.compile(r'res=(.*?)&', re.S) pattern_res2 = re.compile(r'res2=(.*?)&', re.S) res = re.search(pattern_res, content_page).group(1) res2 = re.search(pattern_res2, content_page).group(1) page_str_date = self.analysis.analysis_by_xpath( content_page, xpahter= "substring-after(.//*[text()='搜索指数趋势']/parent::div//*[@class='compInfo'][2], '至')" ) page_date = datetime.datetime.strptime(page_str_date.strip(), '%Y-%m-%d') # 此处调节日期 startdate = (page_date - datetime.timedelta(days=29)).strftime('%Y-%m-%d') enddate = page_date.strftime('%Y-%m-%d') # 构造url用于获取res3参数(res3参数需要去请求才能得到) url_res3 = 'http://index.baidu.com/Interface/Search/getAllIndex' \ '/?res={}&res2={}&startdate={}&enddate={}'.format(res, res2, startdate, enddate) self.driver.get(url_res3) time.sleep(2) content_res3 = self.driver.page_source # 返回的数据有整体趋势 pc趋势 移动趋势 pattern_res3 = re.compile(r'<body>(.*?)</body>', re.S) res3 = re.search(pattern_res3, content_res3).group(1) # 取3种趋势的对应参数 res3_dict = json.loads(res3) res3_data = res3_dict.get('data') if not res3_data: print('未能获取搜索指数趋势res3数据') return # all 整体趋势 pc pc趋势 wise 移动趋势 try: data_dict = { '整体趋势': res3_data.get('all')[0].get('userIndexes_enc'), 'pc趋势': res3_data.get('pc')[0].get('userIndexes_enc'), '移动趋势': res3_data.get('wise')[0].get('userIndexes_enc') } except Exception as e: data_dict = {} print('获取对应res3数据出错:{}'.format(e)) for name, current_res3 in data_dict.items(): res3_list = current_res3.split(',')[::-1] timenow = int(time.time() * 1000) n = 0 for each_res3 in res3_list: if n >= 7: break trend_pic = {} # 当前日期 current_date = ( page_date - datetime.timedelta(days=n)).strftime('%Y-%m-%d') url_trend_pic = 'http://index.baidu.com/Interface/IndexShow/show/?res={}&res2={}&classType=1&' \ 'res3[]={}&className=view-value&{}'.format(res, res2, each_res3, timenow) self.driver.get(url_trend_pic) # 切换到图片所在页面并等待一下 time.sleep(1) content_each_pic = self.driver.page_source # 获取对应图片展示的html code = re.search(re.compile(r'"code":\[(.*?)\]', re.S), content_each_pic).group(1) deal_code = code.replace('\\', '').replace('"', '').replace( '<', '<').replace('>', '>')[1:-1] # 获取对应图片的下载链接 url_current_pic = 'http://index.baidu.com' + re.search( re.compile(r'url\("(.*?)"\)', re.S), deal_code).group(1) # 访问以下url将会下载图片 url_img = url_current_pic.replace('amp;', '') # 下载图片 self.driver.get(url_img) time.sleep(0.5) # 读取下载的图片并用java那边提供的接口识别内容 pic_code = self.pipe.pipe_pic_load(filename='下载') # 图片有可能下载失败, 后期这里可能需要调整 if not pic_code: return # 删除该图片 self.pipe.pipe_pic_del(filename='下载') # ==============等待处理这张图片 element_span = self.analysis.analysis_by_xpath( deal_code, xpahter=".//*/span") res_pic = [] for each in element_span: pic_info = self.analysis.analysis_by_xpath( each, xpahter=".//@style") pic_px = '{},{}'.format( self._engine_tool_regex(pic_info[0]), self._engine_tool_regex(pic_info[1])) res_pic.append(pic_px.replace('px', 
                # Record the date and series name for this data point
                trend_pic['date'] = current_date
                trend_pic['name'] = name
                data = {
                    'data': base64.b64encode(pic_code),
                    'num1': ';'.join(res_pic),
                    'type': 'm'
                }
                pic_value = self.crawl.crawl_by_post(
                    url=setting.RECOGNITION_URL, data=data)
                # Output the data
                print(current_date, name, pic_value)
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    self.realname, current_date, name, pic_value)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_TREND_ZSQS.format(
                        self.realname, enddate),
                    savetype='a')
                n += 1

    def _engine_get_demand(self):
        """
        Data for the "demand map" (需求图谱) section
        :return:
        """
        url = 'http://index.baidu.com/?tpl=demand&word={}'.format(self.name)
        self.driver.get(url)
        time.sleep(6)
        content_page = self.driver.page_source
        # Demand-map data
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter="substring-after(.//*[text()='需求图谱']/parent::div//*[@class='compInfo'][2], '至')")
        end_date = page_str_date.strip()
        element_demand = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=".//*[@id='demand']//*[contains(@style,"
                    "'text-anchor: middle')and not(contains(@fill,"
                    "'#9a9a9a'))]")
        for each_demand in element_demand:
            text = self.analysis.analysis_by_xpath(
                each_demand, xpahter='.//descendant::text()')
            value_x = self.analysis.analysis_by_xpath(
                each_demand, xpahter='.//@x')
            value_y = self.analysis.analysis_by_xpath(
                each_demand, xpahter='.//@y')
            value_dy = self.analysis.analysis_by_xpath(
                each_demand, xpahter='.//tspan/@dy')
            if text:
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    ''.join(text), ''.join(value_x), ''.join(value_y),
                    ''.join(value_dy))
                print('{}_{}_{}_{}'.format(''.join(text), ''.join(value_x),
                                           ''.join(value_y), ''.join(value_dy)))
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XQTP.format(
                        self.realname, end_date),
                    savetype='a')
        # Related-word classification data (相关词分类)
        element_tab = self.analysis.analysis_by_xpath(
            content_page, xpahter=".//*[@id='tablelist']//*[@class='listN1']")
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter="substring-after(.//*[text()='相关词分类']/parent::div//*[@class='compInfo'][2], '至')")
        enddate = page_str_date.strip()
        # Left-hand tables (source related words)
        for i in range(0, 2):
            th = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//descendant::th/text()")
            title = ''.join(th)
            trs = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//*[@class='rank']/parent::tr")
            for each_tr in trs:
                rank = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='rank']/text()")
                words = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='hotWord']/text()")
                style = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='psBar']/@style")
                width = re.search(re.compile(r'width:(.*?);', re.S),
                                  ''.join(style)).group(1)
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    title, ''.join(rank), ''.join(words), width.strip())
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XGC.format(
                        self.realname, enddate))
        # Right-hand tables (search index, fastest rising)
        for i in range(2, 4):
            th = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//descendant::th/text()")
            title = ''.join(th)
            trs = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//*[@class='rank']/parent::tr")
            for each_tr in trs:
                rank = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='rank']/text()")
                words = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='hotWord']/text()")
                num = self.analysis.analysis_by_xpath(
                    each_tr, xpahter="string(.//td[last()])")
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    title, ''.join(rank), ''.join(words), num.strip())
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XGC.format(
                        self.realname, enddate))

    def _engine_get_sentiment(self):
        """
        Data for the "news attention" (资讯关注) section
        :return:
        """
        url = 'http://index.baidu.com/?tpl=trend&word={}'.format(self.name)
        self.driver.get(url)
        # News monitoring data
        time.sleep(2)
        content_page = self.driver.page_source
        # Extract res and res2
        pattern_res = re.compile(r'res=(.*?)&', re.S)
        pattern_res2 = re.compile(r'res2=(.*?)&', re.S)
        res = re.search(pattern_res, content_page).group(1)
        res2 = re.search(pattern_res2, content_page).group(1)
        # res/res2 are used to call the getPcFeedIndex interface; they come from
        # the trend page here, which does not affect the results
        # News-feed index API
        api_info = 'http://index.baidu.com/Interface/search/getPcFeedIndex/?res={}&res2={}&type=feed'.format(
            res, res2)
        # Media (news) index API
        api_news = 'http://index.baidu.com/Interface/search/getNews/?res={}&res2={}&type=search'.format(
            res, res2)
        api_dict = {'资讯指数': api_info, '媒体指数': api_news}
        for api_name, api_url in api_dict.items():
            self.driver.get(api_url)
            content_data = self.driver.page_source
            # uniqid is needed to request the decoding key, and the key must be
            # fetched within roughly 10-20 seconds or it expires
            uniqid = re.search(re.compile(r'"uniqid":"(.*?)"', re.S),
                               content_data).group(1)
            # All of the data (the "all data" view on the page); trim later if needed
            userindexs = re.search(re.compile(r'"userIndexes":"(.*?)"', re.S),
                                   content_data).group(1)
            # Date range of the current data
            data_date = re.search(re.compile(r'"period":"\d+\|(\d+)"', re.S),
                                  content_data).group(1)
            # Current search keyword
            name = re.search(re.compile(r'"key":"(.*?)",', re.S),
                             content_data).group(1)
            # Use uniqid to request the decoding key from this endpoint
            url_ptbk = 'http://index.baidu.com/Interface/api/ptbk?res={}&res2={}&uniqid={}'.format(
                res, res2, uniqid)
            self.driver.get(url_ptbk)
            content_pasw = self.driver.page_source
            # Extract the decoding key
            pasw = re.search(re.compile(r'"data":"(.*?)"', re.S),
                             content_pasw).group(1)
            # Build the substitution dict: the first half of the key maps onto the
            # second half, and the key whose value is ',' is the record separator
            pasw_key = pasw[0:int(len(pasw) / 2)]
            pasw_value = pasw[int(len(pasw) / 2):]
            pasw_dict = dict(zip(pasw_key, pasw_value))
            # Split the data on the separator character
            for k, v in pasw_dict.items():
                if v == ',':
                    data_list = userindexs.split(k)
                    break
            # Decode the data
            n = 1
            print(api_name)
            for each_data in data_list:
                current_time = (
                    datetime.datetime.strptime(data_date, '%Y%m%d') -
                    datetime.timedelta(days=len(data_list) - n)).strftime('%Y-%m-%d')
                each_value = ''
                for i in each_data:
                    each_value += pasw_dict[i]
                # current_time is the date, each_value the corresponding count
                save_data = '{}\u0001{}\u0001{}'.format(
                    api_name, current_time, each_value)
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_SENTIMENT_XWZS.format(
                        self.realname, data_date),
                    savetype='a')
                n += 1
            time.sleep(2)
        # News list at the bottom of the page
        url_news = 'http://index.baidu.com/?tpl=sentiment&word={}'.format(
            self.name)
        self.driver.get(url_news)
        time.sleep(6)
        content_news = self.driver.page_source
        # Taken directly from the page
        element_a = self.analysis.analysis_by_xpath(
            content_news,
            xpahter=".//*[starts-with(@class,'stmNews')]"
                    "//*[@class='listN1']//*[starts-with(@class,"
                    "'mhref')]/a")
        # News links and titles currently shown on the page
        for each_ele in element_a:
            title = self.analysis.analysis_by_xpath(each_ele, xpahter=".//@title")
            href = self.analysis.analysis_by_xpath(each_ele, xpahter=".//@href")
            save_data = '{}\u0001{}'.format(''.join(title), ''.join(href))
            print(save_data)
            self.pipe.pipe_txt_save(
                save_data,
                filename=setting.FILE_SENTIMENT_NEWS.format(
                    self.realname, data_date),
                savetype='a')

    def _engine_get_crowd(self):
        """
        Data for the "audience profile" (人群画像) section
        :return:
        """
        url = 'http://index.baidu.com/?tpl=crowd&word={}'.format(self.name)
        self.driver.get(url)
        time.sleep(6)
        content_page = self.driver.page_source
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter="substring-after(.//*[text()='地域分布']/parent::div//*[@class='compInfo'][2], '至')")
"substring-after(.//*[text()='地域分布']/parent::div//*[@class='compInfo'][2], '至')" ) end_date = page_str_date.strip() # 地域分布 for name in ['省份', '城市', '区域']: element = self.driver.find_element_by_xpath( ".//*[text()='{}']".format(name)) element.click() time.sleep(2) content_page = self.driver.page_source ele_trs = self.analysis.analysis_by_xpath( content_page, xpahter= ".//*[@class='tang-scrollpanel-content']//*[starts-with(@class,'items')]/descendant::tr" ) # 区域只有7个,后面的数据为城市的 if name == '区域': ele_trs = ele_trs[0:7] for each_tr in ele_trs: rank = self.analysis.analysis_by_xpath( each_tr, xpahter=".//*[@class='scRank']/text()") cityname = self.analysis.analysis_by_xpath( each_tr, xpahter=".//*[@class='scName']/text()") style = self.analysis.analysis_by_xpath( each_tr, xpahter=".//*[@class='zbar'][1]/@style") width = re.search(re.compile(r'width:(.*?);', re.S), ''.join(style)).group(1) save_data = '{}\u0001{}\u0001{}\u0001{}'.format( end_date, ''.join(rank), ''.join(cityname), ''.join(width)) print(save_data) self.pipe.pipe_txt_save( save_data, filename=setting.FILE_CROWD_DYFB.format( self.realname, end_date), savetype='a') # 人群属性 content_page = self.driver.page_source page_str_date = self.analysis.analysis_by_xpath( content_page, xpahter= "substring-after(.//*[text()='人群属性']/parent::div//*[@class='compInfo'][2], '至')" ) enddate = page_str_date.strip() # 年龄分布 age_height = self.analysis.analysis_by_xpath( content_page, xpahter=".//*[@id='grp_social_l']//*[" "@fill='#3ec7f5']/@height") # value = self.analysis.analysis_by_xpath(content_page, xpahter=".//*[@id='grp_social_l']//*[starts-with(@style," # "'text-anchor: middle')]/descendant::text()") # 计算总数 total = reduce(lambda x, y: float(x) + float(y), age_height) # 计算每一个阶段的百分比 percent = list( map(lambda x: '{:.2f}%'.format((float(x) / total) * 100), age_height)) # 构造对应数据,这里把每个数据key写为固定的 age_dict = { '19岁及以下': percent[0], '20-29岁': percent[1], '30-39岁': percent[2], '40-49岁': percent[3], '50岁及以上': percent[4], } # 性别分布 sex_height = self.analysis.analysis_by_xpath( content_page, xpahter=".//*[@id='grp_social_r']//*[" "@fill='#3ec7f5']/@height") # 计算总数 total = reduce(lambda x, y: float(x) + float(y), sex_height) # 计算每一个阶段的百分比 percent = list( map(lambda x: '{:.2f}%'.format((float(x) / total) * 100), sex_height)) # 构造对应数据,这里把每个数据key写为固定的 sex_dict = {'男': percent[0], '女': percent[1]} save_data = [] for k, v in age_dict.items(): save_info = '{}\u0001年龄分布\u0001{}\u0001{}'.format(enddate, k, v) save_data.append(save_info) for k1, v1 in sex_dict.items(): save_info = '{}\u0001性别分布\u0001{}\u0001{}'.format(enddate, k1, v1) save_data.append(save_info) print(save_data) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_CROWD_RQSX.format( self.realname, enddate), savetype='a') def _engine_do_login(self): """ 登录处理 :return: """ login_url = 'http://index.baidu.com/' self.driver.get(login_url) element = self.driver.find_element_by_xpath(".//*[text()='登录']") element.click() time.sleep(5) element = self.driver.find_element_by_xpath( ".//*/input[@name='userName']") element.send_keys('daqbigdata') time.sleep(3) element = self.driver.find_element_by_xpath( ".//*/input[@name='password']") element.send_keys('daqsoft') time.sleep(1) element = self.driver.find_element_by_xpath( ".//*/input[@type='submit']") element.click() time.sleep(8) @staticmethod def _engine_tool_regex(str_data): """ 正则取数据 :return: """ if isinstance(str_data, list): deal_data = ''.join(str_data) else: deal_data = str_data try: return re.search(re.compile(r':([-]{0,1}\d+px)', re.S), deal_data).group(1) 
except: return def run_engine(self): # 先登录 self._engine_do_login() self._engine_get_trend() self._engine_get_demand() self._engine_get_sentiment() self._engine_get_crowd() # 最后关闭浏览器 self.driver.close()
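# A minimal, standalone sketch of the ptbk decoding step used in
# _engine_get_sentiment above: the first half of the key string maps
# character-for-character onto the second half, and the character whose
# replacement is ',' separates the daily values. The function name and the
# sample key/data below are hypothetical, for illustration only.
def decode_user_indexes(ptbk, user_indexes):
    half = len(ptbk) // 2
    mapping = dict(zip(ptbk[:half], ptbk[half:]))
    # The character that decodes to ',' acts as the record separator.
    separator = next(k for k, v in mapping.items() if v == ',')
    return [''.join(mapping[ch] for ch in chunk)
            for chunk in user_indexes.split(separator)]


# Example with a made-up key: 'a'->'1', 'b'->'2', 'c'->','.
# "abcab" therefore decodes to ['12', '12'].
print(decode_user_indexes('abc12,', 'abcab'))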
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        Collect every city name and URL; results are written to the
        file_city_list.txt output file.
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_scenic_link(self):
        """
        Collect the links of every popular scenic spot in each city.
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            url = each_city.strip().split('\u0001')[1] + '-jingdian'
            city_name = each_city.strip().split('\u0001')[0]
            content = self.crawl.crawl_by_get(url,
                                              headers=setting.HEADERS,
                                              proxies=self._engine_use_proxy(),
                                              retry=2,
                                              timeout=15)
            element_a = self.analysis.analysis_by_xpath(
                content, xpahter=setting.XPATH_HOT_A)
            save_list = []
            for each_ele in element_a:
                scenic_full_name = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_NAME)
                current_url = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_HREF)
                scenic_name = ''.join(scenic_full_name).replace('旅游攻略', '')
                scenic_url = ''.join(current_url)
                scenic_id = re.search(re.compile(r'p-oi(\d+)-'),
                                      scenic_url).group(1)
                # Saved fields:
                # city_name, scenic_id, scenic_name, scenic_url
                save_info = '{}\u0001{}\u0001{}\u0001{}'.format(
                    city_name, scenic_id, scenic_name, scenic_url)
                save_list.append(save_info)
            self.pipe.pipe_txt_save(save_list,
                                    filename=setting.FILE_SCENIC_LIST,
                                    savetype='a')

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration
        :return: proxy dict
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    def start_engine(self):
        self._engine_city_link()
        self._engine_scenic_link()
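# Hypothetical usage sketch for the Engine class above, assuming the setting
# module and the Crawl/Analysis/Pipeline helpers are importable as in the rest
# of the project: crawl the city list first, then the hot-scenic links.
if __name__ == '__main__':
    engine = Engine()
    engine.start_engine()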
# encoding=utf-8
from crawl import Crawl

c = Crawl()
url = "/345920104"
c.run(c.host + url)
print("Done!\n")
def __init__(self):
    Crawl.__init__(self, INIT_URL, SKIP_URL, USE_TOR)
    # Select the MongoDB collection
    self.mongo_collection = self.mongo_conn['cdiscount_product']
def run():
    crawler = Crawl()
    vips = crawler.all_come_to_bowl()
    print_vips(vips)