def process_item(self, item, spider):
    self.id = getattr(spider, "aid")
    self.userinfo = UserInfo.from_json(self.r.get(self.id))
    self.table_name = self.userinfo['table_name']
    # 'if_store_data' tracks whether the user has clicked the save button
    self.flag = str(self.userinfo['if_store_data'])
    print("My pipe flag is:", self.flag)
    if 'yes' not in self.flag:
        print('the user has not confirmed storage yet')
        self.preitem.append(item)
    else:
        if self.preitem:
            # first flush the items buffered while waiting for confirmation
            self.storeData()
            self.preitem = []
        # each item field holds a list of values; regroup them into one
        # dict per row before inserting
        data = dict(item)
        v_list_length = len(list(data.values())[0])
        keys = ','.join(data.keys())
        placeholders = ','.join(['%s'] * len(data))
        final_list = [{key: data[key][i] for key in data} for i in range(v_list_length)]
        sql = 'insert into %s(%s) values(%s)' % (self.table_name, keys, placeholders)
        try:
            for sub_dict in final_list:
                self.cursor.execute(sql, tuple(sub_dict.values()))
            self.conn.commit()
        except Exception as error:
            print('sql error:', error)
    return item
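# UserInfo is used throughout this section but never defined in it. From its
# usage (dict-style access plus from_json/to_json round-tripping through
# Redis), a minimal sketch might look like the following -- an assumption
# about the missing class, not the project's actual implementation:
import json

class UserInfo(dict):
    @classmethod
    def from_json(cls, raw):
        # raw comes back from Redis as bytes; json.loads accepts bytes directly
        return cls(json.loads(raw))

    def to_json(self):
        return json.dumps(self)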
def __init__(self, id='', **kwargs):
    super(conSpider, self).__init__(**kwargs)
    self.conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD,
                                db=DB, charset='utf8', port=PORT)
    self.cursor = self.conn.cursor()
    self.aid = id
    print("aid:", self.aid)
    self.r = redis.StrictRedis(host=RHOST, port=RPORT)
    self.userinfo = UserInfo.from_json(self.r.get(self.aid))
def close(spider, reason):
    logging.info('the spider is closing')
    userinfo = UserInfo.from_json(spider.r.get(spider.aid))
    # CloseSpider takes time to shut down, so distinguish a natural close
    # from a shutdown forced by the user
    if not spider.click_button_flag:
        userinfo['spider_state'] = 'nature_close'
        userinfo['error_msg'] = 'the spider has shut down'
    else:
        userinfo['spider_state'] = 'close'
    spider.r.set(spider.aid, userinfo.to_json())
    logging.info('aha, the spider closed')
def __init__(self, id='', **kwargs):
    super(ToolSpider, self).__init__(**kwargs)
    self.conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD,
                                db=DB, charset='utf8', port=PORT)
    self.r = redis.StrictRedis(host=RHOST, port=RPORT)
    self.cursor = self.conn.cursor()
    self.aid = id
    # set to True when the user forces a shutdown from the web UI
    self.click_button_flag = False
    # deduplicates URLs seen during the crawl
    self.bloom = BloomFilter(max_elements=100000, error_rate=0.05)
    self.userinfo = UserInfo.from_json(self.r.get(self.aid))
    self._getWebsitesInDB()
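# The bloom_filter package used above supports membership tests plus add();
# a spider would typically guard request scheduling with a helper like this.
# The method below is an illustrative sketch, not taken from the source:
def _is_new_url(self, url):
    if url in self.bloom:
        return False
    self.bloom.add(url)
    return True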
def process_item(self, item, spider):
    self.id = getattr(spider, "aid")
    self.userinfo = UserInfo.from_json(self.r.get(self.id))
    print('userinfo in redis pipeline:', self.userinfo)
    logging.debug('pipelines: start to push items to Redis')
    data = dict(item)
    # append each field's values to the result lists cached in Redis
    for k, v in data.items():
        data_stored = self.userinfo['crawling_result']
        data_ = data_stored.get(k, [])
        data_.append(v)
        data_stored[k] = data_
    self.r.set(self.id, self.userinfo.to_json())
    return item
def group(r, id):
    """
    Regroup the crawling results cached in Redis into [[...], ...],
    indexed by the user table's column names.
    :return: (column names, values per column, error message)
    """
    if r.get(id):
        userinfo = UserInfo.from_json(r.get(id))
    else:
        return None, None, 'in progress'
    error_msg = userinfo['error_msg']
    # look up the user's table columns so results can be indexed by field name
    key = []
    get_th = ("select COLUMN_NAME from information_schema.COLUMNS "
              "where table_name = '{}'".format(userinfo['table_name']))
    connect = con()
    with connect:
        cur = connect.cursor()
        cur.execute(get_th)
        th = cur.fetchall()
    for seg in th:
        key.append(seg[0])
    # mark which columns actually have crawled data
    totalNo = 0
    keyExists = np.zeros((len(key), 1))
    for i in np.arange(len(key)):
        t = userinfo['crawling_result'].get(key[i], None)
        if t:
            keyExists[i] = 1
            totalNo += 1
    if totalNo == 0:
        return None, None, error_msg
    index = np.nonzero(keyExists)[0]
    key_defined = np.array(key)[index]
    dis = [list(map(str, userinfo['crawling_result'].get(k))) for k in key_defined]
    return key_defined.tolist(), dis, error_msg
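# con() is called above but not defined in this section. Given the connection
# settings used by the spiders, it presumably hands back a fresh PyMySQL
# connection; a plausible sketch, assuming the same HOST/USER/PASSWD/DB/PORT
# constants:
def con():
    return pymysql.connect(host=HOST, user=USER, passwd=PASSWD,
                           db=DB, charset='utf8', port=PORT)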
def process_item(self, item, spider):
    self.id = getattr(spider, "aid")
    logging.info('the id in pipeline: {}'.format(self.id))
    self.userinfo = UserInfo.from_json(self.r.get(self.id))
    self.table_name = self.userinfo['table_name']
    self.flag = str(self.userinfo['if_store_data'])
    logging.info('the flag in pipelines: %s' % self.flag)
    if 'yes' not in self.flag:
        logging.debug('the user has not clicked the save button yet')
        self.preitem.append(item)
    else:
        if self.preitem:
            logging.debug('first restore the buffered data to the database')
            self.storeData()
            self.preitem = []
        logging.debug('pipelines: start to process items')
        # build one parameterized INSERT per item
        params = []
        values = []
        for k in item.fields:
            params.append('%s')
            values.append(item[k])
        sql = 'insert into %s(%s) values(%s)' % (
            self.table_name, ','.join(item.fields.keys()), ','.join(params))
        try:
            self.cursor.execute(sql, tuple(values))
            self.conn.commit()
        except Exception as error:
            print('sql error:', error)
    return item
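# storeData() is referenced by both pipelines but not defined in this section.
# Given how process_item builds its INSERT, a sketch that flushes the buffered
# items could look like this -- an assumption about the missing helper:
def storeData(self):
    for item in self.preitem:
        data = dict(item)
        sql = 'insert into %s(%s) values(%s)' % (
            self.table_name, ','.join(data.keys()), ','.join(['%s'] * len(data)))
        try:
            self.cursor.execute(sql, tuple(data.values()))
        except Exception as error:
            print('sql error while restoring buffered items:', error)
    self.conn.commit()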
def crawling():
    # whether to store the results
    is_save = request.form.get('save')
    # user id (a random number)
    id = request.form.get('append')
    userinfo = UserInfo.from_json(r.get(id))
    if is_save:
        userinfo['if_store_data'] = is_save
        r.set(id, userinfo.to_json())
    else:
        is_save = userinfo['if_store_data']
    logging.info('whether the user chose to store the results: %s' % is_save)
    if is_save:
        if "sw" in str(is_save):
            # the 'sw' value asks to restart the crawl in the slow selenium mode
            logging.info(is_save)
            userinfo['crawling_result'] = {}
            userinfo['spider_state'] = 'open'
            userinfo['if_store_data'] = 'start'
            userinfo['error_msg'] = 'crawling in progress'
            r.set(id, userinfo.to_json())
            time.sleep(2)
            logging.info('userinfo in crawling: %s'
                         % UserInfo.from_json(r.get(id))['crawling_result'])
            po = Pool(10)
            logging.info('spider_state: %s' % UserInfo.from_json(r.get(id))['spider_state'])
            time.sleep(5)
            po.apply_async(scrapyprocess, (id, "selenium",))
        if 'no' == str(is_save):
            userinfo['crawling_result'] = {}
            userinfo['spider_state'] = 'close'
            userinfo['if_store_data'] = 'no'
            userinfo['error_msg'] = 'the spider has shut down'
            r.set(id, userinfo.to_json())
    # handle the case where the spider finished before the user clicked save
    userinfo = UserInfo.from_json(r.get(id))
    print('userinfo_in_crawling:', userinfo)
    if userinfo['spider_state'] == 'nature_close' and 'yes' in str(is_save):
        logging.debug('views: spider already closed, store the cached results')
        storeData(userinfo)
        logging.info('delete the cache after the data is saved')
        r.delete(id)
    if userinfo['spider_state'] == 'nature_close' and str(is_save) == 'no':
        logging.info('the user clicked "stop and discard"; delete the cache')
        r.delete(id)
    return redirect(url_for('get_results', id=id))
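# scrapyprocess(id, mode) is dispatched to a multiprocessing Pool above but
# is not defined in this section. It presumably starts the spider matching
# the chosen mode in its own process; a hedged sketch, assuming the two
# spider classes shown earlier map to the 'Tool' and 'content' modes (the
# selenium spider is not shown in the source):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def scrapyprocess(id, mode):
    spiders = {'Tool': ToolSpider, 'content': conSpider}
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders.get(mode, ToolSpider), id=id)
    process.start()  # blocks until the crawl finishes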
def scrapy():
    form = SubForm()
    default_form = DefaultForm()
    if not (form.validate_on_submit() or default_form.validate_on_submit()):
        return redirect(url_for('root'))
    result = 0
    default_crawl_flag = 0
    way = request.form.get("cus")
    table_name = request.form.get('table_name')
    select = request.form.get("option")
    page_num = request.form.get("page")
    selenium_page = request.form.get("selenium_num")
    url_list_xpath = request.form.get("list_xpath")
    append_idt = request.form.get("append")
    input_xpath = request.form.get("input_xpath")
    website = form.website.data
    # mode: 1. Tool (quick three-layer mode)  2. content (slow two-layer mode)
    #       3. selenium (slow three-layer mode)
    model = "Tool"
    if select == "two":
        model = "content"
    fields_xpath_list = []
    if way == "nodefault":
        for i in range(33):
            tb = "tb" + str(i)
            fields_xpath_list.append(request.form.get(tb))
        for xpath in fields_xpath_list:
            result = result or xpath
        if not result:
            return 'no class or id attribute was filled in'
    else:
        website = request.form.get('website_url')
        default_crawl_flag = 1
    user = UserInfo()
    user['user_id'] = append_idt
    user['website'] = website
    user['table_name'] = table_name
    user['default_crawl_flag'] = default_crawl_flag
    user['two_or_three_mode'] = model
    user['fields_xpath_list'] = fields_xpath_list
    # page-number input, only used by the two-layer crawl
    user['page_num'] = page_num
    # whether to store the data
    user['if_store_data'] = 'start'
    user['error_msg'] = 'crawling in progress'
    user['url_list_xpath'] = url_list_xpath
    user['page_num_xpath_list'] = input_xpath
    # page count used after the three-layer crawl switches from the default
    # quick mode to the slow selenium mode
    user['selenium_page'] = selenium_page
    # holds the crawling results
    user['crawling_result'] = {}
    # default spider state
    user['spider_state'] = 'open'
    r.set(user['user_id'], user.to_json())
    # NOTE: the next lines were partially redacted in the source ('******');
    # this reconstruction assumes the original printed the record and then
    # launched the crawl in a worker pool, as crawling() above does
    print('user:', user)
    po = Pool(10)
    po.apply_async(scrapyprocess, (user['user_id'], user['two_or_three_mode'],))
    return render_template("middle.html", append_idt=append_idt)