def parse(self, response):
    # TODO: persistence does not belong in parse(); move it into a pipeline.
    # Took over an hour to sort out encode vs decode: decode() operates on
    # the raw byte stream (bytes -> str), encode() goes the other way.
    r = response.body.decode('utf-8')
    item = JiayuanUserItem(user_id_item=self.item_uid, user_data_item=str(r))
    yield item
    if self.start_id < self.stop_id:
        self.start_id += 1
        conn = sqlite3.connect(
            "E:/02 Python/01 crawl/jiayuanspider/04 user_data/jiayuan_m_user_list.db")
        cu = conn.cursor()
        # parameterized query instead of string concatenation
        cu.execute("select uid from m_user_table where id=?", (self.start_id,))
        uid = str(cu.fetchone()[0])  # the uid we need
        cu.close()
        conn.close()
        self.item_uid = uid  # stash the uid for the next response
        user_url = self.url + "/" + uid
        # issue the next request; note that this relies on sequential
        # crawling, since self.item_uid is shared instance state
        yield scrapy.Request(method='GET',
                             url=user_url,
                             callback=self.parse,
                             headers=self.headers,
                             cookies=self.cookie_in_dict)
    else:
        spider_log("url all yield.")
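# A minimal, standalone illustration of the encode/decode distinction the
# comment above refers to: decode() turns bytes into str, encode() turns
# str into bytes.
#
#   >>> "佳缘".encode('utf-8')
#   b'\xe4\xbd\xb3\xe7\xbc\x98'
#   >>> b'\xe4\xbd\xb3\xe7\xbc\x98'.decode('utf-8')
#   '佳缘'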
def parse(self, response):
    # get the raw response data (JSON wrapped in ##jiayser## markers)
    # and package it into an item
    re_list = re.findall(r"##jiayser##(.+?)##jiayser##",
                         response.body.decode('raw_unicode_escape'))
    # the encoding here is not confirmed yet; 'raw_unicode_escape' works for now
    js = json.loads(re_list[0])
    item = JiayuanListItem(user_list_item=js['userInfo'])
    spider_log("item yield.")
    yield item
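# A possible alternative to guessing the codec by hand: Scrapy's
# response.text decodes the body using the encoding declared by the
# response, so the manual 'raw_unicode_escape' step may be avoidable.
# A minimal sketch (extract_user_info is a hypothetical helper; it assumes
# the same ##jiayser## wrapper as parse() above):
def extract_user_info(response):
    match = re.search(r"##jiayser##(.+?)##jiayser##", response.text, re.S)
    return json.loads(match.group(1))['userInfo'] if match else None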
def process_item(self, item, spider):
    if spider.name == "user_list_spider":
        spider_log("user_list_spider item " + str(self.item_count) + " get.")
        list_raw_data = item.get("user_list_item")
        list_process(list_raw_data, self.db_path)
        self.item_count += 1  # was `= +1`, which kept resetting the counter to 1
    elif spider.name == "user_data_spider":
        tmp_id = item.get("user_id_item")
        tmp_data = item.get("user_data_item")
        user_raw_data = str(tmp_data)
        user_id = str(tmp_id)
        user_process(user_id, user_raw_data, self.db_path)
    return item  # pass the item on to any later pipeline stages
def open_spider(self, spider):
    time_stamp = current_time('file_name')
    if spider.name == "user_list_spider":
        spider_log("user_list_spider started.")
        # for debugging:
        #self.db_path = "E:/02 Python/01 crawl/jiayuanspider/database/" + time_stamp + " jiayuan_user_list.db"
        #self.db_path = "E:/02 Python/01 crawl/jiayuanspider/database/jiayuan_m_user_list.db"
        # NOTE: db_path is left unset on this branch, so process_item will
        # fail for user_list_spider unless it is assigned elsewhere
    elif spider.name == "user_data_spider":
        spider_log("user_data_spider started.")
        #self.db_path = "E:/02 Python/01 crawl/jiayuanspider/user_data/" + time_stamp + " jiayuan_user_data.db"
        self.db_path = "E:/02 Python/01 crawl/jiayuanspider/04 user_data/jiayuan_m_user_data.db"
    else:
        pass
def close_spider(self, spider):
    # close the database; gather crawl statistics and send a summary email
    conn = sqlite3.connect(self.db_path)
    cu = conn.cursor()
    cu.execute("select uid from m_user_table")
    data_list = cu.fetchall()
    count = len(data_list)
    spider_log("user_data_spider crawl user count : " + str(count))
    # different email content per spider
    email_content_dict = {}  # fallback, so send_email() never sees an undefined name
    if spider.name == "user_list_spider":
        '''
        email_content_dict = {
            'project_name': 'jiayuanspider',
            'crawler_name': spider.name,
            'list_name': 'm_user',
            'crawl num': count
        }
        '''
        pass
    elif spider.name == "user_data_spider":
        email_content_dict = {
            'project_name': 'jiayuanspider',
            'crawler_name': spider.name,
            'list_name': 'm_user',
            'id_range': '1-5000',
            'date': '8.8',
            'crawl num': count
        }
    cu.close()
    conn.commit()  # no writes happen here, only SELECTs; kept for symmetry
    conn.close()
    send_email(str(email_content_dict), self.db_path)
    spider_log(" ")
    spider_log("database closed.")
    spider_log("spider stopped.")
    # auto-shutdown once the run finishes
    #os.system('shutdown -s -t 60')
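# The crawl count could also come from Scrapy's built-in stats collector
# instead of re-querying the database. A minimal sketch using the standard
# 'item_scraped_count' stat key (crawled_item_count is a hypothetical
# helper, not part of the project):
def crawled_item_count(spider):
    return spider.crawler.stats.get_value('item_scraped_count', 0)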
def start_requests(self):
    # the fixed search endpoint
    url = "http://search.jiayuan.com/v2/search_v2.php"
    spider_log("user_list_spider start.")
    cookie_in_dict = get_cookie()  # the account may have a problem
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        'cache-control': "no-cache",
        'connection': "keep-alive",
        'host': "www.jiayuan.com",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    }
    # change params in this dict
    # t=148568&ft=off&f=select&mt=d
    params = {
        'f': "select",
        'jsversion': "v5",
        'listStyle': "bigPhoto",
        'p': "1",
        'sex': "m",
        'sn': "defualt",
        'stc': "27:1,1:99",  # recently registered
        'sv': "1"
    }
    # log the url
    spider_log("crawl url: " + url)
    # crawl strategy: walk the result pages (currently only page 1)
    for page in range(1, 2):
        params['p'] = str(page)
        #yield scrapy.FormRequest(url=url, callback=self.parse, formdata=params)
        yield scrapy.FormRequest(url=url,
                                 headers=headers,
                                 callback=self.parse,
                                 formdata=params,
                                 cookies=cookie_in_dict)
        spider_log("request page " + params['p'] + ", response yield.")
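# The static headers above could instead live on the spider class so that
# every request picks them up automatically. A minimal sketch using Scrapy's
# standard custom_settings mechanism (declared as a class attribute on the
# spider; shown here in isolation):
custom_settings = {
    'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) "
                  "Gecko/20100101 Firefox/52.0",
    'DEFAULT_REQUEST_HEADERS': {
        'accept-language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        'cache-control': "no-cache",
    },
}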
def list_process(list_raw_data, db_path):
    time_stamp = current_time('file_name')
    # TODO: could the database setup move into open_spider?
    # db_path = "E:/02 Python/01 crawl/jiayuanspider/database/" + time_stamp + " jiayuan_user_list.db"
    conn = sqlite3.connect(db_path)
    cu = conn.cursor()
    try:
        cu.execute(
            "CREATE TABLE m_user_table(id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "uid TEXT UNIQUE, nickname CHAR(50), age INTEGER, height INTEGER, "
            "education CHAR(50), work_location CHAR(50), image CHAR(100))")
        spider_log("database created, table name: m_user_table.")
    except sqlite3.OperationalError:
        spider_log("database existed.")
    user_list = list_raw_data
    #spider_log("page in item was parsed, page_first_uid is: " + str(user_list[0]['realUid']))
    for user in user_list:
        try:
            # parameterized insert; the old string concatenation broke on
            # nicknames containing quotes
            cu.execute(
                "insert into m_user_table(uid, nickname, age, height, "
                "education, work_location, image) values(?, ?, ?, ?, ?, ?, ?)",
                (str(user['realUid']), user['nickname'], str(user['age']),
                 str(user['height']), str(user['education']),
                 str(user['work_location']), str(user['image'])))
            spider_log("insert uid: " + str(user['realUid']))
        except sqlite3.Error:
            spider_log("insert error. uid: " + str(user['realUid']))
    cu.close()
    conn.commit()  # commit once after all inserts
    conn.close()
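# If the result pages grow, the row-by-row inserts above could be batched.
# A minimal sketch using sqlite3's executemany (bulk_insert_users is a
# hypothetical helper; it assumes the same m_user_table schema and the
# UNIQUE constraint on uid):
def bulk_insert_users(cu, user_list):
    rows = [(str(u['realUid']), u['nickname'], str(u['age']), str(u['height']),
             str(u['education']), str(u['work_location']), str(u['image']))
            for u in user_list]
    cu.executemany(
        "insert or ignore into m_user_table(uid, nickname, age, height, "
        "education, work_location, image) values(?, ?, ?, ?, ?, ?, ?)", rows)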
def user_process(user_id, user_raw_data, db_path):
    # containers for the data to extract
    # basic user info
    info = {
        'nickname': '默认昵称',  # default nickname
        'uid': '',
        'charm': '',
        'age': '',
        'marriage': '',
        'province': '',
        'city': '',
        'education': '',
        'height': '',
        'weight': '',
        'salary': '',
        'car': '',
        'house': '',
        'constellation': '',
        'minority': '',
        'zodiac': '',
        'blood_type': ''
    }
    # avatar data
    image = {'img_url': '', 'img_num': ''}
    # self introduction
    self_intro = "self_intro."
    # partner requirements
    demand = {
        'demand_age': '',
        'demand_height': '',
        'demand_minority': '',
        'demand_education': '',
        'demand_photo': '',
        'demand_marriage': '',
        'demand_location': '',
        'demand_sincerity': ''
    }
    '''
    # lifestyle (not collected yet)
    lifestyle = {
        'smoke': '', 'drink': '', 'exercise': '', 'eat': '', 'shop': '',
        'faith': '', 'time': '', 'circle': '', 'cost': ''
    }
    '''
    '''
    # economic info; already covered by the basic info above
    economic = {'salary': '', 'car': '', 'house': ''}
    '''
    # work info
    work = {
        'position': '',
        'industry': '',
        'university': '',
        'major': '',
        'language': ''
    }
    # family / marriage info
    marriage = {
        'origin': '',
        'residence': '',
        'nationality': '',
        'personality': '',
        'humor': '',
        'temper': '',
        'marriage_attitude': '',
        'kid': '',
        'marriage_time': '',
        'share_house': '',
        'parents': '',
        'relatives': ''
    }
    # maybe a class would be better than a list of dicts
    user = [info, image, self_intro, demand, work, marriage]

    # parse the data
    soup = BeautifulSoup(user_raw_data, "lxml")

    # flag users whose profile page cannot be viewed
    title_text = soup.title.text
    title_pattern = "世纪佳缘交友网:查看用户详细资料失败"  # "failed to view profile"
    if title_text == title_pattern:
        body_text = soup.body.text
        if re.findall(r"该会员已被加黑", body_text):
            user[0]['uid'] = user_id
            user[0]['nickname'] = "该会员已被加黑"  # member blacklisted
            spider_log("insert uid: " + user[0]['uid'] + " 该会员已被加黑")
        elif re.findall(r"该用户找到意中人", body_text):
            user[0]['uid'] = user_id
            user[0]['nickname'] = "该用户找到意中人"  # user found a match
            spider_log("insert uid: " + user[0]['uid'] + " 该用户找到意中人")
        elif re.findall(r"该用户正在约会中", body_text):
            # the original compared this case against the wrong pattern list,
            # so the branch could never fire
            user[0]['uid'] = user_id
            user[0]['nickname'] = "该用户正在约会中"  # user is currently dating
            spider_log("insert uid: " + user[0]['uid'] + " 该用户正在约会中")
    else:
        # global anchor: walk the DOM tree by sibling iteration; find() is
        # also used below, but find()-based anchors make the later sections
        # harder to traverse
        user_info = soup.find(class_='content_705')
        # self introduction
        self_intro_div = (user_info.div
                          .next_sibling.next_sibling
                          .next_sibling.next_sibling
                          .next_sibling.next_sibling)
        # love DNA
        self_DNA_div = (self_intro_div
                        .next_sibling.next_sibling
                        .next_sibling.next_sibling
                        .next_sibling.next_sibling
                        .next_sibling.next_sibling
                        .next_sibling.next_sibling)
        # partner requirements
        self_demand_div = (self_DNA_div
                           .next_sibling.next_sibling
                           .next_sibling.next_sibling
                           .next_sibling.next_sibling
                           .next_sibling.next_sibling
                           .next_sibling.next_sibling
                           .next_sibling.next_sibling)
        # lifestyle
        self_style_div = (self_demand_div
                          .next_sibling.next_sibling
                          .next_sibling.next_sibling
                          .next_sibling.next_sibling)
        # economic capacity
        self_economic_div = (self_style_div
                             .next_sibling.next_sibling
                             .next_sibling.next_sibling
                             .next_sibling.next_sibling)
        # work and study
        self_work_div = (self_economic_div
                         .next_sibling.next_sibling
                         .next_sibling.next_sibling
                         .next_sibling.next_sibling)
        # views on marriage
        self_marriage_div = (self_work_div
                             .next_sibling.next_sibling
                             .next_sibling.next_sibling
                             .next_sibling.next_sibling)

        # -----***-----
        # h4 data: nickname, uid
        tag = soup.h4.text
        user[0]['nickname'] = re.findall(r"(.+?)ID", tag)[0]
        user[0]['uid'] = re.findall(r"[0-9]+", tag)[0]
        # h6 data: charm
        user[0]['charm'] = soup.h6.text
        # user info
        tag = soup.find(class_='member_name').text
        user[0]['age'] = re.findall(r"[0-9]+", tag)[0]
        user[0]['marriage'] = re.findall(r"岁,(.+?),来", tag)[0]
        user[0]['province'] = soup.find(class_='member_name').a.text
        user[0]['city'] = soup.find(class_='member_name').a.next_sibling.text

        # user info details (an html list)
        tag = soup.find(class_='member_info_list fn-clear')
        education = tag.li
        user[0]['education'] = education.em.text
        height = education.next_sibling.next_sibling
        user[0]['height'] = height.em.text
        car = height.next_sibling.next_sibling
        user[0]['car'] = car.div.next_sibling.next_sibling.text.strip()
        salary = car.next_sibling.next_sibling
        # strip() drops the newline in front of the value
        user[0]['salary'] = salary.div.next_sibling.next_sibling.text.strip()
        house = salary.next_sibling.next_sibling
        user[0]['house'] = house.div.next_sibling.next_sibling.text.strip()
        weight = house.next_sibling.next_sibling
        user[0]['weight'] = weight.div.next_sibling.next_sibling.text.strip()
        constellation = weight.next_sibling.next_sibling
        user[0]['constellation'] = constellation.em.text
        minority = constellation.next_sibling.next_sibling
        user[0]['minority'] = minority.em.text
        zodiac = minority.next_sibling.next_sibling
        user[0]['zodiac'] = zodiac.em.text
        blood_type = zodiac.next_sibling.next_sibling
        user[0]['blood_type'] = blood_type.em.text

        # profile images: the url sits in the tag's attribute dict
        tag = soup.find(class_='big_pic fn-clear')
        attr = tag.ul.li.a.next_element.attrs
        user[1]['img_url'] = attr['_src']
        img_num = soup.find(class_='pho_ico').text
        user[1]['img_num'] = img_num.strip()  # remove surrounding whitespace

        # self introduction (renamed from `user_info` to avoid shadowing
        # the container div above)
        intro_text = soup.find(class_='js_text').text
        user[2] = intro_text.strip()

        # demand info
        try:
            tag = soup.find(class_='js_list fn-clear')
            demand_age = tag.li
            user[3]['demand_age'] = demand_age.div.text
            demand_height = demand_age.next_sibling.next_sibling
            user[3]['demand_height'] = demand_height.div.text
            demand_minority = demand_height.next_sibling.next_sibling
            user[3]['demand_minority'] = demand_minority.div.text
            demand_education = demand_minority.next_sibling.next_sibling
            user[3]['demand_education'] = demand_education.div.text
            demand_photo = demand_education.next_sibling.next_sibling
            user[3]['demand_photo'] = demand_photo.div.text
            demand_marriage = demand_photo.next_sibling.next_sibling
            user[3]['demand_marriage'] = demand_marriage.div.text
            demand_location = demand_marriage.next_sibling.next_sibling
            user[3]['demand_location'] = demand_location.div.text
            demand_sincerity = demand_location.next_sibling.next_sibling
            user[3]['demand_sincerity'] = demand_sincerity.div.text
        except AttributeError:
            # the user has no demand info
            pass

        # work info
        tag = self_work_div
        try:
            work_item = tag.ul
            position = work_item.li  # renamed from `work` to avoid shadowing the dict
            user[4]['position'] = position.em.text
            industry = position.next_sibling.next_sibling
            user[4]['industry'] = industry.em.text
        except AttributeError:
            # no work info
            pass
        # study info
        try:
            study_item = tag.ul.next_sibling.next_sibling.next_sibling.next_sibling
            university = study_item.li
            user[4]['university'] = university.em.text
            major = university.next_sibling.next_sibling
            user[4]['major'] = major.em.text
            language = major.next_sibling.next_sibling
            user[4]['language'] = language.em.text
        except AttributeError:
            # no study info
            pass

        # marriage info
        try:
            tag = self_marriage_div
            # about the user
            self_item = tag.ul
            about_self = self_item.li
            user[5]['origin'] = about_self.em.text
            residence = about_self.next_sibling.next_sibling
            user[5]['residence'] = residence.em.text
            nationality = residence.next_sibling.next_sibling
            user[5]['nationality'] = nationality.em.text
            personality = nationality.next_sibling.next_sibling
            user[5]['personality'] = personality.em.text
            humor = personality.next_sibling.next_sibling
            user[5]['humor'] = humor.em.text
            temper = humor.next_sibling.next_sibling
            user[5]['temper'] = temper.em.text
            marriage_attitude = temper.next_sibling.next_sibling
            user[5]['marriage_attitude'] = marriage_attitude.em.text
            kid = marriage_attitude.next_sibling.next_sibling
            user[5]['kid'] = kid.em.text
            marriage_time = kid.next_sibling.next_sibling
            user[5]['marriage_time'] = marriage_time.em.text
        except AttributeError:
            # no marriage info
            pass
        # about the family
        try:
            family_item = tag.ul.next_sibling.next_sibling.next_sibling.next_sibling
            share_house = family_item.li
            user[5]['share_house'] = share_house.em.text
            relatives = share_house.next_sibling.next_sibling
            user[5]['relatives'] = relatives.em.text
            parents = relatives.next_sibling.next_sibling
            user[5]['parents'] = parents.em.text
        except AttributeError:
            # no family info
            pass

    # the branches above staged both normal and abnormal user data in the dicts
    conn = sqlite3.connect(db_path)
    cu = conn.cursor()
    # create the table if it does not exist yet
    try:
        cu.execute(
            "CREATE TABLE m_user_table(id INTEGER PRIMARY KEY AUTOINCREMENT, "
            "uid TEXT, nickname CHAR(50), charm INTEGER, age INTEGER, "
            "height INTEGER, weight INTEGER, province CHAR(5), city CHAR(5), "
            "education CHAR(50), salary CHAR(50), car CHAR(10), house CHAR(10), "
            "constellation CHAR(5), minority CHAR(2), zodiac CHAR(3), "
            "blood_type CHAR(3), marriage CHAR(3), img_url TEXT, "
            "img_num INTEGER, self_intro TEXT)")
    except sqlite3.OperationalError:
        # table already exists
        pass
    try:
        # TODO: only user-info persistence is implemented so far; persist the
        # remaining sections next, then integrate into scrapy and write up the
        # dev log. Keeping raw_data only on disk would eat a lot of space, so
        # the database stays the primary store.
        # parameterized insert; the old string concatenation broke on values
        # containing quotes
        cu.execute(
            "insert into m_user_table(uid, nickname, charm, age, height, weight, "
            "province, city, education, salary, car, house, constellation, "
            "minority, zodiac, blood_type, marriage, img_url, img_num, self_intro) "
            "values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (user[0]['uid'], user[0]['nickname'], user[0]['charm'],
             user[0]['age'], user[0]['height'], user[0]['weight'],
             user[0]['province'], user[0]['city'], user[0]['education'],
             user[0]['salary'], user[0]['car'], user[0]['house'],
             user[0]['constellation'], user[0]['minority'], user[0]['zodiac'],
             user[0]['blood_type'], user[0]['marriage'], user[1]['img_url'],
             user[1]['img_num'], user[2]))
        spider_log("insert uid: " + user[0]['uid'])
    except sqlite3.Error:
        spider_log("insert error. uid: " + user[0]['uid'])
    cu.close()
    conn.commit()  # data is only written after commit()
    conn.close()

    # also dump the raw response to disk
    file_name = user_id
    #file_name = user[0]['uid']  # would miss users whose info could not be fetched
    file_path = ("E:/02 Python/01 crawl/jiayuanspider/04 user_data/raw_data/"
                 "m_user/" + file_name + ".txt")
    # the original code hit a bug here: with Windows' default gbk codec,
    # nicknames containing non-gbk characters crashed the write (the database
    # row was unaffected); an explicit utf-8 encoding avoids it
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(user_raw_data)
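# The long next_sibling chains in user_process are brittle: any change in
# the page's whitespace or markup shifts every offset. A sketch of a more
# robust anchor (section_divs is a hypothetical helper; the 'content_705'
# class comes from the code above, everything else is an assumption):
def section_divs(soup):
    """Collect the profile container's direct child divs by position."""
    container = soup.find(class_='content_705')
    # recursive=False keeps only direct children and skips text nodes
    return container.find_all('div', recursive=False) if container else []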