def parse_review(self, response):
    """Parse one page of Amazon customer reviews and yield Review items.

    response.meta carries 'asin' (product id) and 'page' (current page
    number).  A full page (10 reviews) triggers a request for the next
    page via self.review_url.
    """
    hxs = Selector(response)
    asin = response.meta['asin']
    title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
    title = title.replace(u'Amazon.com: Customer Reviews: ', '')
    rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
    for div in rlist:
        r = Review()
        r['product_id'] = asin
        r['product_name'] = title
        r['review_id'] = first_item(div.xpath('@id').extract())
        votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
        # BUG FIX: the original pattern had a single capture group but the
        # code read match.group(2), raising IndexError on every match.
        # Amazon renders either "N people found this helpful" or
        # "H of N people found this helpful"; accept both forms.
        match = re.search(u'(?:(\\S+) of )?(\\S+) people found this helpful', votes, re.I)
        if match:
            r['total_feedback_num'] = match.group(2)
            # short form has no "H of" part: every voter found it helpful
            r['total_helpful_num'] = match.group(1) or match.group(2)
        #
        r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
        r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
        r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
        r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
        #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace('年', '/').replace(u'月', '/').replace(u'日', '/')
        r['body'] = first_item(div.xpath("div[5]/span").extract())
        yield r
    #下一页 — a full page suggests more reviews may follow
    if len(rlist) == 10:
        page = response.meta['page'] + 1
        log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
        yield Request(url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
                      callback=self.parse_review,
                      headers=self.headers,
                      meta={'page': page, 'asin': asin})
def parse_review(self, response):
    """Parse a GBK-encoded JSON review feed and yield Review items.

    response.meta carries 'sku' and 'page'; a full page (10 comments)
    triggers a request for the next page.
    """
    data = response.body
    if data == '':
        log.msg(format='%(request)s post fail.response is empty.',
                level=log.ERROR, request=response.url)
        return
    try:
        data = data.decode('GBK', 'ignore')
        js = json.loads(data)
    except ValueError:
        # narrowed from a bare except: json.loads raises ValueError on bad data
        log.msg(u'图书[%s]评论请求结果解析异常,非json数据.url=%s' % (response.meta['sku'], response.url), level=log.INFO)
        return
    for item in js['comments']:
        r = Review()
        r['product_id'] = item['referenceId']
        r['product_name'] = item['referenceName']
        r['review_id'] = item['id']
        # dict.get / `in` replace the deprecated, Python-2-only dict.has_key
        r['title'] = item.get('title', '')
        r['body'] = FmtSQLCharater(item['content'])
        r['creation_date'] = item['creationTime']
        r['score'] = item['score']
        r['cust_name'] = FmtSQLCharater(item['nickname'])
        r['cust_lev'] = item['userLevelName']
        r['cust_level_simple'] = item['userLevelId']
        r['cust_img'] = item['userImageUrl']
        r['comment_tags'] = '#'.join(t['name'] for t in item['commentTags']) if 'commentTags' in item else ''
        r['images'] = '#'.join(i['imgUrl'] for i in item['images']) if 'images' in item else ''
        r['total_feedback_num'] = item['replyCount']
        r['total_helpful_num'] = item['usefulVoteCount']
        yield r
    #下一页
    if len(js['comments']) == 10:
        sku = response.meta['sku']
        page = response.meta['page'] + 1
        log.msg(u'请求商品[%s]的第[%d]页评论...' % (sku, page))
        yield Request(url=self.review_url.replace('<?sku?>', sku).replace('<?page?>', str(page)),
                      callback=self.parse_review,
                      headers=self.headers,
                      meta={'page': page, 'sku': sku})
def parse_desc(self, response):
    """Parse the JSONP book-description payload, fill the Book item from
    response.meta['b'], then request the price page.
    """
    data = response.body
    data = data.decode('GBK', 'ignore')
    # strip the JSONP wrapper: "callback(" prefix (9 chars) and trailing ")"
    data = data[9:-1]
    try:
        js = json.loads(data)
    except ValueError:
        # narrowed from a bare except
        log.msg(u'图书[%s]描述请求结果解析异常,非json数据.url=%s' % (response.meta['b']['product_id'], response.url), level=log.INFO)
        return
    b = response.meta['b']
    hxs = Selector(None, js['content'])
    # each item field lives in a detail-tag section, numbered by the site
    # (7 is unused upstream, hence the gap)
    sections = (
        ('product_features', 1),
        ('abstract', 2),
        ('content', 3),
        ('authorintro', 4),
        ('extract', 5),
        ('catalog', 6),
        ('more_information', 8),
    )
    for field, tag in sections:
        b[field] = first_item(hxs.xpath(
            "//div[@id='detail-tag-id-%d']/div[2]/div[@class='book-detail-content']" % tag
        ).extract())
    # aliases taken from the raw extractions
    b['recommendation'] = b['abstract']
    b['brief_introduction'] = b['content']
    # NOTE: 'abstract' itself is deliberately left unformatted — its
    # FmtSQLCharater call was commented out in the original code.
    for field in ('catalog', 'recommendation', 'content', 'brief_introduction',
                  'authorintro', 'extract', 'more_information'):
        b[field] = FmtSQLCharater(b[field])
    log.msg(u'请求商品[%s]的价格信息...' % b['product_id'])
    yield Request(url=self.price_url.replace('<?sku?>', b['product_id']),
                  callback=self.parse_price,
                  headers=self.headers,
                  meta={'b': b})
def parse_review(self, response):
    """Parse a JSON review-list page and yield Review items.

    Follows the server-provided 'next' page number in js['pageinfo'].
    """
    data = response.body
    if data == '' or data == '[]':
        log.msg(format='%(request)s post fail.response is [].', level=log.ERROR, request=response.url)
        return
    try:
        js = json.loads(data)
    except ValueError:
        # narrowed from a bare except
        log.msg(
            u'图书[%s]评论页码[%d]请求结果解析异常,非json数据.url=%s' %
            (response.meta['pid'], response.meta['page'], response.url),
            level=log.INFO)
        return
    # `in` replaces the deprecated, Python-2-only dict.has_key
    if 'review_list' in js and js['review_list'] is not None:
        log.msg(u'评论请求职位ID[%s]的第%d页,总数=%d' % (response.meta['pid'], response.meta['page'], len(js['review_list'])))
        for review in js['review_list']:
            r = Review()
            r['product_name'] = FmtSQLCharater(js['product']['name'])
            for (key, value) in review.items():
                if key == 'stars':
                    # flatten the nested star rating
                    r['full_star'] = value['full_star']
                    r['has_half_star'] = value['has_half_star']
                elif key in ('experience_ids', 'point_items'):
                    continue  # not stored
                elif key in ('body', 'title'):
                    r[key] = FmtSQLCharater(value)
                else:
                    r[key] = value
            yield r
    #下一页
    if 'next' in js['pageinfo']:
        pid = response.meta['pid']
        page = js['pageinfo']['next']
        log.msg(u'评论请求职位ID[%s]的第%d页' % (pid, page))
        yield Request(url=self.review_url.replace('<?pid?>', pid).replace('<?page?>', str(page)),
                      callback=self.parse_review,
                      headers=self.headers,
                      meta={'page': page, 'pid': pid})
def process_item(self, item, spider):
    """Persist DuoX* items (school / teacher / course / comment) to MySQL.

    WARNING(security): the SQL is built by %-interpolation and only some
    text fields pass through FmtSQLCharater, so quoting/injection issues
    are possible.  TQDbPool.execute takes a single SQL string, so this is
    flagged here rather than parameterized.
    """
    item_name = item.__class__.__name__
    commandtext = ''
    #学校
    if item_name == 'DuoXSchool':
        commandtext = '''insert ignore into duox_school(id, fullName, companyName, mainCourse, province, city, zone, lat, lng, address, business, contact, phone, discription, brandAward, brandHistory, brandSchoolCount, brandStudentCount, envArea, envFacilities, envFitment, envHealth, envPantry, envParentRest, envType, serviceDetail, teacherAge, teacherCount, teacherQualifier, schoolImage, imageTurn) values(%s, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')''' % (
            item['id'], item['fullName'], item['companyName'], item['mainCourse'],
            item['province'], item['city'], item['zone'], item['lat'], item['lng'],
            item['address'], item['business'], item['contact'], item['phone'],
            FmtSQLCharater(item['discription']), item['brandAward'],
            item['brandHistory'], item['brandSchoolCount'], item['brandStudentCount'],
            item['envArea'], item['envFacilities'], item['envFitment'],
            item['envHealth'], item['envPantry'], item['envParentRest'],
            item['envType'], item['serviceDetail'], item['teacherAge'],
            item['teacherCount'], item['teacherQualifier'], item['schoolImage'],
            item['imageTurn'])
    #教师
    elif item_name == 'DuoXTeacher':
        commandtext = "insert ignore into duox_teacher(s_id, id, teacherName, image) values(%s, %s, '%s', '%s')" % (
            item['s_id'], item['id'], item['teacherName'], item['image'])
    #课程
    elif item_name == 'DuoXCourse':
        commandtext = '''insert ignore into duox_course(s_id, id, province, city, zone, schoolFullName, courseName, lat, lng, typeName1, typeName2, ageStart, ageEnd, perPrice, packagePrice, needBook, studentCount, courseImage, discount, address, business, courseDes, imageTurn, priceList) values(%s, %s, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')''' % (
            item['s_id'], item['id'], item['province'], item['city'], item['zone'],
            item['schoolFullName'], FmtSQLCharater(item['courseName']),
            item['lat'], item['lng'], item['typeName1'], item['typeName2'],
            item['ageStart'], item['ageEnd'], item['perPrice'],
            item['packagePrice'], item['needBook'], item['studentCount'],
            item['courseImage'], item['discount'], item['address'],
            item['business'], FmtSQLCharater(item['courseDes']),
            item['imageTurn'], item['priceList'])
    #评论
    elif item_name == 'DuoXComment':
        commandtext = '''insert ignore into duox_comment(s_id, s_name, id, commentText, commentType, contactName, contactPhone, createTime, typeId, typeName) values(%s, '%s', %d, '%s', %s, '%s', '%s', %s, %s, '%s')''' % (
            item['s_id'], item['s_name'], item['id'],
            FmtSQLCharater(item['commentText']), item['commentType'],
            item['contactName'], item['contactPhone'], item['createTime'],
            item['typeId'], FmtSQLCharater(item['typeName']))
    #
    if commandtext != '':
        ret = TQDbPool.execute('GSX', commandtext)
        if ret is None:
            log.msg(u'sql指令执行失败,出现异常:dbname=%s,commamd=%s' % ('GSX', commandtext), level=log.ERROR)
        elif ret == -2:
            # BUG FIX: scrapy.log has no `error` function (and it would not
            # accept a `level` kwarg) — the original line raised
            # AttributeError; use log.msg with level=log.ERROR as above.
            log.msg(u'数据库连接失败导致执行失败:dbname=%s,commamd=%s' % ('GSX', commandtext), level=log.ERROR)
    return item
def parse_co(self, response):
    """Parse a company-detail XML response; yield the Company item, then
    re-yield the Job carried in response.meta so it is stored downstream.
    """
    try:
        msg = ET.fromstring(response.body)
    except ET.ParseError:
        # narrowed from `except BaseException` (which also swallowed
        # KeyboardInterrupt); fromstring raises ParseError on bad XML
        log.msg(u'企业编号<{}>返回结果非法!'.format(response.meta['coid']), level=log.ERROR)
        return
    # BUG FIX: this assignment had been commented out, but `job` is read
    # below (CityName/AreaCode) and yielded — restored to avoid NameError.
    job = response.meta['job']
    if text_(msg.find('result')) == '1':
        item = msg.find('resultbody')
        co = Company()
        co['SiteID'] = self.site
        co['company_id'] = text_(item.find('coid'))
        co['CompanyName'] = FmtCmpNameCharacter(text_(item.find('coname')))
        # industry is split across two optional elements
        co['Industry'] = ''
        if item.find('indtype1').text:
            co['Industry'] = text_(item.find('indtype1'))
        if item.find('indtype2').text:
            co['Industry'] += ',' + text_(item.find('indtype2'))
        co['CompanyType'] = text_(item.find('cotype'))
        co['CompanyScale'] = text_(item.find('cosize'))
        co['CompanyAddress'] = FmtSQLCharater(text_(item.find('caddr')))
        co['CompanyDesc'] = FmtSQLCharater(text_(item.find('coinfo')))
        co['CompanyUrl'] = text_(item.find('cojumpurl'))
        co['CompanyLogoUrl'] = text_(item.find('logo'))
        co['GisLongitude'] = text_(item.find('lon'))
        co['GisLatitude'] = text_(item.find('lat'))
        # location comes from the job, not the XML
        co['CityName'] = job['CityName']
        co['AreaCode'] = job['AreaCode']
        # fields not supplied by this source — filled with defaults
        co['Relation'] = ''
        co['Mobile'] = ''
        co['Credibility'] = ''
        co['Licensed'] = ''
        co['Yan'] = ''
        co['FangXin'] = ''
        co['Email'] = ''
        co['PraiseRate'] = '0'
        co['UserId'] = ''
        co['UserName'] = ''
        co['ProvinceName'] = ''
        co['WorkArea1'] = ''
        co['AreaCode1'] = ''
        yield co
        yield job
def parse_info_list(self, resp):
    """Parse one JSON list page into User/Topic/Theme items and paginate.

    Each source record is fanned out into the three item types, copying
    only the keys each item class declares.
    """
    meta = resp.meta
    log.msg(u'已下载<%s>第<%d>页,开始解析...' % (meta['name'], meta['page']), log.INFO)
    ret = json.loads(resp.body)
    # renamed from `list`, which shadowed the builtin
    entries = ret['list']
    log.msg(u'<%s>第<%d>页数量<%d>' % (meta['name'], meta['page'], len(entries)))
    for item in entries:
        field_names = item.keys()
        #
        topic = Topic()
        topic_fields = topic.fields.keys()
        user = User()
        user_fields = user.fields.keys()
        theme = Theme()
        theme_fields = theme.fields.keys()
        #
        for field in field_names:
            # 'top_cmt'/'themes' are nested structures, not topic columns
            if field in topic_fields and field not in ('top_cmt', 'themes'):
                topic[field] = FmtSQLCharater(item[field])
            if field in user_fields:
                user[field] = FmtSQLCharater(item[field])
            if field in theme_fields:
                theme[field] = FmtSQLCharater(item[field])
        #
        yield user
        yield topic
        yield theme
    #
    # 下一页请求 — stop at MAX_PAGE or when the page came back empty
    if len(entries) > 0 and meta['page'] < MAX_PAGE:
        meta['page'] += 1
        meta['maxtime'] = ret['info']['maxtime']
        yield self._createNextRequest(meta)
def parse_info(self, response):
    """Parse a validated job-detail JSON response and yield a Job item."""
    (result, js) = self._validate_response(response)
    if not result:
        return
    if 1 != js['jsonStatus']:
        self.did_failed_request(response, js['message'])
        return
    j = Job()
    j['Published'] = response.meta['published']
    j['CityId'] = response.meta['region']['id']
    j['CityName'] = response.meta['region']['name']
    for (key, value) in js['result'].iteritems():
        if key in ('Title', 'Description'):
            # free-text fields need SQL escaping
            j[key] = FmtSQLCharater(value)
        elif key != 'AllApplies':
            j[key] = value
    yield j
def parse_review(self, response):
    """Parse one page of Amazon customer reviews and yield Review items.

    Identical role to the other Amazon parse_review in this project:
    meta carries 'asin' and 'page'; a full page of 10 reviews triggers
    the next-page request.
    """
    hxs = Selector(response)
    asin = response.meta['asin']
    title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
    title = title.replace(u'Amazon.com: Customer Reviews: ', '')
    rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
    for div in rlist:
        r = Review()
        r['product_id'] = asin
        r['product_name'] = title
        r['review_id'] = first_item(div.xpath('@id').extract())
        votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
        # BUG FIX: the original pattern captured one group but the code
        # read match.group(2) → IndexError on every match.  Accept both
        # "N people found this helpful" and "H of N people found this helpful".
        match = re.search(u'(?:(\\S+) of )?(\\S+) people found this helpful', votes, re.I)
        if match:
            r['total_feedback_num'] = match.group(2)
            # short form: all voters found the review helpful
            r['total_helpful_num'] = match.group(1) or match.group(2)
        #
        r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
        r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
        r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
        r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
        #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace('年', '/').replace(u'月', '/').replace(u'日', '/')
        r['body'] = first_item(div.xpath("div[5]/span").extract())
        yield r
    #下一页
    if len(rlist) == 10:
        page = response.meta['page'] + 1
        log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
        yield Request(url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
                      callback=self.parse_review,
                      headers=self.headers,
                      meta={'page': page, 'asin': asin})
def parse_info(self, response):
    """Parse a part-time-job detail JSON; yield the Job item, then request
    the company detail page for its company_im_id.
    """
    data = response.body
    if data == '' or data == '[]':
        log.msg(format='%(request)s post fail.response is [].', level=log.ERROR, request=response.url)
        return
    try:
        js = json.loads(data)
    except ValueError:
        # narrowed from a bare except.
        # NOTE(review): %d assumes meta['aid'] is an int — confirm at the caller.
        log.msg(u'兼职职位[%d]请求结果解析异常,非json数据.url=%s' % (response.meta['aid'], response.url), level=log.INFO)
        return
    if 1 == js['status']:
        j = Job()
        for (key, value) in js['activityDetail'].items():
            if key in ('salary', 'avatar', 'banner_kw'):
                continue  # not stored
            if value is None:
                # only these two nullable keys get defaults; other None
                # values are deliberately dropped (field left unset)
                if key == 'group_id':
                    j[key] = -1
                elif key == 'post_time':
                    j[key] = ''
            elif key == 'require_info':
                j[key] = FmtSQLCharater(value)
            else:
                j[key] = value
        yield j
        #
        cid = js['activityDetail']['company_im_id']
        yield Request(url=self.cmp_url.replace('<?cid?>', str(cid)),
                      callback=self.parse_cmp,
                      headers=self.headers,
                      dont_filter=True,
                      meta={'cid': cid})
    else:
        log.msg(js['msg'])
def parse_info(self, response):
    """Parse a job-detail HTML page; yield JobsDB_Company then JobsDB_Job.

    Most job attributes ride in via response.meta from the listing page;
    only company name and description are scraped here.
    """
    if response.status != 200:
        log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
        return
    data = response.body
    hxs = Selector(None, data)
    #开始解析
    linkid = response.meta['linkid']
    # BUG FIX: the following meta reads had been commented out, but every
    # one of these names is used below — the function raised NameError.
    title = response.meta['title']
    logourl = response.meta['logourl']
    location = response.meta['location']
    function = response.meta['f']
    postdate = response.meta['postdate']
    #
    companyname = first_item(hxs.xpath(
        '//div[@class="additional_info"]/span[@class="company"]/a/text()').extract())
    companyname = companyname.strip(' ')
    if companyname == '':
        log.msg(u'该职位来源其他网站(%s),无法抓取.' % response.url, level=log.ERROR)
        return
    #
    desc = first_item(hxs.xpath('//div[@class="p-description"]').extract())
    # BUG FIX: str.lstrip/rstrip strip a *character set*, not a literal
    # prefix/suffix, and could also eat description text made of those
    # characters; remove the wrapper tags explicitly instead.
    prefix = '<div class="p-description">'
    if desc.startswith(prefix):
        desc = desc[len(prefix):]
    suffix = '</div>'
    if desc.endswith(suffix):
        desc = desc[:-len(suffix)]
    desc = desc.replace('\t', '')
    #
    title = FmtSQLCharater(title)
    companyname = FmtSQLCharater(companyname)
    location = FmtSQLCharater(location)
    #
    job = JobsDB_Job()
    job['SiteID'] = self.site_id
    job['LinkID'] = linkid
    job['JobTitle'] = title
    job['Company'] = companyname
    job['JobName'] = function
    job['JobDesc'] = FmtSQLCharater(desc)
    job['JobType'] = 1
    job['SrcUrl'] = response.url
    job['Number'] = 'one person'
    #时间格式化 — fall back to today when the listing gave no date
    if postdate == '':
        postdate = datetime.today()
    else:
        postdate = datetime.strptime(postdate, '%Y-%m-%d')
    job['PublishTime'] = postdate
    job['RefreshTime'] = postdate
    job['CityName'] = location
    job['WorkArea'] = job['CityName']
    #
    company = JobsDB_Company()
    company['WebSiteID'] = self.site_id
    company['CompanyName'] = companyname
    company['CompanyLogoUrl'] = logourl
    company['AreaName'] = job['CityName']
    #
    yield company
    yield job
def parse_info(self, response):
    """Parse a job-posting page; yield JobsDB_Company then JobsDB_Job."""
    if response.status != 200:
        log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
        return
    data = response.body
    hxs = Selector(response)
    #开始解析
    title = first_item(hxs.xpath('//h1[@class="entry-title mt_title1"]/text()').extract())
    companyname = first_item(hxs.xpath('//span[@class="entry-author"]/text()').extract())
    # rstrip(' - ') strips the characters {space,-} from the end, which is
    # what the trailing " - " separator needs here
    companyname = companyname.rstrip(' - ')
    # BUG FIX: salary/address/phone/website/logourl were referenced below
    # but never assigned anywhere in this function → NameError.  Default
    # them to empty strings.
    # TODO(review): scrape the real values from the page if it carries them.
    salary = ''
    address = ''
    phone = ''
    website = ''
    logourl = ''
    # Location row of the details table
    match = re.search(r'^<td.+>Location</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
    if match:
        location = match.group(1)
        # keep only the city part of "City, Country"
        if location.find(', ') > 0:
            location = location.split(',')[0]
    else:
        location = ''
    # Posted-date row
    match = re.search(r'^<td.+>Posted</td>\s+<td.+>(.+)</td>$', data, re.I | re.M)
    if match:
        postdate = match.group(1)
    else:
        postdate = ''
    #
    jobdesc = first_item(hxs.xpath(
        '//div[@class="user-page mt_content1"]/div[@class="mt_content1"]').extract())
    linkid = first_item(hxs.xpath('//input[@id="uid"]/@value').extract())
    #
    title = FmtSQLCharater(title)
    companyname = FmtSQLCharater(companyname)
    location = FmtSQLCharater(location)
    #
    job = JobsDB_Job()
    job['SiteID'] = self.site_id
    job['LinkID'] = linkid
    job['JobTitle'] = title
    job['Company'] = companyname
    job['JobName'] = response.meta['sector']
    job['JobDesc'] = FmtSQLCharater(jobdesc)
    job['Salary'] = salary
    job['JobType'] = 1
    job['SrcUrl'] = response.url
    job['Number'] = 'one person'
    #时间格式化 — e.g. "21 Jul 15"
    if postdate == '':
        postdate = datetime.today()
    else:
        postdate = datetime.strptime(postdate, '%d %b %y')
    job['PublishTime'] = postdate
    job['RefreshTime'] = postdate
    job['CityName'] = location
    job['WorkArea'] = job['CityName']
    job['JobAddress'] = address
    job['Mobile'] = phone
    #
    company = JobsDB_Company()
    company['WebSiteID'] = self.site_id
    company['CompanyName'] = companyname
    company['CompanyAddress'] = address
    company['WebSite'] = website
    company['CompanyLogoUrl'] = logourl
    company['AreaName'] = job['CityName']
    company['Mobile'] = phone
    #
    yield company
    yield job
def parse_info(self, response):
    """Parse a job-detail page; yields a JobsDB_Company then a JobsDB_Job.

    The summary table has no stable element ids, so most fields are pulled
    out of the raw HTML with regexes keyed on the section headings.
    """
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        #开始解析
        # the job id is embedded in an inline script variable `foldr`
        match = re.search(r"^var foldr = '(.+)';", data, re.I | re.M)
        if match:
            linkid = match.group(1)
        else:
            linkid = ''
        if linkid == '':
            log.msg(u'页面没有找到职位ID,丢弃。%s' % response.url, log.ERROR)
            return
        else:
            log.msg(u'找到职位,ID=[%s]' % linkid)
        #
        title = first_item(hxs.xpath('//div[@class="ns_jd_headingbig hl"]/h1/strong/text()').extract())
        title = title.rstrip(' ')
        logourl = first_item(hxs.xpath('//div[@class="ns_jd_comp_logo"]/img/@src').extract())
        companyname = first_item(hxs.xpath('//span[@class="ns_comp_name"]/text()').extract())
        #Locations
        match = re.search(r'<strong>Locations</strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            location = match.group(1)
        else:
            location = ''
        #Experience
        match = re.search(r'<strong>Experience </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            experience = match.group(1)
        else:
            experience = ''
        #Keywords / Skills (this section's div carries extra attributes)
        match = re.search(r'<strong>Keywords / Skills </strong></h2></div>\s+<div class="ns_jobsum_txt"\s.+>(.+)\s</div>', data, re.I | re.M)
        if match:
            skills = match.group(1)
        else:
            skills = ''
        #Education
        match = re.search(r'<strong>Education </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            education = match.group(1)
        else:
            education = ''
        #Function — the ' • ' separator looks like a mojibake'd bullet
        # ("•" decoded as Latin-1); it is replaced by '*' on purpose here
        match = re.search(r'<strong>Function </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            function = match.group(1)
            function = function.replace(' • ', '*')
            function = function.replace('<br />', '')
        else:
            function = ''
        #Role — same separator handling as Function
        match = re.search(r'<strong>Role </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            role = match.group(1)
            role = role.replace(' • ', '*')
            role = role.replace('<br />', '')
        else:
            role = ''
        #Industry — bullets dropped, line breaks become ';'
        match = re.search(r'<strong>Industry </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>', data, re.I | re.M)
        if match:
            industry = match.group(1)
            industry = industry.replace(' • ', '')
            industry = industry.replace('<br />', ';')
        else:
            industry = ''
        #Summary
        match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)</div>', data, re.I | re.M)
        if match:
            summary = match.group(1)
        else:
            #存在中途换行的情况 — retry allowing the text to wrap over one line break
            match = re.search(r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+\s+.+)</div>', data, re.I | re.M)
            if match:
                summary = match.group(1)
            else:
                summary = ''
        #
        match = re.search(r'<strong>Posted On </strong></h2></div>\s+<div class="ns_jobsum_txt">\s(.+)\s</div>\t', data, re.I | re.M)
        if match:
            postdate = match.group(1)
        else:
            postdate = ''
        #
        # first ns_jobdesc block is the job description, second (if any)
        # is the company description
        desc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()
        if desc:
            jobdesc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()[0]
        else:
            jobdesc = ''
        #
        if desc and len(desc) > 1:
            comdesc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()[1]
        else:
            comdesc = ''
        #
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = linkid
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = function
        job['JobDesc'] = FmtSQLCharater(summary + '<p>' + jobdesc)
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        #时间格式化 — strip the ordinal suffix ("1st" -> "1") before parsing;
        # NOTE(review): the blanket replaces would also hit 'st'/'nd'/'rd'/'th'
        # inside month names, but no %b abbreviation contains them
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = postdate.replace('st', '')
            postdate = postdate.replace('nd', '')
            postdate = postdate.replace('rd', '')
            postdate = postdate.replace('th', '')
            postdate = datetime.strptime(postdate, '%d %b %Y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobComputerSkill'] = skills
        job['Exercise'] = experience
        job['Eduacation'] = education  # (sic) field name defined with this spelling
        job['JobFunction'] = role
        job['Industry'] = industry
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['Industry'] = industry
        company['CompanyLogoUrl'] = logourl
        company['CompanyDesc'] = FmtSQLCharater(comdesc)
        company['AreaName'] = job['CityName']
        #
        yield company
        yield job
    else:
        log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
def parse_job(self, response):
    """Parse a Zhaopin-style job-detail JSON; yields a Company then a WebJob.

    Skips the job early when exist_linkid says this (Number, publish-time)
    pair was already stored.
    """
    data = response.body
    if data == '' or data == '[]':
        log.msg(format='%(request)s get fail.response is [].', level=log.ERROR, request=response.url)
        return
    js = json.loads('{}')  # placeholder so js exists even if parsing fails
    try:
        js = json.loads(data)
    except:
        log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url, level=log.INFO)
        return
    #列表解析
    if js['StatusCode'] == 200:
        #
        pd = js['PositionDetail']
        #过滤没有emaillist的职位
        #if pd['EmailList'] == '':
        #    log.msg(u'职位[%s-%s]没有email地址,丢弃.' % (pd['Number'], pd['Name']))
        #    return
        #
        cd = js['CompanyDetail']
        cr = js['Coordinate']
        #
        # normalise "2015-07-13T08:00:00.123Z" to "2015-07-13 08:00:00"
        publish = pd['DateStart'].replace('T', ' ')
        publish = publish.replace('Z', '')
        pos = publish.rfind('.')
        if pos > 0:
            # negative end index: equivalent to publish[:pos] — drops the
            # fractional-seconds part
            publish = publish[0: pos - len(publish)]
        publish = datetime.datetime.strptime(publish, '%Y-%m-%d %H:%M:%S')
        #
        # dedupe on (site, job number, publish timestamp)
        if exist_linkid(self.site, pd['Number'], int(mktime(publish.timetuple()))):
            return
        #
        j = WebJob()
        j['SiteID'] = self.site
        j['JobTitle'] = pd['Name']
        j['Company'] = pd['CompanyName']
        j['PublishTime'] = publish
        j['RefreshTime'] = publish
        j['ClickTimes'] = 0
        #依据智联职位类别查找SQL Server职位类别代码与职位名称
        # (map the site's category id to our "code#name" pair via redis)
        zJob = int(pd['SubJobType'])
        zJobClassName = FmtJobPositionWithPrefix('redis_cache_1', self.company_prefix, zJob)
        if zJobClassName != '':
            j['JobCode'] = zJobClassName.split('#')[0]
            j['JobName'] = zJobClassName.split('#')[1]
        else:
            log.msg(u'职位类别=%d,在redis上没有查找到对应的职位类别代码与职位名称' % zJob, level=log.ERROR)
            return
        #
        j['Salary'] = pd['Salary']
        j['SalaryType'] = 0
        j['Eduacation'] = pd['Education']  # (sic) field name defined with this spelling
        j['Number'] = '%d人' % pd['RecruitNumber']
        j['Exercise'] = pd['WorkingExp']
        # WelfareTab is a list of single-entry dicts; join their values
        if pd.has_key('WelfareTab'):
            j['SSWelfare'] = ','.join(map(lambda wel: wel.values()[0], pd['WelfareTab']))
        else:
            j['SSWelfare'] = ''
        j['SBWelfare'] = ''
        j['OtherWelfare'] = ''
        j['JobDesc'] = pd['Description']
        j['Relation'] = ''
        j['Mobile'] = ''
        j['Email'] = pd['EmailList']
        j['JobAddress'] = FmtSQLCharater(cd['Address'])
        j['InsertTime'] = datetime.datetime.today()
        j['Sex'] = u'不限'
        j['LinkID'] = pd['Number']
        j['Tag'] = ''
        j['ProvinceName'] = ''
        j['CityName'] = pd['WorkCity']
        j['WorkArea'] = pd['WorkCity']
        if pd.has_key('CityDistrict'):
            j['WorkArea1'] = pd['CityDistrict']
        else:
            j['WorkArea1'] = ''
        j['WorkArea2'] = ''
        j['CompanyLink'] = self.company_prefix + pd['CompanyNumber']
        # full-time and internships are type 1, everything else type 2
        if pd['WorkType'] == u'全职' or pd['WorkType'] == u'实习':
            j['JobType'] = 1
        else:
            j['JobType'] = 2
        j['SyncStatus'] = 0
        j['SrcUrl'] = response.url
        j['GisLongitude'] = cr['Longitude']
        j['GisLatitude'] = cr['Latitude']
        j['StartDate'] = pd['DateStart']
        j['EndDate'] = pd['DateEnd']
        #其他默认信息 (remaining fields get defaults)
        j['AnFmtID'] = 0
        j['KeyValue'] = ''
        if cd['Industry']:
            j['Industry'] = cd['Industry']
        else:
            j['Industry'] = ''
        j['CompanyType'] = cd['Property']
        j['CompanyScale'] = cd['CompanySize']
        j['Require'] = u'招%s|学历%s|经验%s|性别%s' % (j['Number'], j['Eduacation'], j['Exercise'], j['Sex'])
        j['Telphone1'] = ''
        j['Telphone2'] = ''
        j['Age'] = 0
        j['ValidDate'] = ''
        j['ParentName'] = ''
        j['EduacationValue'] = 0
        j['SalaryMin'] = 0.0
        j['SalaryMax'] = 0.0
        j['NumberValue'] = 0
        j['SexValue'] = 0
        j['OperStatus'] = 0
        j['LastModifyTime'] = datetime.datetime.today()
        j['PropertyTag'] = ''
        j['SalaryValue'] = 0
        j['ExerciseValue'] = 0
        j['Valid'] = 'T'
        j['JobWorkTime'] = ''
        j['JobComputerSkill'] = ''
        j['ForeignLanguage'] = ''
        j['JobFunction'] = ''
        j['JobRequest'] = ''
        j['BusinessCode'] = ''
        #企业信息 (company item)
        c = Company()
        c['SiteID'] = self.site
        c['company_id'] = self.company_prefix + cd['Number']
        c['Credibility'] = ''
        c['Licensed'] = ''
        c['Yan'] = ''
        c['FangXin'] = ''
        c['CompanyName'] = cd['Name']
        c['CityName'] = cd['CityName']
        c['AreaCode'] = ''
        c['Relation'] = ''
        c['Mobile'] = ''
        c['Industry'] = cd['Industry']
        c['CompanyType'] = cd['Property']
        c['CompanyScale'] = cd['CompanySize']
        c['CompanyAddress'] = cd['Address']
        c['CompanyDesc'] = cd['Description']
        c['CompanyUrl'] = cd['Url']
        if cd['companyLogo']:
            c['CompanyLogoUrl'] = cd['companyLogo']
        else:
            c['CompanyLogoUrl'] = ''
        c['Email'] = ''
        c['PraiseRate'] = '0'
        c['GisLongitude'] = cr['Longitude']
        c['GisLatitude'] = cr['Latitude']
        c['UserId'] = ''
        c['UserName'] = ''
        c['ProvinceName'] = ''
        c['WorkArea1'] = cd['CityName']
        c['AreaCode1'] = ''
        #
        log.msg(j['JobCode'] + '--' + j['JobName'])
        yield c
        yield j
    else:
        log.msg(u'职位详情请求失败,原因:%s.url=%s' % (js['StatusDescription'], response.url))
def parse_info(self, response):
    """Parse a job-detail JSON (overview HTML embedded in js['ovw']);
    yields a JobsDB_Company then a JobsDB_Job.
    """
    data = response.body
    js = BaseSpider.fmt_json(self, data)
    if js:
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        job['LinkID'] = js['jobid']
        job['JobTitle'] = FmtSQLCharater(js['jobttl'])
        job['Company'] = FmtSQLCharater(js['coym'])
        job['JobDesc'] = FmtSQLCharater(js['dsc'])
        job['SrcUrl'] = js['applyurl']
        ovw = js['ovw']
        # NOTE(review): this removes *every* space from the overview HTML,
        # yet the patterns below still contain spaces ("jd-label", "Min. ",
        # "Language : ") — possibly the original argument was a non-breaking
        # space (u'\xa0') that got mangled; confirm against live data.
        ovw = ovw.replace(' ', '')
        #
        match = re.search(r"<h3 class='jd-label'>Industry</h3>\n<p>(.*)</p>", ovw, re.I | re.M)
        if match:
            job['Industry'] = match.group(1)
        # NOTE(review): if this regex fails, job['Industry'] is never set but
        # is read below for company['Industry'] — that raises KeyError on a
        # scrapy Item; confirm Industry is always present.
        #
        match = re.search(r"<h3 class='jd-label'>Job Function</h3>\n<p>(.*)</p>", ovw, re.I | re.M)
        if match:
            job['JobName'] = match.group(1)
        #
        job['CityName'] = 'Singapore'
        job['WorkArea'] = 'Singapore'
        match = re.search(r"<h3 class='jd-label'>Work Region</h3>\n<p>(.*)</p>", ovw, re.I | re.M)
        if match:
            job['WorkArea1'] = match.group(1).replace('Singapore - ', '')
        #
        match = re.search(r"<h3 class='jd-label'>Job Type</h3>\n<p>(.*)</p>", ovw, re.I | re.M)
        if match:
            if match.group(1).find('Full Time') >= 0:
                job['JobType'] = 1
            else:
                job['JobType'] = 0
        #
        match = re.search(r"Min. Education Level : (.*?)</li><li>", ovw, re.I | re.M)
        if match:
            job['Eduacation'] = FmtSQLCharater(match.group(1))
        #
        match = re.search(r"Year of Exp Required : (.*?)</li><li>", ovw, re.I | re.M)
        if match:
            job['Exercise'] = match.group(1)
        #
        match = re.search(r"Skills : (.*?)</li><li>", ovw, re.I | re.M)
        if match:
            job['JobComputerSkill'] = FmtSQLCharater(match.group(1))
        #
        match = re.search(r"Language : (.*?)</li><li>", ovw, re.I | re.M)
        if match:
            job['ForeignLanguage'] = match.group(1)
        #
        match = re.search(r"Salary : (.*?)</span>", ovw, re.I | re.M)
        if match:
            job['Salary'] = match.group(1)
        job['Number'] = 'one person'
        #13-Jul-2015
        PostDate = datetime.strptime(js['pstdttme'], '%d-%b-%Y')
        job['PublishTime'] = PostDate
        job['RefreshTime'] = PostDate
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = job['Company']
        company['Industry'] = job['Industry']
        company['AreaName'] = 'Singapore'
        company['CompanyDesc'] = ''
        #
        match = re.search(r"Website:</strong> (.*)<br />", ovw, re.I | re.M)
        if match:
            company['CompanyUrl'] = match.group(1)
        #
        yield company
        yield job
    else:
        log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    """Parse a JobStreet job-detail page; yields a JobsDB_Company then a
    JobsDB_Job.  All fields are scraped by element id, plus two regexes
    against the raw HTML for the map coordinates.
    """
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        #页面解析
        #企业横幅 (company banner image)
        company_banner = first_item(hxs.xpath('//img[@id="company_banner"]/@data-original').extract())
        #企业logo
        company_logo = first_item(hxs.xpath('//img[@id="company_logo"]/@data-original').extract())
        #职位名称
        position_title = first_item(hxs.xpath('//h1[@id="position_title"]/text()').extract())
        position_title = FmtSQLCharater(position_title)
        #企业名称 — try the linked form first, then the plain-text form
        company_name = first_item(hxs.xpath('//h2[@id="company_name"]/a/text()').extract())
        if company_name == '':
            company_name = first_item(hxs.xpath('//h2[@id="company_name"]/text()').extract())
        company_name = company_name.replace('\n', '')
        company_name = company_name.replace('\t', '')
        company_name = company_name.lstrip(' ')
        company_name = company_name.rstrip(' ')
        company_name = FmtSQLCharater(company_name)
        if company_name == '':
            log.msg(u'企业名称为空,url=%s' % response.url)
            return
        #企业SrcUrl地址
        company_url = first_item(hxs.xpath('//h2[@id="company_name"]/a/@href').extract())
        #薪资
        salary = first_item(hxs.xpath('//div[@id="salary"]/p/a/text()').extract())
        #经验
        experience = first_item(hxs.xpath('//div[@id="experience"]/p[@id="years_of_experience"]/span[@id="years_of_experience"]/text()').extract())
        experience = experience.replace('\n', '')
        experience = experience.replace('\t', '')
        #Location
        location = first_item(hxs.xpath('//div[@id="location"]/p/span[@id="single_work_location"]/text()').extract())
        location = location.replace('\n', '')
        location = location.replace('\t', '')
        #职位描述(可能包含岗位职责、职位要求)
        job_desc = first_item(hxs.xpath('//div[@id="job_description"]').extract())
        #企业信息 (company profile fields)
        company_registration_number = first_item(hxs.xpath('//span[@id="company_registration_number"]/text()').extract())
        company_industry = first_item(hxs.xpath('//p[@id="company_industry"]/text()').extract())
        company_website = first_item(hxs.xpath('//a[@id="company_website"]/text()').extract())
        company_contact = first_item(hxs.xpath('//p[@id="company_contact"]/text()').extract())
        company_size = first_item(hxs.xpath('//p[@id="company_size"]/text()').extract())
        work_environment_working_hours = first_item(hxs.xpath('//p[@id="work_environment_working_hours"]/text()').extract())
        work_environment_dress_code = first_item(hxs.xpath('//p[@id="work_environment_dress_code"]/text()').extract())
        work_environment_benefits = first_item(hxs.xpath('//p[@id="work_environment_benefits"]/text()').extract())
        work_environment_spoken_language = first_item(hxs.xpath('//p[@id="work_environment_spoken_language"]/text()').extract())
        #gallery — ';'-joined thumbnail urls
        gallery = ''
        thumbs = hxs.xpath('//ul[@class="gallery-thumb"]/li')
        for item in thumbs:
            gallery += first_item(item.xpath('img/@data-original').extract()) + ';'
        #企业描述
        company_overview_all = first_item(hxs.xpath('//div[@id="company_overview_all"]').extract())
        #work location — "lat,lng" from the embedded map url.
        # NOTE(review): the pattern '¢er=' looks like mojibake of '&center='
        # ('&cent' rendered as '¢'); confirm against a live page before
        # changing it.
        match = re.search(r'¢er=(.*?)&', data, re.I | re.M)
        if match:
            gps_location = match.group(1)
            lat = gps_location.split(',')[0]
            lng = gps_location.split(',')[1]
        else:
            lat = '0.0'
            lng = '0.0'
        #
        address = first_item(hxs.xpath('//p[@id="address"]/text()').extract())
        address = FmtSQLCharater(address)
        #Advertised: 23-June-2015
        posting_date = first_item(hxs.xpath('//p[@id="posting_date"]/text()').extract())
        posting_date = posting_date.replace('Advertised:', '')
        posting_date = posting_date.replace(' ', '')
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        #http://jobs.jobstreet.com/sg/jobs/4712859?fr=J — slice out the numeric id
        job['LinkID'] = response.url[34:-5]
        job['JobTitle'] = position_title
        job['Company'] = company_name
        job['Industry'] = company_industry
        job['JobName'] = response.meta['name']
        job['JobDesc'] = FmtSQLCharater(job_desc)
        job['Salary'] = salary
        job['Exercise'] = experience
        job['JobType'] = 1
        job['SrcUrl'] = response.url
        job['SSWelfare'] = work_environment_benefits
        job['Number'] = 'one person'
        #时间格式化 — e.g. "23-June-2015" (full month name)
        PostDate = datetime.strptime(posting_date, '%d-%B-%Y')
        job['PublishTime'] = PostDate
        job['RefreshTime'] = PostDate
        # split "City - District" into CityName / WorkArea1
        # (`<>` is the Python-2 spelling of `!=`)
        if location <> '' and len(location.split('-')) > 1:
            job['CityName'] = location.split('-')[0].replace(' ', '')
            job['WorkArea1'] = location.split('-')[1].replace(' ', '')
        else:
            job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['ForeignLanguage'] = work_environment_spoken_language
        job['JobWorkTime'] = work_environment_working_hours
        job['GisLongitude'] = lng
        job['GisLatitude'] = lat
        job['JobAddress'] = address
        job['Mobile'] = company_contact
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = company_name
        company['Industry'] = company_industry
        company['CompanyScale'] = company_size
        company['CompanyAddress'] = address
        company['CompanyUrl'] = company_url
        company['WebSite'] = company_website
        company['CompanyLogoUrl'] = company_logo
        company['AreaName'] = job['CityName']
        company['CompanyDesc'] = FmtSQLCharater(company_overview_all)
        company['Mobile'] = company_contact
        company['GisLongitude'] = lng
        company['GisLatitude'] = lat
        company['OtherInfo'] = company_banner + '#' + gallery
        #
        yield company
        yield job
    else:
        log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    """Parse a Dangdang book-detail JSON response into a Product item.

    Expects ``response.body`` to be a JSON document with three sections:
    ``product_info_new`` (flat product attributes), ``product_desc``
    (HTML description fragments keyed by name) and ``product_desc_sorted``
    (a list of named description sections).  Yields exactly one Product,
    or returns early (with a log message) on an empty/non-JSON payload.
    """
    data = response.body
    # An empty body or the literal string "[]" means the upstream request
    # returned nothing useful.
    if data == '' or data == '[]':
        log.msg(format='%(request)s post fail.response is [].',
                level=log.ERROR,
                request=response.url)
        return
    try:
        js = json.loads(data)
    except:
        # Log message (Chinese): "book detail [...] response parse error,
        # not JSON. url=..."
        log.msg(u'图书详情[%s]请求结果解析异常,非json数据.url=%s' %
                (response.meta['p']['id'], response.url),
                level=log.INFO)
        return
    #
    pin = js['product_info_new']   # flat product attributes
    pd = js['product_desc']        # HTML description fragments
    pds = js['product_desc_sorted']  # ordered, named description sections
    p = Product()
    #
    # Copy the flat attributes onto the item.  Keys that the Product item
    # does not carry are skipped with `continue`; nested dicts are
    # flattened with a prefix; everything unrecognised falls through to
    # a direct assignment in the final `else`.
    for (key, value) in pin.iteritems():
        if key == 'mobile_exclusive_price':
            continue
        elif key == 'shop_id':
            continue
        elif key == 'product_name':
            p[key] = FmtSQLCharater(value)
        elif key == 'outlets':
            continue
        elif key == 'publish_info':
            # Nested dict -> publish_* fields; the author array is skipped
            # and only the author name is SQL-escaped.
            for (key1, value1) in value.iteritems():
                if key1 == 'author_arr':
                    continue
                else:
                    if key1 == 'author_name':
                        p['publish_' + key1] = FmtSQLCharater(value1)
                    else:
                        p['publish_' + key1] = value1
        elif key == 'promo_model':
            continue
        elif key == 'stock_info':
            p['stock_status'] = value['stock_status']
        elif key == 'category_info':
            continue
        elif key == 'comm_info':
            # Nested dict -> comm_* fields; the per-comment items are
            # handled elsewhere (review parser), so skip them here.
            for (key1, value1) in value.iteritems():
                if key1 == 'items':
                    continue
                else:
                    p['comm_' + key1] = value1
        elif key == 'total_review_count':
            continue
        elif key == 'abstract':
            p[key] = FmtSQLCharater(value)
        elif key == 'images':
            # Image URL lists are stored '#'-joined in a single field.
            p['images'] = '#'.join(value)
        elif key == 'images_big':
            p['images_big'] = '#'.join(value)
        elif key == 'stars':
            p['stars_full_star'] = value['full_star']
            p['stars_has_half_star'] = value['has_half_star']
        elif key == 'ebook_info':
            p['ebook_read_ebook_at_h5'] = value['read_ebook_at_h5']
            p['ebook_is_client_buy'] = value['is_client_buy']
        elif key == 'is_yb_product':
            continue
        elif key == 'is_show_arrive':
            continue
        elif key == 'share_url':
            continue
        elif key == 'spuinfo':
            # spuinfo may come back as an empty string instead of a dict.
            if value != '':
                p['spuinfo_num'] = value['num']
                p['spuinfo_spus_id'] = value['spus_id']
        elif key == 'bd_promo_price':
            continue
        elif key == 'template_id':
            continue
        elif key == 'bang_rank':
            p['bang_rank_word'] = value['word']
            p['bang_rank_path_name'] = value['path_name']
            p['bang_rank_rank'] = value['rank']
            p['bang_rank_catPath'] = value['catPath']
        elif key == 'same_cate_product':
            continue
        elif key == 'show_dangdangsale':
            continue
        elif key == 'in_wishlist':
            continue
        elif key == 'page_template':
            continue
        elif key == 'platform_banner':
            continue
        else:
            # Any other key is assumed to map 1:1 onto a Product field.
            p[key] = value
    #
    # Description fragments: stored verbatim (SQL-escaped); the
    # 'beautiful_image' fragment is additionally parsed for its <img>
    # sources, which are stored '#'-joined.
    for (key, value) in pd.iteritems():
        p[key] = FmtSQLCharater(value)
        if key == 'beautiful_image':
            hxs = Selector(None, value)
            images = hxs.xpath('//body/img/@src').extract()
            p['beautiful_image_list'] = '#'.join(images)
    #
    # Named description sections (Chinese section titles):
    # 推荐语 = recommendation, 简介 = brief introduction,
    # 出版信息 = publishing info (skipped), 更多 = more information.
    for item in pds:
        if item['name'] == u'推荐语':
            p['recommendation'] = FmtSQLCharater(item['content'])
        elif item['name'] == u'简介':
            p['brief_introduction'] = FmtSQLCharater(item['content'])
        #elif item['name'] == u'目录':
        #    p['catalog'] = item['content']
        elif item['name'] == u'出版信息':
            continue
        elif item['name'] == u'更多':
            p['more_information'] = FmtSQLCharater(item['content'])
    #
    yield p
def parse_job(self, response): data = response.body js = self._fmt_json(data) if js and js.has_key('JobAdDetails'): jd = js['JobAdDetails'][0] if jd: job = JobsDB_Job() job['SiteID'] = self.site_id job['LinkID'] = jd['Id'] job['JobTitle'] = jd['JobTitle'] job['Company'] = jd['Company'] job['Industry'] = jd['Industry'] for func in jd['JobFunction']: job['JobName'] += func + '#' job['JobDesc'] = FmtSQLCharater(jd['JobDesc']) job['Salary'] = jd['Salary'] if jd['SalaryLow'] <> 'Hidden' and jd[ 'SalaryLow'] <> 'Not Provided': job['SalaryMin'] = float(jd['SalaryLow'].replace( ',', '').replace('+', '')) if jd['SalaryUp'] <> 'Hidden' and jd[ 'SalaryUp'] <> 'Not Provided': job['SalaryMax'] = float(jd['SalaryUp'].replace( ',', '').replace('+', '')) ''' if jd['SalaryUnit'] <> 'Hidden': job['SalaryType'] = jd['SalaryUnit'] ''' job['Eduacation'] = jd['Qualification'] CareerLevel = jd['CareerLevel'] job['Exercise'] = jd['WorkExperience'] EmploymentTerm = jd['EmploymentTerm'] job['JobTypeName'] = EmploymentTerm if EmploymentTerm.find('Full Time') >= 0: job['JobType'] = 1 elif EmploymentTerm.find('Part Time') >= 0: job['JobType'] = 2 elif EmploymentTerm.find('Permanent') >= 0: job['JobType'] = 3 elif EmploymentTerm.find('Temporary') >= 0: job['JobType'] = 4 elif EmploymentTerm.find('Contract') >= 0: job['JobType'] = 5 elif EmploymentTerm.find('Internship') >= 0: job['JobType'] = 6 elif EmploymentTerm.find('Freelance') >= 0: job['JobType'] = 7 elif EmploymentTerm.find('Contract-to-Perm') >= 0: job['JobType'] = 8 elif EmploymentTerm.find('Temp-to-Perm') >= 0: job['JobType'] = 9 if js.has_key('DesktopSiteURL'): job['SrcUrl'] = js['DesktopSiteURL'] Benefits = '' for bf in jd['BenefitId']: Benefits += str(bf) + ';' ''' if bf == 5: Benefits += 'Double pay;' elif bf == 7: Benefits += 'Free shuttle bus;' elif bf == 1: Benefits += 'Performance bonus;' elif bf == 14: Benefits += 'Dental insurance;' elif bf == 4: Benefits += 'Overtime pay;' elif bf == 10: Benefits += 'Five-day work week;' 
elif bf == 8: Benefits += 'Medical insurance;' ''' job['SSWelfare'] = Benefits #IsExpired = jd['IsExpired'] #Summary1 = jd['Summary1'] #Summary2 = jd['Summary2'] #Summary3 = jd['Summary3'] job['Number'] = 'one person' PostDate = jd['PostDate'].replace('T', ' ') PostDate = PostDate.replace('+08:00', '') PostDate = datetime.strptime(PostDate, '%Y-%m-%d %H:%M:%S') job['PublishTime'] = PostDate job['RefreshTime'] = PostDate job['CityName'] = 'Singapore' job['WorkArea'] = 'Singapore' Location = jd['Location'] #Downtown Core, CBD (Central Area) if Location <> 'No Fixed Location': if Location.find(',') > 0: job['WorkArea1'] = Location.split(',')[0] job['WorkArea2'] = Location.split(',')[1] else: job['WorkArea1'] = Location # ''' company = JobsDB_Company() company['WebSiteID'] = self.site_id company['CompanyName'] = jd['Company'] company['Industry'] = jd['Industry'] company['AreaName'] = 'Singapore' company['CompanyDesc'] = FmtSQLCharater(jd['CompanyDesc']) if js.has_key('CompanyLogo'): company['CompanyLogoUrl'] = jd['CompanyLogo'] #OmnitureJobAdFuncIds = js['OmnitureJobAdFuncIds'] #17|32|128|267 #OmnitureLocationId = jd['OmnitureLocationId'] #1297 #AdType = jd['AdType'] yield company ''' yield job else: log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url, level=log.INFO)
def parse_info(self, response):
    """Parse an Amazon book-detail HTML page into a Book item.

    Reads the asin from ``response.meta``, scrapes title/author/price/
    details from the page's left/center/right columns, and extracts the
    escaped iframe description payload.  Yields exactly one Book.
    """
    data = response.body
    if data == '':
        log.msg(format='%(request)s post fail.response is empty.',
                level=log.ERROR,
                request=response.url)
        return
    #
    """
    root = response.meta['root']
    leaf = response.meta['leaf']
    age = response.meta['age']
    star = response.meta['star']
    """
    asin = response.meta['asin']
    #
    hxs = Selector(None, data)
    #
    container = hxs.xpath("//div[@class='a-container']")
    right = container.xpath("div[@id='rightCol']")
    left = container.xpath("div[@id='leftCol']")
    center = container.xpath("div[@id='centerCol']")
    #
    log.msg('Book--')
    b = Book()
    b['product_id'] = asin
    b['product_name'] = FmtSQLCharater(
        first_item(
            center.xpath(
                "div[@id='booksTitle']/div/h1[@id='title']/span[@id='productTitle']/text()"
            ).extract()))
    b['subname'] = b['product_name']
    b['publish_paper_quality'] = FmtSQLCharater(
        first_item(
            center.xpath(
                "div[@id='booksTitle']/div/h1[@id='title']/span[2]/text()"
            ).extract()))
    author = center.xpath("div[@id='booksTitle']/div[@id='byline']")
    # FIX: SelectorList.extract() returns a list; concatenating it to a
    # str raised TypeError. Log the first extracted fragment instead.
    log.msg('author html:' + first_item(author.extract()))
    b['publish_author_name'] = FmtSQLCharater(
        first_item(author.xpath('string(.)').extract()))
    b['publish_author_name'] = b['publish_author_name'].replace(
        '\n', '').replace('\t', '').replace(' ', '')
    # FIX: the original queried a *relative* path from the document root
    # ("div[@id=...]"), which can never match; use an absolute //div path
    # like every other document-level query in this method.
    b['abstract'] = FmtSQLCharater(
        first_item(
            hxs.xpath(
                "//div[@id='bookDescription_feature_div']/noscript/text()"
            ).extract()))
    images = left.xpath(
        "div[@id='booksImageBlock_feature_div']/div[@id='imageBlockOuter']/div[@id='imageBlockThumbs']/span/div/img/@src"
    ).extract()
    # Rewrite the thumbnail size tokens to their large-image equivalents.
    bigImages = map(
        lambda x: x.replace(
            '_AC_SY60_CR,0,0,60,60_', '_SY498_BO1,204,203,200_').replace(
                '_AC_SX60_CR,0,0,60,60_', '_SX443_BO1,204,203,200_'),
        images)
    b['images'] = '#'.join(images)
    b['images_big'] = '#'.join(bigImages)
    #
    buybox = right.xpath(
        "div[@id='buybox_feature_div']/div[@id='combinedBuyBox']/form[@id='addToCart']/div[@id='buybox']/div/div[@class='a-box-inner']/div"
    )
    b['sale_price'] = FmtSQLCharater(
        first_item(
            buybox.xpath(
                "//*[@id='a-autoid-5-announce']/span[2]/span").extract()))
    b['discount'] = FmtSQLCharater(
        first_item(
            buybox.xpath(
                "div[@id='buyNewSection']/div/div[@id='soldByThirdParty']/span[2]/text()"
            ).extract()))
    b['original_price'] = FmtSQLCharater(
        first_item(
            buybox.xpath(
                "//*[@id='a-autoid-4-announce']/span[2]").extract()))
    # Strip currency symbols / decoration from the price strings.
    b['sale_price'] = b['sale_price'].replace('¥', '')
    b['discount'] = b['discount'].replace(' (', '').replace(u'折) ', '')
    b['original_price'] = b['original_price'].replace(u'¥', '')
    # Product-details bullet list (publisher, ISBN, dimensions, ...).
    bullets = hxs.xpath(
        "//div[@id='productDetails']/table/tr/td[@class='bucket']/div[@class='content']/ul/li"
    )
    for li in bullets:
        log.msg('Book-base-info')
        if li.xpath(u"b[contains(text(), 'Publisher')]"):
            publisher = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
            # e.g. "未来出版社; 第1版 (2011年11月1日)"
            #      -> publisher / edition number / publish date
            match = re.search(u'(.+); 第(.+)版 \((.+)\)', publisher,
                              re.I | re.M)
            if match:
                b['publish_publisher'] = match.group(1)
                b['publish_version_num'] = match.group(2)
                b['publish_publish_date'] = match.group(3)
        elif li.xpath(u"b[contains(text(), 'Series')]"):
            b['product_name'] = FmtSQLCharater(
                first_item(li.xpath("a/text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Paperback')]"):
            b['publish_paper_quality'] = u'Paperback'
            b['publish_number_of_pages'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Hardcover')]"):
            b['publish_paper_quality'] = u'Hardcover'
            b['publish_number_of_pages'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '纸板书')]"):
            # 纸板书 = board book
            b['publish_paper_quality'] = u'纸板书'
            b['publish_number_of_pages'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Age Range')]"):
            b['age'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'Language')]"):
            b['publish_subtitle_language'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), '开本')]"):
            # 开本 = format / book size
            b['publish_product_size'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        elif li.xpath(u"b[contains(text(), 'ISBN-13')]"):
            b['publish_standard_id'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).lstrip())
        #elif li.xpath(u"b[contains(text(), '条形码')]"):
        #    b['publish_barcode'] = first_item(li.xpath("text()").extract()).lstrip()
        elif li.xpath(u"b[contains(text(), 'Product Dimensions')]"):
            b['publish_product_size2'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).replace(
                    '\n', '').lstrip().rstrip())
        elif li.xpath(u"b[contains(text(), 'Shipping Weight')]"):
            b['publish_product_weight'] = FmtSQLCharater(
                first_item(li.xpath("text()").extract()).replace(
                    '\n', '').lstrip().rstrip())
        #elif li.xpath(u"b[contains(text(), '品牌')]"):
        #    b['brand'] = first_item(li.xpath("text()").extract()).lstrip()
    # The long description is embedded as an escaped JS string
    # ("var iframeContent = ...") that must be sliced out and unquoted.
    begin = data.find('var iframeContent =')
    end = data.find('obj.onloadCallback = onloadCallback;')
    # FIX: str.find returns -1 (truthy!) when the marker is missing and
    # 0 (falsy!) at offset zero, so the old `if begin and end:` test was
    # wrong on both edges; compare against -1 explicitly.
    if begin != -1 and end != -1:
        desc = data[begin + 21:end - 10]
        desc = urllib2.unquote(desc)
        hxs = Selector(None, desc)
        # Section headers (Chinese): 编辑推荐 = editorial recommendation,
        # 目录 = table of contents, 文摘 = excerpt.
        b['recommendation'] = first_item(
            hxs.xpath(
                u"//div[@class='content']/h3[contains(text(), '编辑推荐')]/following-sibling::div[1]/text()"
            ).extract())
        b['catalog'] = first_item(
            hxs.xpath(
                u"//div[@class='content']/h3[contains(text(), '目录')]/following-sibling::div[1]/text()"
            ).extract())
        b['more_information'] = first_item(
            hxs.xpath(
                u"//div[@class='content']/h3[contains(text(), '文摘')]/following-sibling::div[1]/text()"
            ).extract())
    #
    yield b
def parse_job(self, response):
    """Parse a 51job XML job-detail response into a WebJob, then request
    the company page.

    The body is an XML document whose ``result`` element is '1' on
    success and whose ``resultbody`` element carries the job fields.
    Ends by yielding a Request for the company detail page
    (``self.co_url``) with the partially-filled WebJob in meta, to be
    completed by ``self.parse_co``.
    """
    try:
        msg = ET.fromstring(response.body)
    except BaseException as e:
        # Log message (Chinese): "job category <..>, job id <..>
        # returned an invalid result!"
        log.msg(u'职位类别<{}>,职位编号<{}>返回结果非法!'.format(response.meta['funcName'], response.meta['jobid']), level = log.ERROR)
        return
    #
    func = response.meta['func']          # numeric job-function code
    funcName = response.meta['funcName']  # job-function display name
    jobid = response.meta['jobid']
    #
    # '1' marks a successful lookup.
    if msg.find('result').text == '1':
        item = msg.find('resultbody')
        webJob = WebJob()
        webJob['SiteID'] = self.site
        webJob['JobTitle'] = FmtSQLCharater(text_(item.find('jobname')))
        webJob['Company'] = FmtCmpNameCharacter(text_(item.find('coname')))
        webJob['PublishTime'] = FmtAnnounceDateToDateTime(text_(item.find('issuedate')), '-')[0]
        webJob['RefreshTime'] = webJob['PublishTime']
        webJob['JobType'] = 1
        webJob['SalaryType'] = 0
        webJob['Salary'] = text_(item.find('providesalary'))
        webJob['Eduacation'] = text_(item.find('degree'))
        webJob['Number'] = text_(item.find('jobnum'))
        webJob['Exercise'] = text_(item.find('workyear'))
        webJob['SSWelfare'] = text_(item.find('welfare'))
        webJob['SBWelfare'] = text_(item.find('jobtag'))
        webJob['LinkID'] = jobid
        webJob['JobCode'] = str(int(func))
        webJob['JobName'] = funcName
        webJob['Sex'] = u'不限'
        # Require format (Chinese): "hiring N|education X|experience Y|gender Z"
        webJob['Require'] = u'招%s人|学历%s|经验%s|性别%s' % (webJob['Number'], webJob['Eduacation'], webJob['Exercise'], webJob['Sex'])
        # jobarea looks like "city[-district[-sub-district]]".
        jobarea = text_(item.find('jobarea')).split('-')
        CityName = jobarea[0]
        webJob['CityName'] = CityName
        webJob['WorkArea'] = CityName
        webJob['WorkArea1'] = ''
        webJob['WorkArea2'] = ''
        if len(jobarea) > 1:
            webJob['WorkArea1'] = jobarea[1]
        if len(jobarea) > 2:
            webJob['WorkArea2'] = jobarea[2]
        webJob['AreaCode'] = FmtAreaCodeSimple('remote_252_1', CityName)
        webJob['JobAddress'] = FmtSQLCharater(item.find('address').text)
        coid = item.find('coid').text
        webJob['CompanyLink'] = 'wuyao_' + coid
        # Default the remaining bookkeeping fields before handing the
        # item on to the company-page callback.
        webJob['SyncStatus'] = 0
        webJob['AnFmtID'] = 0
        webJob['KeyValue'] = ''
        webJob['ClickTimes'] = 0
        # NOTE(review): this overwrites the 'jobtag' value stored in
        # 'SBWelfare' above with '' — confirm whether that is intended.
        webJob['SBWelfare'] = ''
        webJob['OtherWelfare'] = ''
        webJob['Relation'] = ''
        webJob['Mobile'] = ''
        webJob['Email'] = ''
        webJob['Tag'] = ''
        webJob['ProvinceName'] = ''
        webJob['Telphone1'] = ''
        webJob['Telphone2'] = ''
        webJob['Age'] = 0
        webJob['ValidDate'] = ''
        webJob['ParentName'] = ''
        webJob['EduacationValue'] = 0
        webJob['SalaryMin'] = 0.0
        webJob['SalaryMax'] = 0.0
        webJob['NumberValue'] = 0
        webJob['SexValue'] = 0
        webJob['OperStatus'] = 0
        webJob['PropertyTag'] = ''
        webJob['SalaryValue'] = 0
        webJob['ExerciseValue'] = 0
        webJob['Valid'] = 'T'
        webJob['JobWorkTime'] = ''
        webJob['JobComputerSkill'] = ''
        webJob['ForeignLanguage'] = ''
        webJob['JobFunction'] = ''
        webJob['JobRequest'] = ''
        webJob['BusinessCode'] = ''
        webJob['InsertTime'] = datetime.datetime.today()
        webJob['LastModifyTime'] = datetime.datetime.today()
        # replace '\' characters
        webJob['SrcUrl'] = text_(item.find('share_url'))
        webJob['GisLongitude'] = text_(item.find('joblon'))
        webJob['GisLatitude'] = text_(item.find('joblat'))
        webJob['JobDesc'] = text_(item.find('jobinfo'))
        webJob['CompanyType'] = text_(item.find('cotype'))
        webJob['CompanyScale'] = text_(item.find('cosize'))
        # Chase the company-detail page; parse_co fills in the rest.
        link_url = self.co_url.replace('<?coid?>', coid)
        link_url = self.create_url(link_url)
        yield Request(link_url, meta = {'coid': coid, 'job': webJob}, callback = self.parse_co)
def parse_info(self, response):
    """Parse a job-detail HTML page; yield a company item then a job item.

    Structured fields are read from schema.org ``itemprop`` markup; the
    contact block (contact/address/phone/email/website) is scraped with
    regexes against the raw HTML.  Non-200 responses are logged and
    produce nothing.
    """
    if response.status == 200:
        data = response.body
        hxs = Selector(response)
        # -- structured (itemprop) fields --
        title = first_item(
            hxs.xpath('//h1[@itemprop="title"]/text()').extract())
        salary = first_item(
            hxs.xpath('//span[@itemprop="baseSalary"]/text()').extract())
        location = first_item(
            hxs.xpath('//span[@itemprop="address"]/text()').extract())
        jobtype = first_item(
            hxs.xpath(
                '//span[@itemprop="employmentType"]/text()').extract())
        companyname = first_item(
            hxs.xpath('//span[@itemprop="name"]/text()').extract())
        postdate = first_item(
            hxs.xpath('//span[@itemprop="datePosted"]/text()').extract())
        jobdesc = first_item(
            hxs.xpath('//section[@class="description"]/div[@class="well"]'
                      ).extract())
        logourl = first_item(
            hxs.xpath(
                '//section[@class="brandInfo"]/div[@class="well"]/h2/img/@src'
            ).extract())
        if logourl != '':
            logourl = self.create_url(logourl)
        # -- contact block, scraped from raw HTML --
        # NOTE: 'contact' is extracted but not stored on either item.
        match = re.search(r'<label>Contact:</label>\s*(.+)</li>', data,
                          re.I | re.M)
        if match:
            contact = match.group(1)
        else:
            contact = ''
        #
        match = re.search(r'<label>Address:</label>\s*(.+)</li>', data,
                          re.I | re.M)
        if match:
            address = match.group(1)
        else:
            address = ''
        #
        match = re.search(r'<label>Phone:</label>\s*(.+)</li>', data,
                          re.I | re.M)
        if match:
            phone = match.group(1)
        else:
            phone = ''
        #
        match = re.search(r'<label>Email:</label>\s*(.+)</li>', data,
                          re.I | re.M)
        if match:
            email = match.group(1)
        else:
            email = ''
        #
        match = re.search(r'<label>Website:</label>\s*<a href="(.+)" ',
                          data, re.I | re.M)
        if match:
            website = match.group(1)
        else:
            website = ''
        title = FmtSQLCharater(title)
        companyname = FmtSQLCharater(companyname)
        location = FmtSQLCharater(location)
        address = FmtSQLCharater(address)
        #
        job = JobsDB_Job()
        job['SiteID'] = self.site_id
        # LinkID is the numeric id embedded in the URL, e.g. ".id4712859?".
        match = re.search(r'\.id(.+)\?', response.url, re.I | re.M)
        if match:
            job['LinkID'] = str(int(match.group(1)))
        job['JobTitle'] = title
        job['Company'] = companyname
        job['JobName'] = response.meta['sector']
        job['JobDesc'] = FmtSQLCharater(jobdesc)
        job['Salary'] = salary
        # FIX: str.find returns 0 when 'Full time' starts the string, so
        # the old '> 0' test misclassified that case as not-full-time.
        if jobtype.find('Full time') >= 0:
            job['JobType'] = 1
        else:
            job['JobType'] = 0
        job['SrcUrl'] = response.url
        job['Number'] = 'one person'
        # Normalise the post date; fall back to "now" when missing.
        if postdate == '':
            postdate = datetime.today()
        else:
            postdate = datetime.strptime(postdate, '%d %b %y')
        job['PublishTime'] = postdate
        job['RefreshTime'] = postdate
        job['CityName'] = location
        job['WorkArea'] = job['CityName']
        job['JobAddress'] = address
        job['Mobile'] = phone
        #
        company = JobsDB_Company()
        company['WebSiteID'] = self.site_id
        company['CompanyName'] = companyname
        company['CompanyAddress'] = address
        company['WebSite'] = website
        company['CompanyLogoUrl'] = logourl
        company['AreaName'] = job['CityName']
        company['Mobile'] = phone
        #
        yield company
        yield job
    else:
        # Log message (Chinese): "job detail response parse error. url=..."
        log.msg(u'职位详情请求结果解析异常.url=%s' % response.url,
                level=log.INFO)