Example #1
 def parse_review(self, response):
     hxs = Selector(response)
     asin = response.meta['asin']
     title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
     title = title.replace(u'Amazon.com: Customer Reviews: ', '')
     rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
     for div in rlist:
         r = Review()
         r['product_id'] = asin
         r['product_name'] = title
         r['review_id'] = first_item(div.xpath('@id').extract())
         votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
         match = re.search(u'(.+) people found this helpful', votes, re.I)
         if match:
             r['total_feedback_num'] = match.group(1)
             # the pattern captures only one count (group(2) does not exist), so reuse it here
             r['total_helpful_num'] = match.group(1)
         #
         r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
         r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
         r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
         r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
         #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace('年', '/').replace(u'月', '/').replace(u'日', '/')
         r['body'] = first_item(div.xpath("div[5]/span").extract())
         yield r
     # next page
     if len(rlist) == 10:
         page = response.meta['page'] + 1
         log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
         yield Request(
             url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
             callback=self.parse_review,
             headers=self.headers,
             meta={'page': page, 'asin': asin}
         )
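
Note: most of the examples on this page call two helpers whose implementations are not shown here: FmtSQLCharater, which sanitizes a string before it is embedded in SQL, and (in the HTML-parsing examples) first_item, which safely returns the first element of an extracted list. The sketch below is only an illustration of what such helpers might look like, not the projects' actual code.

def first_item(seq, default=''):
    # return the first element of an extracted list, or a default when the list is empty
    return seq[0] if seq else default

def FmtSQLCharater(text):
    # illustrative only: escape characters that would break a single-quoted SQL literal
    if text is None:
        return ''
    return text.replace('\\', '\\\\').replace("'", "''")
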
Example #2
 def parse_review(self, response):
     data = response.body
     if data == '':
         log.msg(format='%(request)s post fail.response is empty.',
                 level=log.ERROR,
                 request=response.url)
         return
     try:
         data = data.decode('GBK', 'ignore')
         js = json.loads(data)
     except:
         log.msg(u'图书[%s]评论请求结果解析异常,非json数据.url=%s' %
                 (response.meta['sku'], response.url),
                 level=log.INFO)
         return
     for item in js['comments']:
         r = Review()
         r['product_id'] = item['referenceId']
         r['product_name'] = item['referenceName']
         r['review_id'] = item['id']
         r['title'] = item['title'] if item.has_key('title') else ''
         r['body'] = FmtSQLCharater(item['content'])
         r['creation_date'] = item['creationTime']
         r['score'] = item['score']
         r['cust_name'] = FmtSQLCharater(item['nickname'])
         r['cust_lev'] = item['userLevelName']
         r['cust_level_simple'] = item['userLevelId']
         r['cust_img'] = item['userImageUrl']
         r['comment_tags'] = '#'.join(
             map(lambda x: x['name'], item['commentTags'])) if item.has_key(
                 'commentTags') else ''
         r['images'] = '#'.join(
             map(lambda x: x['imgUrl'],
                 item['images'])) if item.has_key('images') else ''
         r['total_feedback_num'] = item['replyCount']
         r['total_helpful_num'] = item['usefulVoteCount']
         yield r
     # next page
     if len(js['comments']) == 10:
         sku = response.meta['sku']
         page = response.meta['page'] + 1
         log.msg(u'请求商品[%s]的第[%d]页评论...' % (sku, page))
         yield Request(url=self.review_url.replace('<?sku?>', sku).replace(
             '<?page?>', str(page)),
                       callback=self.parse_review,
                       headers=self.headers,
                       meta={
                           'page': page,
                           'sku': sku
                       })
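
Examples 1 and 2 paginate by substituting placeholders such as <?asin?>/<?sku?> and <?page?> into a URL template, stopping once a page returns fewer than 10 reviews. A minimal sketch of that pattern, with a made-up URL and a hypothetical class, assuming the template is declared as a spider attribute:

class ReviewPagingSketch(object):
    # hypothetical template; the real review_url is defined elsewhere in each spider
    review_url = 'http://example.com/reviews?sku=<?sku?>&page=<?page?>'

    def next_page_url(self, sku, page):
        # textual placeholder substitution, exactly as the examples above do inline
        return self.review_url.replace('<?sku?>', sku).replace('<?page?>', str(page))
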
Example #3
    def parse_desc(self, response):
        data = response.body
        data = data.decode('GBK', 'ignore')
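        # strip what appears to be a JSONP wrapper (callback name plus trailing ')') before parsing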
        data = data[9:-1]
        try:
            js = json.loads(data)
        except:
            log.msg(u'图书[%s]描述请求结果解析异常,非json数据.url=%s' %
                    (response.meta['b']['product_id'], response.url),
                    level=log.INFO)
            return
        b = response.meta['b']
        hxs = Selector(None, js['content'])
        b['product_features'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-1']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['abstract'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-2']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['recommendation'] = b['abstract']
        b['content'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-3']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['brief_introduction'] = b['content']
        b['authorintro'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-4']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['extract'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-5']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['catalog'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-6']/div[2]/div[@class='book-detail-content']"
            ).extract())
        b['more_information'] = first_item(
            hxs.xpath(
                "//div[@id='detail-tag-id-8']/div[2]/div[@class='book-detail-content']"
            ).extract())
        #
        b['abstract'] = FmtSQLCharater(b['abstract'])
        b['catalog'] = FmtSQLCharater(b['catalog'])
        b['recommendation'] = FmtSQLCharater(b['recommendation'])
        b['content'] = FmtSQLCharater(b['content'])
        b['brief_introduction'] = FmtSQLCharater(b['brief_introduction'])
        b['authorintro'] = FmtSQLCharater(b['authorintro'])
        b['extract'] = FmtSQLCharater(b['extract'])
        b['more_information'] = FmtSQLCharater(b['more_information'])

        log.msg(u'请求商品[%s]的价格信息...' % b['product_id'])

        yield Request(url=self.price_url.replace('<?sku?>', b['product_id']),
                      callback=self.parse_price,
                      headers=self.headers,
                      meta={'b': b})
Example #4
 def parse_review(self, response):
     data = response.body
     if data == '' or data == '[]':
         log.msg(format='%(request)s post fail.response is [].',
                 level=log.ERROR,
                 request=response.url)
         return
     try:
         js = json.loads(data)
     except:
         log.msg(
             u'图书[%s]评论页码[%d]请求结果解析异常,非json数据.url=%s' %
             (response.meta['pid'], response.meta['page'], response.url),
             level=log.INFO)
         return
     if js.has_key('review_list') and js['review_list'] is not None:
         log.msg(u'评论请求职位ID[%s]的第%d页,总数=%d' %
                 (response.meta['pid'], response.meta['page'],
                  len(js['review_list'])))
         for review in js['review_list']:
             r = Review()
             r['product_name'] = FmtSQLCharater(js['product']['name'])
             for (key, value) in review.iteritems():
                 if key == 'stars':
                     r['full_star'] = value['full_star']
                     r['has_half_star'] = value['has_half_star']
                 elif key in ('experience_ids', 'point_items'):
                     continue
                 elif key in ('body', 'title'):
                     r[key] = FmtSQLCharater(value)
                 else:
                     r[key] = value
             yield r
         # next page
         if js['pageinfo'].has_key('next'):
             pid = response.meta['pid']
             page = js['pageinfo']['next']
             log.msg(u'评论请求职位ID[%s]的第%d页' % (pid, page))
             yield Request(url=self.review_url.replace('<?pid?>',
                                                       pid).replace(
                                                           '<?page?>',
                                                           str(page)),
                           callback=self.parse_review,
                           headers=self.headers,
                           meta={
                               'page': page,
                               'pid': pid
                           })
Example #5
    def process_item(self, item, spider):
        item_name = item.__class__.__name__
        commandtext = ''
        # school
        if item_name == 'DuoXSchool':
            commandtext = '''insert ignore into duox_school(id, fullName, companyName, mainCourse, province,
city, zone, lat, lng, address, business, contact, phone, discription,
brandAward, brandHistory, brandSchoolCount, brandStudentCount,
envArea, envFacilities, envFitment, envHealth, envPantry, envParentRest, envType,
serviceDetail, teacherAge, teacherCount, teacherQualifier, schoolImage, imageTurn)
values(%s, '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s')''' % (item['id'], item['fullName'], item['companyName'], item['mainCourse'], item['province'],
                item['city'], item['zone'], item['lat'], item['lng'], item['address'], item['business'], item['contact'], item['phone'], FmtSQLCharater(item['discription']),
                item['brandAward'], item['brandHistory'], item['brandSchoolCount'], item['brandStudentCount'],
                item['envArea'], item['envFacilities'], item['envFitment'], item['envHealth'], item['envPantry'], item['envParentRest'], item['envType'],
                item['serviceDetail'], item['teacherAge'], item['teacherCount'], item['teacherQualifier'], item['schoolImage'], item['imageTurn'])
        # teacher
        elif item_name == 'DuoXTeacher':
            commandtext = "insert ignore into duox_teacher(s_id, id, teacherName, image) values(%s, %s, '%s', '%s')" % (item['s_id'], item['id'], item['teacherName'], item['image'])
        # course
        elif item_name == 'DuoXCourse':
            commandtext = '''insert ignore into duox_course(s_id, id, province, city, zone, schoolFullName,
courseName, lat, lng, typeName1, typeName2, ageStart, ageEnd, perPrice, packagePrice, needBook,
studentCount, courseImage, discount, address, business, courseDes, imageTurn, priceList)
values(%s, %s, '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s',
'%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')''' % (item['s_id'], item['id'], item['province'], item['city'], item['zone'], item['schoolFullName'],
                FmtSQLCharater(item['courseName']), item['lat'], item['lng'], item['typeName1'], item['typeName2'], item['ageStart'], item['ageEnd'], item['perPrice'], item['packagePrice'], item['needBook'],
                item['studentCount'], item['courseImage'], item['discount'], item['address'], item['business'], FmtSQLCharater(item['courseDes']), item['imageTurn'], item['priceList'])
        # comment
        elif item_name == 'DuoXComment':
            commandtext = '''insert ignore into duox_comment(s_id, s_name, id, commentText, commentType,
contactName, contactPhone, createTime, typeId, typeName) values(%s, '%s', %d, '%s', %s,
'%s', '%s', %s, %s, '%s')''' % (item['s_id'], item['s_name'], item['id'], FmtSQLCharater(item['commentText']), item['commentType'],
                item['contactName'], item['contactPhone'], item['createTime'], item['typeId'], FmtSQLCharater(item['typeName']))
        #
        if commandtext != '':
            ret = TQDbPool.execute('GSX', commandtext)
            if ret is None:
                log.msg(u'sql指令执行失败,出现异常:dbname=%s,commamd=%s' % ('GSX', commandtext), level=log.ERROR)
            elif ret == -2:
                log.msg(u'数据库连接失败导致执行失败:dbname=%s,commamd=%s' % ('GSX', commandtext), level=log.ERROR)
        return item
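
The pipeline above interpolates values directly into its INSERT statements with %, and only some fields pass through FmtSQLCharater. A safer alternative is to let the database driver bind the values; the sketch below assumes a plain DB-API connection (e.g. PyMySQL) rather than the project's TQDbPool helper, and is not the pipeline's actual code.

def insert_teacher(conn, item):
    # parameterized insert: the driver escapes each value, so no manual FmtSQLCharater call is needed
    sql = ("insert ignore into duox_teacher(s_id, id, teacherName, image) "
           "values (%s, %s, %s, %s)")
    cur = conn.cursor()
    cur.execute(sql, (item['s_id'], item['id'], item['teacherName'], item['image']))
    conn.commit()
    cur.close()
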
Example #6
 def parse_co(self, response):
     try:
         msg = ET.fromstring(response.body)
     except BaseException as e:
         log.msg(u'企业编号<{}>返回结果非法!'.format(response.meta['coid']), level = log.ERROR)
         return
         #
     job = response.meta['job']
     if text_(msg.find('result')) == '1':
         item = msg.find('resultbody')
         co = Company()
         co['SiteID'] = self.site
         co['company_id'] = text_(item.find('coid'))
         co['CompanyName'] = FmtCmpNameCharacter(text_(item.find('coname')))
         co['Industry'] = ''
         if item.find('indtype1').text:
             co['Industry'] = text_(item.find('indtype1'))
         if item.find('indtype2').text:
             co['Industry'] += ',' + text_(item.find('indtype2'))
         co['CompanyType'] = text_(item.find('cotype'))
         co['CompanyScale'] = text_(item.find('cosize'))
         co['CompanyAddress'] = FmtSQLCharater(text_(item.find('caddr')))
         co['CompanyDesc'] = FmtSQLCharater(text_(item.find('coinfo')))
         co['CompanyUrl'] = text_(item.find('cojumpurl'))
         co['CompanyLogoUrl'] = text_(item.find('logo'))
         co['GisLongitude'] = text_(item.find('lon'))
         co['GisLatitude'] = text_(item.find('lat'))
         co['CityName'] = job['CityName']
         co['AreaCode'] = job['AreaCode']
         co['Relation'] = ''
         co['Mobile'] = ''
         co['Credibility'] = ''
         co['Licensed'] = ''
         co['Yan'] = ''
         co['FangXin'] = ''
         co['Email'] = ''
         co['PraiseRate'] = '0'
         co['UserId'] = ''
         co['UserName'] = ''
         co['ProvinceName'] = ''
         co['WorkArea1'] = ''
         co['AreaCode1'] = ''
         yield co
     yield job
Example #7
 def parse_info_list(self, resp):
     meta = resp.meta
     log.msg(u'已下载<%s>第<%d>页,开始解析...' % (meta['name'], meta['page']),
             log.INFO)
     ret = json.loads(resp.body)
     list = ret['list']
     log.msg(u'<%s>第<%d>页数量<%d>' % (meta['name'], meta['page'], len(list)))
     for item in list:
         fieldNames = item.keys()
         #
         topic = Topic()
         topicFields = topic.fields.keys()
         user = User()
         userFields = user.fields.keys()
         theme = Theme()
         themeFields = theme.fields.keys()
         #
         for field in fieldNames:
             if field in topicFields:
                 if field not in ['top_cmt', 'themes']:
                     topic[field] = FmtSQLCharater(item[field])
             if field in userFields:
                 user[field] = FmtSQLCharater(item[field])
             if field in themeFields:
                 theme[field] = FmtSQLCharater(item[field])
         #
         yield user
         yield topic
         yield theme
         #
     # request the next page
     if len(list) > 0 and meta['page'] < MAX_PAGE:
         # parameters for the next request
         meta['page'] += 1
         meta['maxtime'] = ret['info']['maxtime']
         yield self._createNextRequest(meta)
Example #8
 def parse_info(self, response):
     (result, js) = self._validate_response(response)
     if result:
         if 1 == js['jsonStatus']:
             j = Job()
             j['Published'] = response.meta['published']
             j['CityId'] = response.meta['region']['id']
             j['CityName'] = response.meta['region']['name']
             for (key, value) in js['result'].iteritems():
                 if key in ['Title', 'Description']:
                     j[key] = FmtSQLCharater(value)
                 else:
                     if key not in ['AllApplies']:
                         j[key] = value
             yield j
         else:
             self.did_failed_request(response, js['message'])
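
Example 8 delegates response checking to a _validate_response helper that is not shown; it evidently returns a (success, parsed_json) pair. A minimal sketch of a function with that contract (an assumption, not the spider's real implementation, which would live on the spider as a method):

import json

def _validate_response(response):
    # return (True, parsed JSON) when the body parses, (False, None) otherwise
    if not response.body:
        return (False, None)
    try:
        return (True, json.loads(response.body))
    except ValueError:
        return (False, None)
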
Example #9
 def parse_review(self, response):
     hxs = Selector(response)
     asin = response.meta['asin']
     title = FmtSQLCharater(
         first_item(hxs.xpath('//title/text()').extract()))
     title = title.replace(u'Amazon.com: Customer Reviews: ', '')
     rlist = hxs.xpath(
         "//div[@id='cm_cr-review_list']/div[@class='a-section review']")
     for div in rlist:
         r = Review()
         r['product_id'] = asin
         r['product_name'] = title
         r['review_id'] = first_item(div.xpath('@id').extract())
         votes = FmtSQLCharater(
             first_item(div.xpath('div[1]/span/text()').extract()))
         match = re.search(u'(.+) people found this helpful', votes, re.I)
         if match:
             r['total_feedback_num'] = match.group(1)
             # the pattern captures only one count (group(2) does not exist), so reuse it here
             r['total_helpful_num'] = match.group(1)
         #
         r['full_star'] = FmtSQLCharater(
             first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
         r['title'] = FmtSQLCharater(
             first_item(div.xpath("div[2]/a[2]/text()").extract()))
         r['cust_name'] = FmtSQLCharater(
             first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
         r['creation_date'] = FmtSQLCharater(
             first_item(div.xpath("div[3]/span[4]/text()").extract()))
         #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace('年', '/').replace(u'月', '/').replace(u'日', '/')
         r['body'] = first_item(div.xpath("div[5]/span").extract())
         yield r
     # next page
     if len(rlist) == 10:
         page = response.meta['page'] + 1
         log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
         yield Request(url=self.review_url.replace(
             '<?asin?>', asin).replace('<?page?>', str(page)),
                       callback=self.parse_review,
                       headers=self.headers,
                       meta={
                           'page': page,
                           'asin': asin
                       })
Example #10
 def parse_info(self, response):
     data = response.body
     if data == '' or data == '[]':
         log.msg(format='%(request)s post fail.response is [].',
                 level=log.ERROR,
                 request=response.url)
         return
     try:
         js = json.loads(data)
     except:
         log.msg(u'兼职职位[%d]请求结果解析异常,非json数据.url=%s' %
                 (response.meta['aid'], response.url),
                 level=log.INFO)
         return
     if 1 == js['status']:
         j = Job()
         for (key, value) in js['activityDetail'].iteritems():
             if key not in ['salary', 'avatar', 'banner_kw']:
                 if value is None:
                     if key == 'group_id':
                         j[key] = -1
                     elif key == 'post_time':
                         j[key] = ''
                 else:
                     if key == 'require_info':
                         j[key] = FmtSQLCharater(value)
                     else:
                         j[key] = value
         yield j
         #
         cid = js['activityDetail']['company_im_id']
         yield Request(url=self.cmp_url.replace('<?cid?>', str(cid)),
                       callback=self.parse_cmp,
                       headers=self.headers,
                       dont_filter=True,
                       meta={'cid': cid})
     else:
         log.msg(js['msg'])
Example #11
 def parse_info(self, response):
     if response.status == 200:
         data = response.body
         hxs = Selector(None, data)
         # start parsing
         linkid = response.meta['linkid']
         #
         title = response.meta['title']
         #
         logourl = response.meta['logourl']
         #
         location = response.meta['location']
         #
         function = response.meta['f']
         #
         postdate = response.meta['postdate']
         #
         companyname = first_item(
             hxs.xpath(
                 '//div[@class="additional_info"]/span[@class="company"]/a/text()'
             ).extract())
         companyname = companyname.lstrip(' ')
         companyname = companyname.rstrip(' ')
         if companyname == '':
             log.msg(u'该职位来源其他网站(%s),无法抓取.' % response.url, level=log.ERROR)
             return
         #
         desc = first_item(
             hxs.xpath('//div[@class="p-description"]').extract())
         # lstrip/rstrip strip character sets rather than prefixes, so remove the wrapper tags explicitly
         if desc.startswith('<div class="p-description">'):
             desc = desc[len('<div class="p-description">'):]
         if desc.endswith('</div>'):
             desc = desc[:-len('</div>')]
         desc = desc.replace('\t', '')
         #
         title = FmtSQLCharater(title)
         companyname = FmtSQLCharater(companyname)
         location = FmtSQLCharater(location)
         #
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         job['LinkID'] = linkid
         job['JobTitle'] = title
         job['Company'] = companyname
         job['JobName'] = function
         job['JobDesc'] = FmtSQLCharater(desc)
         job['JobType'] = 1
         job['SrcUrl'] = response.url
         job['Number'] = 'one person'
         # format the post date
         if postdate == '':
             postdate = datetime.today()
         else:
             postdate = datetime.strptime(postdate, '%Y-%m-%d')
         job['PublishTime'] = postdate
         job['RefreshTime'] = postdate
         job['CityName'] = location
         job['WorkArea'] = job['CityName']
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = companyname
         company['CompanyLogoUrl'] = logourl
         company['AreaName'] = job['CityName']
         #
         yield company
         yield job
     else:
         log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
Example #12
 def parse_info(self, response):
     if response.status == 200:
         data = response.body
         hxs = Selector(response)
         # start parsing
         title = first_item(
             hxs.xpath(
                 '//h1[@class="entry-title mt_title1"]/text()').extract())
         companyname = first_item(
             hxs.xpath('//span[@class="entry-author"]/text()').extract())
         companyname = companyname.rstrip(' - ')
         #
         match = re.search(r'^<td.+>Location</td>\s+<td.+>(.+)</td>$', data,
                           re.I | re.M)
         if match:
             location = match.group(1)
             if location.find(', ') > 0:
                 location = location.split(',')[0]
         else:
             location = ''
         #
         match = re.search(r'^<td.+>Posted</td>\s+<td.+>(.+)</td>$', data,
                           re.I | re.M)
         if match:
             postdate = match.group(1)
         else:
             postdate = ''
         #
         jobdesc = first_item(
             hxs.xpath(
                 '//div[@class="user-page mt_content1"]/div[@class="mt_content1"]'
             ).extract())
         linkid = first_item(
             hxs.xpath('//input[@id="uid"]/@value').extract())
         #
         title = FmtSQLCharater(title)
         companyname = FmtSQLCharater(companyname)
         location = FmtSQLCharater(location)
         # salary, address, phone, website and logourl are referenced below but are never
         # extracted in this snippet; default them here so the item assignments do not fail
         salary = address = phone = website = logourl = ''
         #
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         job['LinkID'] = linkid
         job['JobTitle'] = title
         job['Company'] = companyname
         job['JobName'] = response.meta['sector']
         job['JobDesc'] = FmtSQLCharater(jobdesc)
         job['Salary'] = salary
         job['JobType'] = 1
         job['SrcUrl'] = response.url
         job['Number'] = 'one person'
         # format the post date
         if postdate == '':
             postdate = datetime.today()
         else:
             postdate = datetime.strptime(postdate, '%d %b %y')
         job['PublishTime'] = postdate
         job['RefreshTime'] = postdate
         job['CityName'] = location
         job['WorkArea'] = job['CityName']
         job['JobAddress'] = address
         job['Mobile'] = phone
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = companyname
         company['CompanyAddress'] = address
         company['WebSite'] = website
         company['CompanyLogoUrl'] = logourl
         company['AreaName'] = job['CityName']
         company['Mobile'] = phone
         #
         yield company
         yield job
     else:
         log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
Example #13
 def parse_info(self, response):
     if response.status == 200:
         data = response.body
         hxs = Selector(response)
         # start parsing
         match = re.search(r"^var foldr = '(.+)';", data, re.I | re.M)
         if match:
             linkid = match.group(1)
         else:
             linkid = ''
         if linkid == '':
             log.msg(u'页面没有找到职位ID,丢弃。%s' % response.url, log.ERROR)
             return
         else:
             log.msg(u'找到职位,ID=[%s]' % linkid)
         #
         title = first_item(
             hxs.xpath(
                 '//div[@class="ns_jd_headingbig hl"]/h1/strong/text()').
             extract())
         title = title.rstrip(' ')
         logourl = first_item(
             hxs.xpath(
                 '//div[@class="ns_jd_comp_logo"]/img/@src').extract())
         companyname = first_item(
             hxs.xpath('//span[@class="ns_comp_name"]/text()').extract())
         #Locations
         match = re.search(
             r'<strong>Locations</strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             location = match.group(1)
         else:
             location = ''
         #Experience
         match = re.search(
             r'<strong>Experience </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             experience = match.group(1)
         else:
             experience = ''
         #Keywords / Skills
         match = re.search(
             r'<strong>Keywords / Skills </strong></h2></div>\s+<div class="ns_jobsum_txt"\s.+>(.+)\s</div>',
             data, re.I | re.M)
         if match:
             skills = match.group(1)
         else:
             skills = ''
         #Education
         match = re.search(
             r'<strong>Education </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             education = match.group(1)
         else:
             education = ''
         #Function
         match = re.search(
             r'<strong>Function </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             function = match.group(1)
             function = function.replace(' &bull; ', '*')
             function = function.replace('<br />', '')
         else:
             function = ''
         #Role
         match = re.search(
             r'<strong>Role </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             role = match.group(1)
             role = role.replace(' &bull; ', '*')
             role = role.replace('<br />', '')
         else:
             role = ''
         #Industry
         match = re.search(
             r'<strong>Industry </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)\s</div>',
             data, re.I | re.M)
         if match:
             industry = match.group(1)
             industry = industry.replace(' &bull; ', '')
             industry = industry.replace('<br />', ';')
         else:
             industry = ''
         #Summary
         match = re.search(
             r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+)</div>',
             data, re.I | re.M)
         if match:
             summary = match.group(1)
         else:
             # the summary sometimes wraps onto a second line
             match = re.search(
                 r'<strong>Summary </strong></h2></div>\s+<div class="ns_jobsum_txt">(.+\s+.+)</div>',
                 data, re.I | re.M)
             if match:
                 summary = match.group(1)
             else:
                 summary = ''
         #
         match = re.search(
             r'<strong>Posted On </strong></h2></div>\s+<div class="ns_jobsum_txt">\s(.+)\s</div>\t',
             data, re.I | re.M)
         if match:
             postdate = match.group(1)
         else:
             postdate = ''
         #
         desc = hxs.xpath('//div[@class="ns_jobdesc hl"]').extract()
         if desc:
             jobdesc = hxs.xpath(
                 '//div[@class="ns_jobdesc hl"]').extract()[0]
         else:
             jobdesc = ''
         #
         if desc and len(desc) > 1:
             comdesc = hxs.xpath(
                 '//div[@class="ns_jobdesc hl"]').extract()[1]
         else:
             comdesc = ''
         #
         title = FmtSQLCharater(title)
         companyname = FmtSQLCharater(companyname)
         location = FmtSQLCharater(location)
         #
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         job['LinkID'] = linkid
         job['JobTitle'] = title
         job['Company'] = companyname
         job['JobName'] = function
         job['JobDesc'] = FmtSQLCharater(summary + '<p>' + jobdesc)
         job['JobType'] = 1
         job['SrcUrl'] = response.url
         job['Number'] = 'one person'
         # format the post date
         if postdate == '':
             postdate = datetime.today()
         else:
             postdate = postdate.replace('st', '')
             postdate = postdate.replace('nd', '')
             postdate = postdate.replace('rd', '')
             postdate = postdate.replace('th', '')
             postdate = datetime.strptime(postdate, '%d %b %Y')
         job['PublishTime'] = postdate
         job['RefreshTime'] = postdate
         job['CityName'] = location
         job['WorkArea'] = job['CityName']
         job['JobComputerSkill'] = skills
         job['Exercise'] = experience
         job['Eduacation'] = education
         job['JobFunction'] = role
         job['Industry'] = industry
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = companyname
         company['Industry'] = industry
         company['CompanyLogoUrl'] = logourl
         company['CompanyDesc'] = FmtSQLCharater(comdesc)
         company['AreaName'] = job['CityName']
         #
         yield company
         yield job
     else:
         log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
Example #14
 def parse_job(self, response):
     data = response.body
     if data == '' or data == '[]':
         log.msg(format= '%(request)s get fail.response is [].', level = log.ERROR, request = response.url)
         return
     js = json.loads('{}')
     try:
         js = json.loads(data)
     except:
         log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url, level = log.INFO)
         return
     # parse the list
     if js['StatusCode'] == 200:
         #
         pd = js['PositionDetail']
         # filter out jobs without an email list
         #if pd['EmailList'] == '':
         #    log.msg(u'职位[%s-%s]没有email地址,丢弃.' % (pd['Number'], pd['Name']))
         #    return
         #
         cd = js['CompanyDetail']
         cr = js['Coordinate']
         #
         publish = pd['DateStart'].replace('T', ' ')
         publish = publish.replace('Z', '')
         pos = publish.rfind('.')
         if pos > 0:
             publish = publish[:pos]
         publish = datetime.datetime.strptime(publish, '%Y-%m-%d %H:%M:%S')
         #
         if exist_linkid(self.site, pd['Number'], int(mktime(publish.timetuple()))):
             return
         #
         j = WebJob()
         j['SiteID'] = self.site
         j['JobTitle'] = pd['Name']
         j['Company'] = pd['CompanyName']
         j['PublishTime'] = publish
         j['RefreshTime'] = publish
         j['ClickTimes'] = 0
         # look up the SQL Server job-category code and name from the Zhaopin job category
         zJob = int(pd['SubJobType'])
         zJobClassName = FmtJobPositionWithPrefix('redis_cache_1', self.company_prefix, zJob)
         if zJobClassName != '':
             j['JobCode'] = zJobClassName.split('#')[0]
             j['JobName'] = zJobClassName.split('#')[1]
         else:
             log.msg(u'职位类别=%d,在redis上没有查找到对应的职位类别代码与职位名称' % zJob, level = log.ERROR)
             return
         #
         j['Salary'] = pd['Salary']
         j['SalaryType'] = 0
         j['Eduacation'] = pd['Education']
         j['Number'] = '%d人' % pd['RecruitNumber']
         j['Exercise'] = pd['WorkingExp']
         if pd.has_key('WelfareTab'):
             j['SSWelfare'] = ','.join(map(lambda wel: wel.values()[0], pd['WelfareTab']))
         else:
             j['SSWelfare'] = ''
         j['SBWelfare'] = ''
         j['OtherWelfare'] = ''
         j['JobDesc'] = pd['Description']
         j['Relation'] = ''
         j['Mobile'] = ''
         j['Email'] = pd['EmailList']
         j['JobAddress'] = FmtSQLCharater(cd['Address'])
         j['InsertTime'] = datetime.datetime.today()
         j['Sex'] = u'不限'
         j['LinkID'] = pd['Number']
         j['Tag'] = ''
         j['ProvinceName'] = ''
         j['CityName'] = pd['WorkCity']
         j['WorkArea'] = pd['WorkCity']
         if pd.has_key('CityDistrict'):
             j['WorkArea1'] = pd['CityDistrict']
         else:
             j['WorkArea1'] = ''
         j['WorkArea2'] = ''
         j['CompanyLink'] = self.company_prefix + pd['CompanyNumber']
         if pd['WorkType'] == u'全职' or pd['WorkType'] == u'实习':
             j['JobType'] = 1
         else:
             j['JobType'] = 2
         j['SyncStatus'] = 0
         j['SrcUrl'] = response.url
         j['GisLongitude'] = cr['Longitude']
         j['GisLatitude'] = cr['Latitude']
         j['StartDate'] = pd['DateStart']
         j['EndDate'] = pd['DateEnd']
         # other default fields
         j['AnFmtID'] = 0
         j['KeyValue'] = ''
         if cd['Industry']:
             j['Industry'] = cd['Industry']
         else:
             j['Industry'] = ''
         j['CompanyType'] = cd['Property']
         j['CompanyScale'] = cd['CompanySize']
         j['Require'] = u'招%s|学历%s|经验%s|性别%s' % (j['Number'], j['Eduacation'], j['Exercise'], j['Sex'])
         j['Telphone1'] = ''
         j['Telphone2'] = ''
         j['Age'] = 0
         j['ValidDate'] = ''
         j['ParentName'] = ''
         j['EduacationValue'] = 0
         j['SalaryMin'] = 0.0
         j['SalaryMax'] = 0.0
         j['NumberValue'] = 0
         j['SexValue'] = 0
         j['OperStatus'] = 0
         j['LastModifyTime'] = datetime.datetime.today()
         j['PropertyTag'] = ''
         j['SalaryValue'] = 0
         j['ExerciseValue'] = 0
         j['Valid'] = 'T'
         j['JobWorkTime'] = ''
         j['JobComputerSkill'] = ''
         j['ForeignLanguage'] = ''
         j['JobFunction'] = ''
         j['JobRequest'] = ''
         j['BusinessCode'] = ''
         # company information
         c = Company()
         c['SiteID'] = self.site
         c['company_id'] = self.company_prefix + cd['Number']
         c['Credibility'] = ''
         c['Licensed'] = ''
         c['Yan'] = ''
         c['FangXin'] = ''
         c['CompanyName'] = cd['Name']
         c['CityName'] = cd['CityName']
         c['AreaCode'] = ''
         c['Relation'] = ''
         c['Mobile'] = ''
         c['Industry'] = cd['Industry']
         c['CompanyType'] = cd['Property']
         c['CompanyScale'] = cd['CompanySize']
         c['CompanyAddress'] = cd['Address']
         c['CompanyDesc'] = cd['Description']
         c['CompanyUrl'] = cd['Url']
         if cd['companyLogo']:
             c['CompanyLogoUrl'] = cd['companyLogo']
         else:
             c['CompanyLogoUrl'] = ''
         c['Email'] = ''
         c['PraiseRate'] = '0'
         c['GisLongitude'] = cr['Longitude']
         c['GisLatitude'] = cr['Latitude']
         c['UserId'] = ''
         c['UserName'] = ''
         c['ProvinceName'] = ''
         c['WorkArea1'] = cd['CityName']
         c['AreaCode1'] = ''
         # log.msg(j['JobCode'] + '--' + j['JobName'])
         yield c
         yield j
     else:
         log.msg(u'职位详情请求失败,原因:%s.url=%s' % (js['StatusDescription'], response.url))
Example #15
 def parse_info(self, response):
     data = response.body
     js = BaseSpider.fmt_json(self, data)
     if js:
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         job['LinkID'] = js['jobid']
         job['JobTitle'] = FmtSQLCharater(js['jobttl'])
         job['Company'] = FmtSQLCharater(js['coym'])
         job['JobDesc'] = FmtSQLCharater(js['dsc'])
         job['SrcUrl'] = js['applyurl']
         ovw = js['ovw']
         ovw = ovw.replace('                            ', '')
         #
         match = re.search(
             r"<h3 class='jd-label'>Industry</h3>\n<p>(.*)</p>", ovw,
             re.I | re.M)
         if match:
             job['Industry'] = match.group(1)
         #
         match = re.search(
             r"<h3 class='jd-label'>Job Function</h3>\n<p>(.*)</p>", ovw,
             re.I | re.M)
         if match:
             job['JobName'] = match.group(1)
         #
         job['CityName'] = 'Singapore'
         job['WorkArea'] = 'Singapore'
         match = re.search(
             r"<h3 class='jd-label'>Work Region</h3>\n<p>(.*)</p>", ovw,
             re.I | re.M)
         if match:
             job['WorkArea1'] = match.group(1).replace('Singapore - ', '')
         #
         match = re.search(
             r"<h3 class='jd-label'>Job Type</h3>\n<p>(.*)</p>", ovw,
             re.I | re.M)
         if match:
             if match.group(1).find('Full Time') >= 0:
                 job['JobType'] = 1
             else:
                 job['JobType'] = 0
         #
         match = re.search(r"Min. Education Level : (.*?)</li><li>", ovw,
                           re.I | re.M)
         if match:
             job['Eduacation'] = FmtSQLCharater(match.group(1))
         #
         match = re.search(r"Year of Exp Required : (.*?)</li><li>", ovw,
                           re.I | re.M)
         if match:
             job['Exercise'] = match.group(1)
         #
         match = re.search(r"Skills : (.*?)</li><li>", ovw, re.I | re.M)
         if match:
             job['JobComputerSkill'] = FmtSQLCharater(match.group(1))
         #
         match = re.search(r"Language : (.*?)</li><li>", ovw, re.I | re.M)
         if match:
             job['ForeignLanguage'] = match.group(1)
         #
         match = re.search(r"Salary : (.*?)</span>", ovw, re.I | re.M)
         if match:
             job['Salary'] = match.group(1)
         job['Number'] = 'one person'
         #13-Jul-2015
         PostDate = datetime.strptime(js['pstdttme'], '%d-%b-%Y')
         job['PublishTime'] = PostDate
         job['RefreshTime'] = PostDate
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = job['Company']
         company['Industry'] = job['Industry']
         company['AreaName'] = 'Singapore'
         company['CompanyDesc'] = ''
         #
         match = re.search(r"Website:</strong> (.*)<br />", ovw,
                           re.I | re.M)
         if match:
             company['CompanyUrl'] = match.group(1)
         #
         yield company
         yield job
     else:
         log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url,
                 level=log.INFO)
Example #16
 def parse_info(self, response):
     if response.status == 200:
         data = response.body
         hxs = Selector(response)
         # page parsing
         # company banner
         company_banner = first_item(
             hxs.xpath(
                 '//img[@id="company_banner"]/@data-original').extract())
         # company logo
         company_logo = first_item(
             hxs.xpath(
                 '//img[@id="company_logo"]/@data-original').extract())
         # job title
         position_title = first_item(
             hxs.xpath('//h1[@id="position_title"]/text()').extract())
         position_title = FmtSQLCharater(position_title)
         # company name
         company_name = first_item(
             hxs.xpath('//h2[@id="company_name"]/a/text()').extract())
         if company_name == '':
             company_name = first_item(
                 hxs.xpath('//h2[@id="company_name"]/text()').extract())
         company_name = company_name.replace('\n', '')
         company_name = company_name.replace('\t', '')
         company_name = company_name.lstrip(' ')
         company_name = company_name.rstrip(' ')
         company_name = FmtSQLCharater(company_name)
         if company_name == '':
             log.msg(u'企业名称为空,url=%s' % response.url)
             return
         # company SrcUrl
         company_url = first_item(
             hxs.xpath('//h2[@id="company_name"]/a/@href').extract())
         # salary
         salary = first_item(
             hxs.xpath('//div[@id="salary"]/p/a/text()').extract())
         # experience
         experience = first_item(
             hxs.xpath(
                 '//div[@id="experience"]/p[@id="years_of_experience"]/span[@id="years_of_experience"]/text()'
             ).extract())
         experience = experience.replace('\n', '')
         experience = experience.replace('\t', '')
         #Location
         location = first_item(
             hxs.xpath(
                 '//div[@id="location"]/p/span[@id="single_work_location"]/text()'
             ).extract())
         location = location.replace('\n', '')
         location = location.replace('\t', '')
         # job description (may include responsibilities and requirements)
         job_desc = first_item(
             hxs.xpath('//div[@id="job_description"]').extract())
         # company information
         company_registration_number = first_item(
             hxs.xpath('//span[@id="company_registration_number"]/text()').
             extract())
         company_industry = first_item(
             hxs.xpath('//p[@id="company_industry"]/text()').extract())
         company_website = first_item(
             hxs.xpath('//a[@id="company_website"]/text()').extract())
         company_contact = first_item(
             hxs.xpath('//p[@id="company_contact"]/text()').extract())
         company_size = first_item(
             hxs.xpath('//p[@id="company_size"]/text()').extract())
         work_environment_working_hours = first_item(
             hxs.xpath('//p[@id="work_environment_working_hours"]/text()').
             extract())
         work_environment_dress_code = first_item(
             hxs.xpath(
                 '//p[@id="work_environment_dress_code"]/text()').extract())
         work_environment_benefits = first_item(
             hxs.xpath(
                 '//p[@id="work_environment_benefits"]/text()').extract())
         work_environment_spoken_language = first_item(
             hxs.xpath('//p[@id="work_environment_spoken_language"]/text()'
                       ).extract())
         #gallery
         gallery = ''
         thumbs = hxs.xpath('//ul[@class="gallery-thumb"]/li')
         for item in thumbs:
             gallery += first_item(
                 item.xpath('img/@data-original').extract()) + ';'
         # company description
         company_overview_all = first_item(
             hxs.xpath('//div[@id="company_overview_all"]').extract())
         #work location
         match = re.search(r'&center=(.*?)&', data, re.I | re.M)
         if match:
             gps_location = match.group(1)
             lat = gps_location.split(',')[0]
             lng = gps_location.split(',')[1]
         else:
             lat = '0.0'
             lng = '0.0'
         #
         address = first_item(
             hxs.xpath('//p[@id="address"]/text()').extract())
         address = FmtSQLCharater(address)
         #Advertised: 23-June-2015
         posting_date = first_item(
             hxs.xpath('//p[@id="posting_date"]/text()').extract())
         posting_date = posting_date.replace('Advertised:', '')
         posting_date = posting_date.replace(' ', '')
         #
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         #http://jobs.jobstreet.com/sg/jobs/4712859?fr=J
         job['LinkID'] = response.url[34:-5]
         job['JobTitle'] = position_title
         job['Company'] = company_name
         job['Industry'] = company_industry
         job['JobName'] = response.meta['name']
         job['JobDesc'] = FmtSQLCharater(job_desc)
         job['Salary'] = salary
         job['Exercise'] = experience
         job['JobType'] = 1
         job['SrcUrl'] = response.url
         job['SSWelfare'] = work_environment_benefits
         job['Number'] = 'one person'
         # format the posting date
         PostDate = datetime.strptime(posting_date, '%d-%B-%Y')
         job['PublishTime'] = PostDate
         job['RefreshTime'] = PostDate
         if location <> '' and len(location.split('-')) > 1:
             job['CityName'] = location.split('-')[0].replace(' ', '')
             job['WorkArea1'] = location.split('-')[1].replace(' ', '')
         else:
             job['CityName'] = location
         job['WorkArea'] = job['CityName']
         job['ForeignLanguage'] = work_environment_spoken_language
         job['JobWorkTime'] = work_environment_working_hours
         job['GisLongitude'] = lng
         job['GisLatitude'] = lat
         job['JobAddress'] = address
         job['Mobile'] = company_contact
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = company_name
         company['Industry'] = company_industry
         company['CompanyScale'] = company_size
         company['CompanyAddress'] = address
         company['CompanyUrl'] = company_url
         company['WebSite'] = company_website
         company['CompanyLogoUrl'] = company_logo
         company['AreaName'] = job['CityName']
         company['CompanyDesc'] = FmtSQLCharater(company_overview_all)
         company['Mobile'] = company_contact
         company['GisLongitude'] = lng
         company['GisLatitude'] = lat
         company['OtherInfo'] = company_banner + '#' + gallery
         #
         yield company
         yield job
     else:
         log.msg(u'职位详情请求结果解析异常.url=%s' % response.url, level=log.INFO)
Example #17
 def parse_info(self, response):
     data = response.body
     if data == '' or data == '[]':
         log.msg(format='%(request)s post fail.response is [].',
                 level=log.ERROR,
                 request=response.url)
         return
     try:
         js = json.loads(data)
     except:
         log.msg(u'图书详情[%s]请求结果解析异常,非json数据.url=%s' %
                 (response.meta['p']['id'], response.url),
                 level=log.INFO)
         return
     #
     pin = js['product_info_new']
     pd = js['product_desc']
     pds = js['product_desc_sorted']
     p = Product()
     #
     for (key, value) in pin.iteritems():
         if key == 'mobile_exclusive_price':
             continue
         elif key == 'shop_id':
             continue
         elif key == 'product_name':
             p[key] = FmtSQLCharater(value)
         elif key == 'outlets':
             continue
         elif key == 'publish_info':
             for (key1, value1) in value.iteritems():
                 if key1 == 'author_arr':
                     continue
                 else:
                     if key1 == 'author_name':
                         p['publish_' + key1] = FmtSQLCharater(value1)
                     else:
                         p['publish_' + key1] = value1
         elif key == 'promo_model':
             continue
         elif key == 'stock_info':
             p['stock_status'] = value['stock_status']
         elif key == 'category_info':
             continue
         elif key == 'comm_info':
             for (key1, value1) in value.iteritems():
                 if key1 == 'items':
                     continue
                 else:
                     p['comm_' + key1] = value1
         elif key == 'total_review_count':
             continue
         elif key == 'abstract':
             p[key] = FmtSQLCharater(value)
         elif key == 'images':
             p['images'] = '#'.join(value)
         elif key == 'images_big':
             p['images_big'] = '#'.join(value)
         elif key == 'stars':
             p['stars_full_star'] = value['full_star']
             p['stars_has_half_star'] = value['has_half_star']
         elif key == 'ebook_info':
             p['ebook_read_ebook_at_h5'] = value['read_ebook_at_h5']
             p['ebook_is_client_buy'] = value['is_client_buy']
         elif key == 'is_yb_product':
             continue
         elif key == 'is_show_arrive':
             continue
         elif key == 'share_url':
             continue
         elif key == 'spuinfo':
             if value != '':
                 p['spuinfo_num'] = value['num']
                 p['spuinfo_spus_id'] = value['spus_id']
         elif key == 'bd_promo_price':
             continue
         elif key == 'template_id':
             continue
         elif key == 'bang_rank':
             p['bang_rank_word'] = value['word']
             p['bang_rank_path_name'] = value['path_name']
             p['bang_rank_rank'] = value['rank']
             p['bang_rank_catPath'] = value['catPath']
         elif key == 'same_cate_product':
             continue
         elif key == 'show_dangdangsale':
             continue
         elif key == 'in_wishlist':
             continue
         elif key == 'page_template':
             continue
         elif key == 'platform_banner':
             continue
         else:
             p[key] = value
     #
     for (key, value) in pd.iteritems():
         p[key] = FmtSQLCharater(value)
         if key == 'beautiful_image':
             hxs = Selector(None, value)
             images = hxs.xpath('//body/img/@src').extract()
             p['beautiful_image_list'] = '#'.join(images)
     #
     for item in pds:
         if item['name'] == u'推荐语':
             p['recommendation'] = FmtSQLCharater(item['content'])
         elif item['name'] == u'简介':
             p['brief_introduction'] = FmtSQLCharater(item['content'])
         #elif item['name'] == u'目录':
         #    p['catalog'] = item['content']
         elif item['name'] == u'出版信息':
             continue
         elif item['name'] == u'更多':
             p['more_information'] = FmtSQLCharater(item['content'])
     #
     yield p
Example #18
 def parse_job(self, response):
     data = response.body
     js = self._fmt_json(data)
     if js and js.has_key('JobAdDetails'):
         jd = js['JobAdDetails'][0]
         if jd:
             job = JobsDB_Job()
             job['SiteID'] = self.site_id
             job['LinkID'] = jd['Id']
             job['JobTitle'] = jd['JobTitle']
             job['Company'] = jd['Company']
             job['Industry'] = jd['Industry']
             # initialize before accumulating; += on an unset item field would raise KeyError
             job['JobName'] = ''
             for func in jd['JobFunction']:
                 job['JobName'] += func + '#'
             job['JobDesc'] = FmtSQLCharater(jd['JobDesc'])
             job['Salary'] = jd['Salary']
             if jd['SalaryLow'] <> 'Hidden' and jd[
                     'SalaryLow'] <> 'Not Provided':
                 job['SalaryMin'] = float(jd['SalaryLow'].replace(
                     ',', '').replace('+', ''))
             if jd['SalaryUp'] <> 'Hidden' and jd[
                     'SalaryUp'] <> 'Not Provided':
                 job['SalaryMax'] = float(jd['SalaryUp'].replace(
                     ',', '').replace('+', ''))
             '''
             if jd['SalaryUnit'] <> 'Hidden':
                 job['SalaryType'] = jd['SalaryUnit']
             '''
             job['Eduacation'] = jd['Qualification']
             CareerLevel = jd['CareerLevel']
             job['Exercise'] = jd['WorkExperience']
             EmploymentTerm = jd['EmploymentTerm']
             job['JobTypeName'] = EmploymentTerm
             if EmploymentTerm.find('Full Time') >= 0:
                 job['JobType'] = 1
             elif EmploymentTerm.find('Part Time') >= 0:
                 job['JobType'] = 2
             elif EmploymentTerm.find('Permanent') >= 0:
                 job['JobType'] = 3
             elif EmploymentTerm.find('Temporary') >= 0:
                 job['JobType'] = 4
             elif EmploymentTerm.find('Contract') >= 0:
                 job['JobType'] = 5
             elif EmploymentTerm.find('Internship') >= 0:
                 job['JobType'] = 6
             elif EmploymentTerm.find('Freelance') >= 0:
                 job['JobType'] = 7
             elif EmploymentTerm.find('Contract-to-Perm') >= 0:
                 job['JobType'] = 8
             elif EmploymentTerm.find('Temp-to-Perm') >= 0:
                 job['JobType'] = 9
             if js.has_key('DesktopSiteURL'):
                 job['SrcUrl'] = js['DesktopSiteURL']
             Benefits = ''
             for bf in jd['BenefitId']:
                 Benefits += str(bf) + ';'
                 '''
                 if bf == 5:
                     Benefits += 'Double pay;'
                 elif bf == 7:
                     Benefits += 'Free shuttle bus;'
                 elif bf == 1:
                     Benefits += 'Performance bonus;'
                 elif bf == 14:
                     Benefits += 'Dental insurance;'
                 elif bf == 4:
                     Benefits += 'Overtime pay;'
                 elif bf == 10:
                     Benefits += 'Five-day work week;'
                 elif bf == 8:
                     Benefits += 'Medical insurance;'
                 '''
             job['SSWelfare'] = Benefits
             #IsExpired = jd['IsExpired']
             #Summary1 = jd['Summary1']
             #Summary2 = jd['Summary2']
             #Summary3 = jd['Summary3']
             job['Number'] = 'one person'
             PostDate = jd['PostDate'].replace('T', ' ')
             PostDate = PostDate.replace('+08:00', '')
             PostDate = datetime.strptime(PostDate, '%Y-%m-%d %H:%M:%S')
             job['PublishTime'] = PostDate
             job['RefreshTime'] = PostDate
             job['CityName'] = 'Singapore'
             job['WorkArea'] = 'Singapore'
             Location = jd['Location']  #Downtown Core, CBD (Central Area)
             if Location <> 'No Fixed Location':
                 if Location.find(',') > 0:
                     job['WorkArea1'] = Location.split(',')[0]
                     job['WorkArea2'] = Location.split(',')[1]
                 else:
                     job['WorkArea1'] = Location
             #
             '''
             company = JobsDB_Company()
             company['WebSiteID'] = self.site_id
             company['CompanyName'] = jd['Company']
             company['Industry'] = jd['Industry']
             company['AreaName'] = 'Singapore'
             company['CompanyDesc'] = FmtSQLCharater(jd['CompanyDesc'])
             if js.has_key('CompanyLogo'):
                 company['CompanyLogoUrl'] = jd['CompanyLogo']
             #OmnitureJobAdFuncIds = js['OmnitureJobAdFuncIds'] #17|32|128|267
             #OmnitureLocationId = jd['OmnitureLocationId'] #1297
             #AdType = jd['AdType']
             yield company
             '''
             yield job
     else:
         log.msg(u'职位详情请求结果解析异常,非json数据.url=%s' % response.url,
                 level=log.INFO)
Example #19
 def parse_info(self, response):
     data = response.body
     if data == '':
         log.msg(format='%(request)s post fail.response is empty.',
                 level=log.ERROR,
                 request=response.url)
         return
     #
     """
     root = response.meta['root']
     leaf = response.meta['leaf']
     age = response.meta['age']
     star = response.meta['star']
     """
     asin = response.meta['asin']
     #
     hxs = Selector(None, data)
     #
     container = hxs.xpath("//div[@class='a-container']")
     right = container.xpath("div[@id='rightCol']")
     left = container.xpath("div[@id='leftCol']")
     center = container.xpath("div[@id='centerCol']")
     #
     log.msg('Book--')
     b = Book()
     b['product_id'] = asin
     b['product_name'] = FmtSQLCharater(
         first_item(
             center.xpath(
                 "div[@id='booksTitle']/div/h1[@id='title']/span[@id='productTitle']/text()"
             ).extract()))
     b['subname'] = b['product_name']
     b['publish_paper_quality'] = FmtSQLCharater(
         first_item(
             center.xpath(
                 "div[@id='booksTitle']/div/h1[@id='title']/span[2]/text()"
             ).extract()))
     author = center.xpath("div[@id='booksTitle']/div[@id='byline']")
     # author.extract() returns a list, so take the first element before concatenating
     log.msg('author html:' + first_item(author.extract()))
     b['publish_author_name'] = FmtSQLCharater(
         first_item(author.xpath('string(.)').extract()))
     b['publish_author_name'] = b['publish_author_name'].replace(
         '\n', '').replace('\t', '').replace(' ', '')
     b['abstract'] = FmtSQLCharater(
         first_item(
             hxs.xpath(
                 "//div[@id='bookDescription_feature_div']/noscript/text()"
             ).extract()))
     images = left.xpath(
         "div[@id='booksImageBlock_feature_div']/div[@id='imageBlockOuter']/div[@id='imageBlockThumbs']/span/div/img/@src"
     ).extract()
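     # Derive large-image URLs by swapping the thumbnail size tokens for full-size ones.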
     bigImages = map(
         lambda x: x.replace(
             '_AC_SY60_CR,0,0,60,60_', '_SY498_BO1,204,203,200_').replace(
                 '_AC_SX60_CR,0,0,60,60_', '_SX443_BO1,204,203,200_'),
         images)
     b['images'] = '#'.join(images)
     b['images_big'] = '#'.join(bigImages)
     #
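     # Buy box: sale price, discount and list price.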
     buybox = right.xpath(
         "div[@id='buybox_feature_div']/div[@id='combinedBuyBox']/form[@id='addToCart']/div[@id='buybox']/div/div[@class='a-box-inner']/div"
     )
     b['sale_price'] = FmtSQLCharater(
         first_item(
             buybox.xpath(
                 "//*[@id='a-autoid-5-announce']/span[2]/span").extract()))
     b['discount'] = FmtSQLCharater(
         first_item(
             buybox.xpath(
                 "div[@id='buyNewSection']/div/div[@id='soldByThirdParty']/span[2]/text()"
             ).extract()))
     b['original_price'] = FmtSQLCharater(
         first_item(
             buybox.xpath(
                 "//*[@id='a-autoid-4-announce']/span[2]").extract()))
     b['sale_price'] = b['sale_price'].replace('¥', '')
     b['discount'] = b['discount'].replace(' (', '').replace(u'折) ', '')
     b['original_price'] = b['original_price'].replace(u'¥', '')
     # Basic information (product details bullets)
     bullets = hxs.xpath(
         "//div[@id='productDetails']/table/tr/td[@class='bucket']/div[@class='content']/ul/li"
     )
     for li in bullets:
         log.msg('Book-base-info')
         if li.xpath(u"b[contains(text(), 'Publisher')]"):
             publisher = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
             # e.g. "未来出版社; 第1版 (2011年11月1日)": publisher; edition (publication date)
             match = re.search(u'(.+); 第(.+)版 \((.+)\)', publisher,
                               re.I | re.M)
             if match:
                 b['publish_publisher'] = match.group(1)
                 b['publish_version_num'] = match.group(2)
                 b['publish_publish_date'] = match.group(3)
         elif li.xpath(u"b[contains(text(), 'Series')]"):
             b['product_name'] = FmtSQLCharater(
                 first_item(li.xpath("a/text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), 'Paperback')]"):
             b['publish_paper_quality'] = u'Paperback'
             b['publish_number_of_pages'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), 'Hardcover')]"):
             b['publish_paper_quality'] = u'Hardcover'
             b['publish_number_of_pages'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), '纸板书')]"):
             b['publish_paper_quality'] = u'纸板书'
             b['publish_number_of_pages'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), 'Age Range')]"):
             b['age'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), 'Language')]"):
             b['publish_subtitle_language'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), '开本')]"):
             b['publish_product_size'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         elif li.xpath(u"b[contains(text(), 'ISBN-13')]"):
             b['publish_standard_id'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).lstrip())
         #elif li.xpath(u"b[contains(text(), '条形码')]"):
         #    b['publish_barcode'] = first_item(li.xpath("text()").extract()).lstrip()
         elif li.xpath(u"b[contains(text(), 'Product Dimensions')]"):
             b['publish_product_size2'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).replace(
                     '\n', '').lstrip().rstrip())
         elif li.xpath(u"b[contains(text(), 'Shipping Weight')]"):
             b['publish_product_weight'] = FmtSQLCharater(
                 first_item(li.xpath("text()").extract()).replace(
                     '\n', '').lstrip().rstrip())
         #elif li.xpath(u"b[contains(text(), '品牌')]"):
         #    b['brand'] = first_item(li.xpath("text()").extract()).lstrip()
     # Product description (embedded in an inline script)
     begin = data.find('var iframeContent =')
     end = data.find('obj.onloadCallback = onloadCallback;')
     if begin != -1 and end != -1:
         # Slice out the URL-encoded markup between the assignment and the
         # callback registration, skipping the surrounding quote characters.
         desc = data[begin + 21:end - 10]
         desc = urllib2.unquote(desc)
         hxs = Selector(text=desc)
         b['recommendation'] = first_item(
             hxs.xpath(
                 u"//div[@class='content']/h3[contains(text(), '编辑推荐')]/following-sibling::div[1]/text()"
             ).extract())
         b['catalog'] = first_item(
             hxs.xpath(
                 u"//div[@class='content']/h3[contains(text(), '目录')]/following-sibling::div[1]/text()"
             ).extract())
         b['more_information'] = first_item(
             hxs.xpath(
                 u"//div[@class='content']/h3[contains(text(), '文摘')]/following-sibling::div[1]/text()"
             ).extract())
     #
     yield b
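The product description in this example is sliced out of an inline script with hard-coded offsets (begin + 21, end - 10). A regex-based variant of the same idea, shown only as a sketch and assuming the page still assigns the URL-encoded markup to var iframeContent, is less sensitive to small changes in the surrounding markup:

import re
import urllib2

def extract_iframe_description(page_html):
    # Capture everything between the quotes of the 'var iframeContent = "...";'
    # assignment and decode the URL-encoded markup.
    match = re.search(r"""var iframeContent\s*=\s*["'](.*?)["']\s*;""", page_html, re.S)
    if not match:
        return ''
    return urllib2.unquote(match.group(1))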
Exemplo n.º 20
0
 def parse_job(self, response):
     try:
         msg = ET.fromstring(response.body)
     except BaseException as e:
         log.msg(u'Invalid response for job category <{}>, job id <{}>!'.format(
             response.meta['funcName'], response.meta['jobid']), level=log.ERROR)
         return
     #
     func = response.meta['func']
     funcName = response.meta['funcName']
     jobid = response.meta['jobid']
     #
     if msg.find('result').text == '1':
         item = msg.find('resultbody')
         webJob = WebJob()
         webJob['SiteID'] = self.site
         webJob['JobTitle'] = FmtSQLCharater(text_(item.find('jobname')))
         webJob['Company'] = FmtCmpNameCharacter(text_(item.find('coname')))
         webJob['PublishTime'] = FmtAnnounceDateToDateTime(text_(item.find('issuedate')), '-')[0]
         webJob['RefreshTime'] = webJob['PublishTime']
         webJob['JobType'] = 1
         webJob['SalaryType'] = 0
         webJob['Salary'] = text_(item.find('providesalary'))
         webJob['Eduacation'] = text_(item.find('degree'))
         webJob['Number'] = text_(item.find('jobnum'))
         webJob['Exercise'] = text_(item.find('workyear'))
         webJob['SSWelfare'] = text_(item.find('welfare'))
         webJob['SBWelfare'] = text_(item.find('jobtag'))
         webJob['LinkID'] = jobid
         webJob['JobCode'] = str(int(func))
         webJob['JobName'] = funcName
         webJob['Sex'] = u'不限'
         webJob['Require'] = u'招%s人|学历%s|经验%s|性别%s' % (webJob['Number'], webJob['Eduacation'], webJob['Exercise'], webJob['Sex'])
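         # jobarea comes back as "city-district[-subdistrict]"; split it into city and area levels.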
         jobarea = text_(item.find('jobarea')).split('-')
         CityName = jobarea[0]
         webJob['CityName'] = CityName
         webJob['WorkArea'] = CityName
         webJob['WorkArea1'] = ''
         webJob['WorkArea2'] = ''
         if len(jobarea) > 1:
             webJob['WorkArea1'] = jobarea[1]
         if len(jobarea) > 2:
             webJob['WorkArea2'] = jobarea[2]
         webJob['AreaCode'] = FmtAreaCodeSimple('remote_252_1', CityName)
         webJob['JobAddress'] = FmtSQLCharater(text_(item.find('address')))
         coid = text_(item.find('coid'))
         webJob['CompanyLink'] = 'wuyao_' + coid
         webJob['SyncStatus'] = 0
         webJob['AnFmtID'] = 0
         webJob['KeyValue'] = ''
         webJob['ClickTimes'] = 0
         webJob['OtherWelfare'] = ''
         webJob['Relation'] = ''
         webJob['Mobile'] = ''
         webJob['Email'] = ''
         webJob['Tag'] = ''
         webJob['ProvinceName'] = ''
         webJob['Telphone1'] = ''
         webJob['Telphone2'] = ''
         webJob['Age'] = 0
         webJob['ValidDate'] = ''
         webJob['ParentName'] = ''
         webJob['EduacationValue'] = 0
         webJob['SalaryMin'] = 0.0
         webJob['SalaryMax'] = 0.0
         webJob['NumberValue'] = 0
         webJob['SexValue'] = 0
         webJob['OperStatus'] = 0
         webJob['PropertyTag'] = ''
         webJob['SalaryValue'] = 0
         webJob['ExerciseValue'] = 0
         webJob['Valid'] = 'T'
         webJob['JobWorkTime'] = ''
         webJob['JobComputerSkill'] = ''
         webJob['ForeignLanguage'] = ''
         webJob['JobFunction'] = ''
         webJob['JobRequest'] = ''
         webJob['BusinessCode'] = ''
         webJob['InsertTime'] = datetime.datetime.today()
         webJob['LastModifyTime'] = datetime.datetime.today()
         # Replace the backslash character
         webJob['SrcUrl'] = text_(item.find('share_url'))
         webJob['GisLongitude'] = text_(item.find('joblon'))
         webJob['GisLatitude'] = text_(item.find('joblat'))
         webJob['JobDesc'] = text_(item.find('jobinfo'))
         webJob['CompanyType'] = text_(item.find('cotype'))
         webJob['CompanyScale'] = text_(item.find('cosize'))
         #
         link_url = self.co_url.replace('<?coid?>', coid)
         link_url = self.create_url(link_url)
         yield Request(link_url, meta={'coid': coid, 'job': webJob}, callback=self.parse_co)
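parse_job reads almost every XML field through a text_() helper instead of touching .text directly, which keeps a missing element from raising AttributeError. The helper is not shown in this excerpt; a plausible sketch, inferred only from how it is called, might be:

def text_(element):
    # Return the element's text, or '' when the element is absent or has no text.
    if element is None or element.text is None:
        return ''
    return element.text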
Exemplo n.º 21
0
 def parse_info(self, response):
     if response.status == 200:
         data = response.body
         hxs = Selector(response)
         # Start parsing the detail page
         title = first_item(
             hxs.xpath('//h1[@itemprop="title"]/text()').extract())
         salary = first_item(
             hxs.xpath('//span[@itemprop="baseSalary"]/text()').extract())
         location = first_item(
             hxs.xpath('//span[@itemprop="address"]/text()').extract())
         jobtype = first_item(
             hxs.xpath(
                 '//span[@itemprop="employmentType"]/text()').extract())
         companyname = first_item(
             hxs.xpath('//span[@itemprop="name"]/text()').extract())
         postdate = first_item(
             hxs.xpath('//span[@itemprop="datePosted"]/text()').extract())
         jobdesc = first_item(
             hxs.xpath('//section[@class="description"]/div[@class="well"]'
                       ).extract())
         logourl = first_item(
             hxs.xpath(
                 '//section[@class="brandInfo"]/div[@class="well"]/h2/img/@src'
             ).extract())
         if logourl != '':
             logourl = self.create_url(logourl)
         #
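         # The contact block (Contact/Address/Phone/Email/Website) is pulled from the raw HTML with label-based regexes.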
         match = re.search(r'<label>Contact:</label>\s*(.+)</li>', data,
                           re.I | re.M)
         if match:
             contact = match.group(1)
         else:
             contact = ''
         #
         match = re.search(r'<label>Address:</label>\s*(.+)</li>', data,
                           re.I | re.M)
         if match:
             address = match.group(1)
         else:
             address = ''
         #
         match = re.search(r'<label>Phone:</label>\s*(.+)</li>', data,
                           re.I | re.M)
         if match:
             phone = match.group(1)
         else:
             phone = ''
         #
         match = re.search(r'<label>Email:</label>\s*(.+)</li>', data,
                           re.I | re.M)
         if match:
             email = match.group(1)
         else:
             email = ''
         #
         match = re.search(r'<label>Website:</label>\s*<a href="(.+)" ',
                           data, re.I | re.M)
         if match:
             website = match.group(1)
         else:
             website = ''
         title = FmtSQLCharater(title)
         companyname = FmtSQLCharater(companyname)
         location = FmtSQLCharater(location)
         address = FmtSQLCharater(address)
         #
         job = JobsDB_Job()
         job['SiteID'] = self.site_id
         match = re.search(r'\.id(.+)\?', response.url, re.I | re.M)
         if match:
             job['LinkID'] = str(int(match.group(1)))
         job['JobTitle'] = title
         job['Company'] = companyname
         job['JobName'] = response.meta['sector']
         job['JobDesc'] = FmtSQLCharater(jobdesc)
         job['Salary'] = salary
         if jobtype.find('Full time') >= 0:
             job['JobType'] = 1
         else:
             job['JobType'] = 0
         job['SrcUrl'] = response.url
         job['Number'] = 'one person'
         # Normalise the post date
         if postdate == '':
             postdate = datetime.today()
         else:
             postdate = datetime.strptime(postdate, '%d %b %y')
         job['PublishTime'] = postdate
         job['RefreshTime'] = postdate
         job['CityName'] = location
         job['WorkArea'] = job['CityName']
         job['JobAddress'] = address
         job['Mobile'] = phone
         #
         company = JobsDB_Company()
         company['WebSiteID'] = self.site_id
         company['CompanyName'] = companyname
         company['CompanyAddress'] = address
         company['WebSite'] = website
         company['CompanyLogoUrl'] = logourl
         company['AreaName'] = job['CityName']
         company['Mobile'] = phone
         #
         yield company
         yield job
     else:
         log.msg(u'Failed to parse the job detail response. url=%s' % response.url, level=log.INFO)
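The same '<label>Field:</label> ... </li>' regex is repeated above for Contact, Address, Phone and Email (Website uses a slightly different href-capturing pattern). As an illustration only, with the markup assumptions taken straight from those regexes, the shared pattern can be factored into one small helper:

import re

def labelled_value(page_html, label):
    # Return the text that follows '<label>Label:</label>' up to the closing </li>, or ''.
    match = re.search(r'<label>%s:</label>\s*(.+)</li>' % re.escape(label),
                      page_html, re.I | re.M)
    return match.group(1) if match else ''

# Usage mirroring parse_info:
# contact = labelled_value(data, 'Contact')
# address = labelled_value(data, 'Address')
# phone = labelled_value(data, 'Phone')
# email = labelled_value(data, 'Email')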