def get_huangye(self, html):
    """Scrape intro, header text, title and keywords from a parsed page.

    Three groups of fields are extracted independently and written into
    ``self.resultItem``. A failure inside one group is reported on stdout
    and does not prevent the remaining groups from being attempted.

    Args:
        html: Parsed document exposing an lxml-style ``xpath`` method.

    Returns:
        Always ``2``.
    """
    status = 2

    try:
        company_intro = html.xpath(
            '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/div[2]/span'
        )[0].text
        # A missing or very short hit triggers the looser fallback path,
        # which yields a list of text nodes rather than a single string.
        if company_intro is None or len(str(company_intro)) < 5:
            company_intro = html.xpath(
                '//*[@id="site_content"]/div[1]/div/div/div/div[2]/div[2]/span/text()'
            )
        self.resultItem['company_intro'] = process_str(company_intro)
    except Exception as e:
        print(str(self.company) + str(e) + ",company_intro")

    try:
        company_hidder = html.xpath(
            '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/p'
        )[0].text
        # Compare the text *length* with 5; comparing the string itself
        # with an int never selects the fallback (and fails on Python 3).
        if company_hidder is None or len(str(company_hidder)) < 5:
            company_hidder = html.xpath(
                '//*[@id="site_content"]/div[1]/div/div/div/div[2]/p'
            )[0].text
        self.resultItem['company_hidder'] = process_str(company_hidder)
        self.resultItem['title'] = process_str(
            html.xpath('/html/head/title')[0].text)
    except Exception as e:
        # Bind the exception here: a bare ``except`` leaves ``e`` undefined.
        print(str(self.company) + str(e) + ",company_hidder")

    try:
        self.resultItem['classifications'] = process_str(
            html.xpath('//meta[@name="keywords"]/@content'))
    except Exception as e:
        print(str(self.company) + str(e) + ",classifications")

    return status
def parse_company_info(self, html1):
    """Parse a company page and populate ``self.resultItem`` with its details.

    Args:
        html1: Raw HTML markup handed to ``etree.HTML``.

    Returns:
        ``None`` when the markup cannot be parsed, otherwise ``1``.
    """
    # Company information
    try:
        html = etree.HTML(html1)
    except Exception:
        return

    try:
        self.resultItem['shop_long'] = html.xpath(
            '//a[@href="https://cxt.1688.com/"]/text()')
    except Exception as e:
        print(str(self.company) + str(e))

    # Contact person's name
    self.resultItem['contact_name'] = self.get_contact_name(html)
    # Telephone number
    self.resultItem['telphone'] = self.get_telphone(html)
    # Mobile phone number
    self.resultItem['phone'] = self.get_phone(html)

    # Years in business
    try:
        self.resultItem['bus_year'] = html.xpath(
            '//*[@id="site_content"]/div/div/div/div[2]/div/div[2]/div/div[1]/div/div/h1/a[2]/text()'
        )
    except Exception as e:
        print(str(self.company) + str(e))

    # Credit level
    self.resultItem['credit_level'] = process_str(
        self.get_credit_level(html))
    # Company introduction
    self.resultItem['company_intro'] = process_str(
        self.get_company_intro(html))
    # Brands the company deals in
    self.resultItem['brand'] = process_str(self.get_company_brand(html))
    # WangWang ID
    self.resultItem['wang_wang'] = process_str(self.get_wangwang(html))
    # Registered capital
    self.resultItem['registered_capital'] = process_str(
        self.get_original_money(html))

    # These two helpers are called only for their effects; whatever they
    # return is ignored here.
    self.get_trade_credit_record(html)
    self.get_buyer_service_ablity(html)
    return 1
def company_baseinfo(self, html1):
    """Extract the basic fields of the first offer card into ``self.resultItem``.

    Args:
        html1: Raw HTML markup handed to ``etree.HTML``.

    Returns:
        ``None`` when the markup cannot be parsed; ``1`` when the listed
        company name differs from ``self.company``; ``3`` when the company
        name cannot be read; otherwise the ``href`` of the company link.
    """
    try:
        html = etree.HTML(html1)
    except Exception:
        return

    url = ''

    # Company name
    try:
        company_name = html.xpath(
            '//*[@id="offer1"]/div[1]/div[2]/div[1]/a[1]')
        new_company = stringQ2B(
            company_name[0].get('title')).lower().strip().replace(
                ')', '').replace('(', '')
        # Plain inequality instead of cmp(), which only exists on Python 2.
        if new_company != self.company:
            print(self.company + ",公司名称不相等")
            return 1
        self.resultItem['company'] = process_str(
            company_name[0].get('title'))
        url = company_name[0].get("href")
        self.resultItem['url'] = url
    except Exception:
        print(str(self.company) + str('公司名称不存在'))
        return 3

    # Main business lines
    try:
        main_c = html.xpath(
            '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[1]/a/*')
        self.resultItem['main_d'] = ','.join(
            [node.text for node in main_c])
    except Exception as e:
        print(str(self.company) + str(e))

    # Company address
    try:
        self.resultItem['address'] = html.xpath(
            '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[2]/a'
        )[0].text
    except Exception as e:
        print(str(self.company) + str(e))

    # Number of employees
    try:
        self.resultItem['company_persons'] = html.xpath(
            '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[3]/a'
        )[0].get('title')
    except Exception as e:
        print(str(self.company) + str(e))

    # Trade type
    try:
        self.resultItem['model'] = html.xpath(
            '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[2]/div[1]/b'
        )[0].text
    except Exception as e:
        print(str(self.company) + str(e))

    return url
def get_yellow_page_info(self, html1):
    """Parse a yellow-page document and store its fields in ``self.resultItem``.

    Args:
        html1: Raw HTML markup handed to ``etree.HTML``.

    Returns:
        ``None`` when the markup cannot be parsed, otherwise ``2``.
    """
    try:
        html = etree.HTML(html1)
    except Exception:
        return

    status = 2

    try:
        company_intro = html.xpath(
            '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/div[2]/span'
        )[0].text
    except Exception:
        # The looser fallback path yields a list of text nodes rather than
        # a single string.
        company_intro = html.xpath(
            '//*[@id="site_content"]/div[1]/div/div/div/div[2]/div[2]/span/text()'
        )

    if company_intro is None or len(str(company_intro)) < 5:
        print('company_intro is has no scrape')
    else:
        self.resultItem['company_intro'] = company_intro

    try:
        company_hidder = html.xpath(
            '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/p'
        )[0].text
        # Compare the text *length* with 5; comparing the string itself
        # with an int never selects the fallback (and fails on Python 3).
        if company_hidder is None or len(str(company_hidder)) < 5:
            company_hidder = html.xpath(
                '//*[@id="site_content"]/div[1]/div/div/div/div[2]/p'
            )[0].text
        self.resultItem['company_hidder'] = process_str(company_hidder)
        self.get_contract_name(self.resultItem['company_hidder'])
        self.resultItem['title'] = process_str(
            html.xpath('/html/head/title')[0].text)
    except Exception as e:
        # Bind the exception here: a bare ``except`` leaves ``e`` undefined.
        self.logger.error(
            str(self.company) + str(e) + ",company_hidder")

    try:
        self.resultItem['classifications'] = process_str("".join(
            html.xpath('//meta[@name="keywords"]/@content')))
    except Exception as e:
        self.logger.error(
            str(self.company) + str(e) + ",classifications")

    return status
def get_goods_title(self, html):
    """Collect product titles from the offer list into ``self.resultItem``.

    Each kept title is passed through ``process_str``, prefixed with its
    position (``"<index>:<title>"``), and the entries are joined with
    ``"@@"`` under the ``'goods_title'`` key. Any failure is reported on
    stdout and leaves ``'goods_title'`` unset.

    Args:
        html: Parsed document exposing an lxml-style ``xpath`` method.

    Returns:
        ``None``.
    """
    try:
        titles = html.xpath(
            "//*[@class='offer-list-row']/li/div[3]/a/@title")
        entries = []
        for index, title in enumerate(titles):
            entries.append(str(index) + ":" + process_str(title))
            # Stop after the entry with index 20, i.e. keep at most 21.
            if index >= 20:
                break
        self.resultItem['goods_title'] = "@@".join(entries)
    except Exception as e:
        print(str(self.company) + str(e) + ",goods_title")
def get_offer_info(self, html, i):
    """Extract the fields of the ``i``-th offer card into a new dict.

    Every field is looked up independently; a field that cannot be read
    is reported on stdout and left out of the result.

    Args:
        html: Parsed document exposing an lxml-style ``xpath`` method.
        i: Number substituted into the ``offer%d`` element id.

    Returns:
        A dict holding whichever fields were extracted successfully.
    """
    resultItem = {}

    # Company name and link
    try:
        company_name = html.xpath(
            '//*[@id="offer%d"]/div[1]/div[2]/div[1]/a[1]' % i)
        new_company = stringQ2B(
            company_name[0].get('title')).lower().strip().replace(
                ')', '').replace('(', '')
        resultItem['company'] = new_company
        url = company_name[0].get("href")
        resultItem['url'] = url
    except Exception:
        print(str(resultItem.get('company')) + str('公司名称不存在'))

    # Main business lines
    try:
        main_c = html.xpath(
            '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[1]/div[1]/a/*' % i)
        # Iterate the nodes directly: an index loop named ``i`` would
        # overwrite the offer number that every later query depends on.
        main_d = [node.text for node in main_c]
        resultItem['main_d'] = process_str(','.join(main_d))
    except Exception:
        print(str(resultItem.get('company')) + str(':公司主营解析错误'))

    # Company address
    try:
        resultItem['address'] = process_str(
            html.xpath(
                '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[1]/div[2]/a' %
                i)[0].text)
    except Exception:
        print(str(resultItem.get('company')) + str(':公司地址解析错误'))

    # Shop name
    try:
        resultItem['shop_name'] = process_str(
            html.xpath('//*[@id="offer%d"]/div[1]/div[2]/div[2]/span' %
                       i)[0].text)
    except Exception:
        print(str(resultItem.get('company')) + str(':店铺名称解析错误'))

    # Number of employees
    try:
        resultItem['company_persons'] = process_str(
            html.xpath(
                '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[1]/div[3]/a' %
                i)[0].get('title'))
    except Exception:
        print(str(resultItem.get('company')) + str('公司人数解析错误'))

    # Trade type
    try:
        resultItem['model'] = html.xpath(
            '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[2]/div[1]/b' %
            i)[0].text
    except Exception:
        print(str(resultItem.get('company')) + str('贸易类型解析错误'))

    # Rows div[2]..div[4] each pair a <span> text (used as the key) with
    # an <a> text (used as the value); rows that are absent are skipped.
    for j in range(1, 4):
        try:
            value = html.xpath(
                '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[2]/div[%d]/a'
                % (i, j + 1))[0].text
            label = html.xpath(
                '//*[@id="offer%d"]/div[1]/div[2]/div[3]/div[2]/div[%d]/span'
                % (i, j + 1))[0].text
            resultItem[label] = value
        except Exception:
            pass

    return resultItem