# -*- coding: utf-8 -*-
import re
import time

import requests
from bs4 import BeautifulSoup
from w3lib.html import remove_tags  # assumed source of remove_tags

# ProjectModel is the project's own persistence layer; its import path is
# not shown in this excerpt.


def parse_info(self, detail_url, guid, pro_name, pro_date, registration_num):
    ar_name = ''
    com_name = ''
    duration = ''
    money = str(0)
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Ajax-method': 'GetZBJGGSHXRNewByGuid',
        'Connection': 'keep-alive',
        # Content-Length is left to requests; a hard-coded value would be
        # wrong whenever the guid payload length differs.
        'Content-Type': 'text/plain; charset=UTF-8',
        'Cookie': 'ASP.NET_SessionId=xdoj1k24jseqj1l4dnutjdf2',
        'Host': 'www.kmggzy.com',
        'Origin': 'https://www.kmggzy.com',
        'Referer': 'https://www.kmggzy.com/Jyweb/ZBJGGSNewView2.aspx?isBG=0&guid=71be0490-2e7c-4ce1-8663-4a71cf95112c&subType2=11&subType=1&type=%E4%BA%A4%E6%98%93%E4%BF%A1%E6%81%AF&area=1&zbtype=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36',
    }
    # the Ajax service expects the JSON-ish guid array followed by a
    # millisecond timestamp, concatenated into one text/plain body
    ts_ms = int(time.time() * 1000)
    p_data = '["' + guid + '"]' + str(ts_ms)
    url = ('https://www.kmggzy.com/TrueLoreAjax/'
           'TrueLore.Web.WebUI.WebAjaxService,TrueLore.Web.WebUI.ashx')
    original_url = detail_url
    print(detail_url)
    res = requests.post(url, data=p_data, headers=headers)
    # on this template the 1:"..." field holds the winning company and
    # 3:"..." the project manager; the second occurrence ([1]) is the one
    # recorded, so guard against short responses before indexing
    com_matches = re.findall(',1:"(.*?)",', res.content)
    ar_matches = re.findall(',3:"(.*?)",', res.content)
    if len(com_matches) > 1 and len(ar_matches) > 1:
        com_name = com_matches[1]
        ar_name = ar_matches[1]
        ProjectModel().insert_project(
            self.name, com_name, ar_name, pro_name, pro_date, money,
            self.source, original_url, duration, registration_num,
            self.ch_area, self.ch_city, self.ch_region, self.en_area,
            self.en_city, self.en_region, self.crawler_id, self.spider_id,
            self.headers['User-Agent'])
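
# Offline sketch of the payload parsing above. The Ajax endpoint returns a
# JS-object-like string rather than JSON; the sample below is invented to
# match the regexes, and the exact field layout is an assumption. Index [1]
# reproduces the scraper's choice of the second occurrence of each field.
def _demo_ajax_payload_parsing():
    sample = ('{0:"x",1:"中标单位",2:"x",3:"项目经理",4:"x"},'
              '{0:"x",1:"某某建筑工程有限公司",2:"x",3:"张三",4:"x"},')
    com_name = re.findall(',1:"(.*?)",', sample)[1]
    ar_name = re.findall(',3:"(.*?)",', sample)[1]
    assert (com_name, ar_name) == ('某某建筑工程有限公司', '张三')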

def parse_info(self, response):
    money = ''
    tr_list = []
    registration_num = ''
    pro_name = ''
    com_name = ''
    ar_name = ''
    duration = ''
    original_url = response.url
    pro_date = response.meta['pro_date']
    ch_region = response.meta['ch_region']
    en_region = response.meta['en_region']
    soup = BeautifulSoup(response.body, 'lxml')
    if soup.find('div', class_='con'):
        tr_list = soup.find('div', class_='con').find('table').find_all('tr')
    for tr in tr_list:
        tr_content = tr.text
        if '备注说明:' not in tr_content:  # skip the remarks row
            # bid section number
            if '标段编号:' in tr_content:
                registration_num = tr_content.split('标段编号:')[1].strip()
            # project name
            if '标段名称:' in tr_content:
                pro_name = tr_content.split('标段名称:')[1].strip()
            # winning company
            if '中标人:' in tr_content:
                com_name = tr_content.split('中标人:')[1].strip()
            # project manager
            if '项目经理:' in tr_content:
                ar_name = tr_content.split(':')[1].strip()
            # winning price
            if '中标价:' in tr_content:
                money = tr_content.split('中标价:')[1].strip()
                if '万元' in money:
                    money = money.split('万元')[0]
    if ar_name == '无' or ar_name == '/':
        ar_name = ''
    ProjectModel().insert_project(
        self.name, com_name, ar_name, pro_name, pro_date, money, self.source,
        original_url, duration, registration_num, self.ch_area, self.ch_city,
        ch_region, self.en_area, self.en_city, en_region, self.crawler_id,
        self.spider_id, self.headers['User-agent'])
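
# Minimal check of the row-splitting rule used above, on an invented row
# (full-width colon, "万元" unit suffix), documenting the expected shapes:
def _demo_row_splitting():
    row = '中标价:1234.56万元'
    money = row.split('中标价:')[1].strip()
    if '万元' in money:
        money = money.split('万元')[0]
    assert money == '1234.56'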

def parse_info(self, response):
    money = ''
    tr_list = []
    registration_num = ''
    pro_name = ''
    com_name = ''
    ar_name = ''
    duration = ''
    original_url = response.url
    pro_date = response.meta['pro_date']
    ch_region = response.meta['ch_region']
    en_region = response.meta['en_region']
    soup = BeautifulSoup(response.body, 'lxml')
    if soup.find('div', class_='con'):
        tr_list = soup.find('div', class_='con').find('table').find_all('tr')
        for tr in tr_list:
            tr_content = tr.text
            if '备注说明:' not in tr_content:  # skip the remarks row
                # bid section number
                if '标段编号:' in tr_content:
                    registration_num = tr_content.split('标段编号:')[1].strip()
                # project name
                elif '标段名称:' in tr_content:
                    pro_name = tr_content.split('标段名称:')[1].strip()
                    if '招标文件' in pro_name:
                        pro_name = pro_name.split('招标文件')[0].strip()
                # winning company
                elif '中标人:' in tr_content:
                    com_name = tr_content.split('中标人:')[1].strip()
                # project manager
                elif '项目经理:' in tr_content:
                    ar_name = tr_content.split(':')[1].strip()
                # winning price
                elif '中标价:' in tr_content:
                    money = tr_content.split('中标价:')[1].strip()
                    if '万元' in money:
                        money = money.split('万元')[0]
        if com_name != '':
            # print pro_name, com_name, ar_name, response.url
            ProjectModel().insert_project(
                self.name, com_name, ar_name, pro_name, pro_date, money,
                self.source, original_url, duration, registration_num,
                self.ch_area, self.ch_city, ch_region, self.en_area,
                self.en_city, en_region, self.crawler_id, self.spider_id,
                self.headers['User-agent'])
    elif soup.find('div', class_='news-layout'):
        tr_list = soup.find(
            'div', class_='news-layout').find('table').find_all('tr')
        for tr in tr_list:
            tc = tr.text
            # project name
            if '项目名称' in tc or '工程名称' in tc:
                pro_name = tr.text.split('名称')[1].strip()
                if ':' in pro_name:
                    pro_name = pro_name.split(':')[1].strip()
            # winning company
            elif '中标候选人' in tc:
                com_name = tr.text.split('中标候选人')[1].strip()
                if '无' in com_name:
                    com_name = ''
                if '得分' in com_name:
                    com_name = com_name.split('得分')[0].strip()
                if ':' in com_name:
                    com_name = com_name.split(':')[1].strip()
                if ';' in com_name:
                    com_name = com_name.split(';')[0].strip()
            # project manager
            elif '项目经理:' in tc:
                ar_name = tr.text.split(':')[1].split('项目经理编号')[0].strip()
        if com_name != '':
            # print pro_name, com_name, ar_name, response.url
            ProjectModel().insert_project(
                self.name, com_name, ar_name, pro_name, pro_date, money,
                self.source, original_url, duration, registration_num,
                self.ch_area, self.ch_city, ch_region, self.en_area,
                self.en_city, en_region, self.crawler_id, self.spider_id,
                self.headers['User-agent'])
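
# Hypothetical cell text exercising the candidate-name clean-up chain in the
# news-layout branch above (score suffix, label colon, semicolon separator):
def _demo_candidate_trimming():
    cell = '中标候选人:某某路桥集团有限公司;第二名得分90'
    com_name = cell.split('中标候选人')[1].strip()
    if '得分' in com_name:
        com_name = com_name.split('得分')[0].strip()
    if ':' in com_name:
        com_name = com_name.split(':')[1].strip()
    if ';' in com_name:
        com_name = com_name.split(';')[0].strip()
    assert com_name == '某某路桥集团有限公司'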

def parse_info(self, response):
    com_name = ''
    ar_name = ''
    money = ''
    registration_num = ''
    original_url = response.url
    pro_date = response.meta['pro_date']
    pro_name = response.meta['pro_name']
    web_data = response.body

    def _clean(raw, drop_slash=True):
        # remove_tags plus the whitespace/punctuation normalization that the
        # original repeated verbatim in every branch
        text = remove_tags(raw).strip()
        for ch in ("\n", "\r", "\t", " "):
            text = text.replace(ch, "")
        if drop_slash:
            text = text.replace("/", "")
        return text.replace(":", "")

    def _tidy_company(name, cut_company=True):
        # truncate at the first organisation suffix, strip a stray quote,
        # and reject obvious non-names
        suffixes = (["公司"] if cut_company else []) + ["院", "局", "中心"]
        for suffix in suffixes:
            if suffix in name:
                name = name.split(suffix)[0] + suffix
        if '"' in name:
            name = name.split('"')[-1]
        if "中标" in name or "招标" in name:
            name = ""
        return name

    def _try_patterns(pairs):
        # first matching pattern wins; the bool says whether _tidy_company
        # should also truncate at "公司"
        for pattern, cut_company in pairs:
            matched = re.match(pattern, web_data, re.DOTALL)
            if matched:
                return _tidy_company(_clean(matched.group(1)), cut_company)
        return None

    # bid section number
    m = re.match(".*?标段编号\:(.*?)标段", web_data, re.DOTALL)
    registration_num = _clean(m.group(1), drop_slash=False) if m else ''

    # winning company: labelled patterns first, then two table-layout xpath
    # fallbacks, then looser unlabelled patterns, in the original order
    primary_patterns = [
        (".*?中标人\:(.*?公司)", False),
        (".*?第一中标侯选人\:(.*?公司)", True),
        (".*?中标单位\:(.*?公司)", True),
        (".*?中标候选人\:(.*?公司)", True),
    ]
    secondary_patterns = [
        (".*?第一中标候选人为\:(.*?公司)", False),
        (".*?第一中标候选人(.*?公司)", False),
        (".*?中标单位(.*?公司)", True),
        (".*?中标人名称(.*?公司)", True),
        (".*?拟中标人(.*?公司)", True),
        (".*?第一中标候选单位\:(.*?公司)", True),
    ]
    td2 = "//div[@class='detail_contect']/table/tr[8]/td/table/tr[2]/td[2]"
    td1 = "//div[@class='detail_contect']/table/tr[8]/td/table/tr[2]/td[1]"
    mso = ("//table[@class='MsoNormalTable']/tbody/tr[2]/td[3]"
           "/p[@class='MsoNormal']/span[1]")

    com_name = _try_patterns(primary_patterns)
    if com_name is None:
        if response.xpath(td2).extract():
            com_name = remove_tags(response.xpath(td2).extract_first())
            if "公司" in com_name:
                pass  # td[2] already holds a company name
            elif response.xpath(td1).extract():
                com_name = remove_tags(response.xpath(td1).extract_first())
            else:
                com_name = ""
        elif response.xpath(mso).extract():
            com_name = remove_tags(response.xpath(mso).extract_first())
        else:
            com_name = _try_patterns(secondary_patterns) or ''

    # winning price, normalized to 万元 (units of 10,000 yuan)
    m = re.match(".*?中标价\:(.*?)元", web_data, re.DOTALL)
    if m:
        try:
            money = _clean(m.group(1), drop_slash=False)
            if "万" in money:
                money = float(money.replace("万", ""))
            else:
                money = float(money) / 10000
        except Exception:
            money = 0
    else:
        money = 0

    # construction period, in days
    m = re.match(".*?中标工期\:(.*?)天", web_data, re.DOTALL)
    if m:
        duration = _clean(m.group(1), drop_slash=False)
        if "日" in duration:
            duration = duration.split("日")[0]
        if len(duration) > 5:
            duration = str(0)
    else:
        duration = str(0)

    # constructor / project manager (both labels get identical clean-up)
    m = (re.match(".*?项目经理\:(.*?)备注", web_data, re.DOTALL) or
         re.match(".*?建造师名称(.*?)等", web_data, re.DOTALL))
    if m:
        ar_name = _clean(m.group(1), drop_slash=False)
        if "招标" in ar_name:
            ar_name = ar_name.split("招标")[0]
        if "," in ar_name:
            ar_name = ar_name.split(",")[0]
        if len(ar_name) > 3:
            ar_name = ""
    else:
        ar_name = ""

    money = str(money)
    ProjectModel().insert_project(
        self.name, com_name, ar_name, pro_name, pro_date, money, self.source,
        original_url, duration, registration_num, self.ch_area, self.ch_city,
        self.ch_region, self.en_area, self.en_city, self.en_region,
        self.crawler_id, self.spider_id, self.headers['User-agent'])