import mysql.connector

# `Company` and `db` (the SQLAlchemy session) are assumed to be imported from the
# application package; they are not defined in this module.


def migrate_company(self):
    """Copy every row of the legacy MySQL `company` table into the new ORM model.

    Defined as a method: the connection settings (`self.host`, `self.username`,
    `self.password`, `self.database`) are read from the owning object.
    """
    try:
        cnx = mysql.connector.connect(
            host=self.host,
            user=self.username,
            password=self.password,
            database=self.database,
        )
        cursor = cnx.cursor()
        query = "SELECT * FROM company"
        cursor.execute(query)
        for (id, name, city, timezone, alarm_email, address, tel, logo,
             manager, extra, creator, createdat) in cursor:
            u = Company()
            u.id = id
            u.name = name
            u.city = city
            u.timezone = timezone
            u.alarm_email = alarm_email
            u.address = address
            u.telephone = tel
            u.logo = logo
            # Legacy rows use 0 / -1 for "no manager"; map both to the default user (id 10).
            if manager in [0, -1]:
                manager = 10
            u.manager = manager
            u.extra = extra
            # The legacy `creator` column is ignored; created_by is fixed to the default user.
            u.created_by = 10
            u.created_on = createdat
            db.session.add(u)
        db.session.commit()
        cursor.close()
        cnx.close()
        return "done"
    except Exception as e:
        return str(e), 500
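# --- Hypothetical sketch (not part of the original source) ---
# migrate_company() assumes a SQLAlchemy `Company` model and a `db` session that are
# not shown in this section. The sketch below illustrates what such a model could look
# like, using only the attribute names the migration assigns; the column types and
# lengths are assumptions.
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Company(db.Model):
    __tablename__ = 'company'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255))
    city = db.Column(db.String(64))
    timezone = db.Column(db.String(64))
    alarm_email = db.Column(db.String(255))
    address = db.Column(db.String(255))
    telephone = db.Column(db.String(32))
    logo = db.Column(db.String(255))
    manager = db.Column(db.Integer)      # user id; 10 is used as the fallback
    extra = db.Column(db.Text)
    created_by = db.Column(db.Integer)
    created_on = db.Column(db.DateTime)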
import json

import requests

# getCookies, get_detail_info, set_company_type, set_company_info and Company are
# assumed to be defined elsewhere in the crawler package.


def get_company_info(type, compType, pageNo, pageSize):
    # current_comp_info_file = open('current_comp_info_file.txt', 'r+')
    # # Read the most recently crawled link
    # current_comp_info_href = current_comp_info_file.readline()
    response = requests.get(
        url="http://jxsggzy.cn/jxggzy/services/JyxxWebservice/getTradeList?response=application/json&pageIndex="
            + pageNo + "&pageSize=" + pageSize + "&&dsname=ztb_data&bname=&qytype="
            + type + "&itemvalue=" + compType,
        timeout=60000)  # note: requests timeouts are given in seconds
    # The returned content is a JSON string
    text = response.text
    textJson = json.loads(text)
    returnJson = json.loads(textJson['return'])
    # Company list
    tableJson = returnJson['Table']
    # latest_flag = False
    companys = []
    i = 0
    for table in tableJson:
        company = Company()
        alink = table['alink']
        city = table['szdq']
        company.city = city.replace('·', '/')
        company.compName = table['qymc']
        # URL for the detail information
        detailUrl = ('http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/dailiinfo/jxpDaiLiInfoDetailForWebAction.action?cmd=page_Load&DanWeiType='
                     + compType + '&isCommondto=true&DanWeiGuid=' + alink)
        # URL of the detail page
        pageUrl = ('http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/FrameAll?DanWeiType='
                   + compType + '&DanWeiGuid=' + alink)
        # URL for the personnel information
        personnelUrl = ('http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/pminfo/jxpJtgcSgPmTempForWebListAction.action?cmd=page_Load&DanWeiType='
                        + compType + '&DanWeiGuid=' + alink + '&isCommondto=true')
        # If the most recently crawled link differs from the one just fetched,
        # the site has published new data.
        # if current_comp_info_href != detailUrl:
        #     if not latest_flag:
        #         # Clear the file's previous contents
        #         current_comp_info_file.seek(0)
        #         current_comp_info_file.truncate()
        #         # Record the new link in the file
        #         current_comp_info_file.write(detailUrl)
        #         latest_flag = True
        # else:
        #     # No new data on the site, so stop crawling
        #     break
        print(i)
        print(pageUrl)
        company.sourceUrl = pageUrl
        requestsCookies = getCookies(pageUrl)
        scrfcokie = requestsCookies['_CSRFCOOKIE']
        controls = get_detail_info(requestsCookies, scrfcokie, detailUrl, compType, alink)
        company = set_company_type(type, compType, company, requestsCookies, scrfcokie,
                                   personnelUrl, alink)
        company = set_company_info(controls, company)
        # Option 1
        # detailResponse = requests.get(pageUrl, timeout=60000)
        # # Match the response's detected encoding so the text is decoded correctly
        # detailResponse.encoding = detailResponse.apparent_encoding
        # # html5lib is used for parsing here; lxml is usually the better parser
        # detailSoup = BeautifulSoup(detailResponse.text, 'html5lib')
        # company.contentDetail = str(detailSoup)
        # Option 2
        # company.contentDetail = getHtmlText(pageUrl)
        # Close the file
        # current_comp_info_file.close()
        companys.append(company.__dict__)
        i = i + 1
    return companys
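# --- Hypothetical usage sketch (not part of the original source) ---
# get_company_info() takes every argument as a string because the values are
# concatenated straight into the query URL. The loop below shows one way to page
# through the listing and dump the collected dicts to a file; the argument values
# ("1" for qytype, "101" for itemvalue, a 20-row page size) are made-up examples.
if __name__ == '__main__':
    all_companys = []
    for page in range(1, 4):                         # pages 1..3
        rows = get_company_info("1", "101", str(page), "20")
        if not rows:                                 # stop when a page comes back empty
            break
        all_companys.extend(rows)

    with open('companys.json', 'w', encoding='utf-8') as f:
        json.dump(all_companys, f, ensure_ascii=False, indent=2)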