Example #1
import mysql.connector

# Method of a class that holds the source database's credentials as
# self.host, self.username, self.password and self.database; Company and
# db (the SQLAlchemy session) come from the application's models.
def migrate_company(self):
    try:
        cnx = mysql.connector.connect(
            host=self.host,
            user=self.username,
            password=self.password,
            database=self.database,
        )
        cursor = cnx.cursor()
        cursor.execute("SELECT * FROM company")

        # Copy every row of the legacy `company` table into the new model.
        for (id, name, city, timezone, alarm_email, address, tel, logo,
             manager, extra, creator, createdat) in cursor:
            u = Company()
            u.id = id
            u.name = name
            u.city = city
            u.timezone = timezone
            u.alarm_email = alarm_email
            u.address = address
            u.telephone = tel
            u.logo = logo
            # Legacy rows use 0 or -1 for "no manager"; map both to the
            # default manager id 10.
            if manager in [0, -1]:
                manager = 10
            u.manager = manager
            u.extra = extra
            u.created_by = 10
            u.created_on = createdat
            db.session.add(u)
            # Commit per row; one commit after the loop would be faster.
            db.session.commit()
        cursor.close()
        cnx.close()
        return "done"
    except Exception as e:
        return str(e), 500
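
Since migrate_company() reads its connection settings from self, it needs a
small wrapper class to run. A minimal sketch, assuming hypothetical names
(CompanyMigrator and its constructor arguments are not from the original):

class CompanyMigrator:
    # Hypothetical wrapper; only the attribute names host, username,
    # password and database are implied by migrate_company() above.
    def __init__(self, host, username, password, database):
        self.host = host
        self.username = username
        self.password = password
        self.database = database

    # Reuse the module-level function above as a bound method.
    migrate_company = migrate_company

migrator = CompanyMigrator("localhost", "root", "secret", "legacy_db")
print(migrator.migrate_company())  # "done" on success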
Example #2

import json
import requests

# Company, getCookies, get_detail_info, set_company_type and
# set_company_info are defined elsewhere in the crawler.
def get_company_info(type, compType, pageNo, pageSize):
    # current_comp_info_file = open('current_comp_info_file.txt', 'r+')
    # Read the most recently crawled link.
    # current_comp_info_href = current_comp_info_file.readline()
    response = requests.get(
        url="http://jxsggzy.cn/jxggzy/services/JyxxWebservice/getTradeList"
        "?response=application/json&pageIndex=" + pageNo
        + "&pageSize=" + pageSize
        + "&&dsname=ztb_data&bname=&qytype=" + type
        + "&itemvalue=" + compType,
        timeout=60)  # requests timeouts are in seconds, not milliseconds

    # The response body is a JSON string whose 'return' field itself
    # contains JSON, so it is decoded twice.
    text = response.text
    textJson = json.loads(text)
    returnJson = json.loads(textJson['return'])
    # List of companies.
    tableJson = returnJson['Table']

    # latest_flag = False
    companys = []
    for i, table in enumerate(tableJson):
        company = Company()
        alink = table['alink']
        city = table['szdq']
        company.city = city.replace('·', '/')
        company.compName = table['qymc']
        # URL of the detail record.
        detailUrl = 'http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/dailiinfo/jxpDaiLiInfoDetailForWebAction.action?cmd=page_Load&DanWeiType=' + compType + '&isCommondto=true&DanWeiGuid=' + alink
        # URL of the detail page.
        pageUrl = "http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/FrameAll?DanWeiType=" + compType + "&DanWeiGuid=" + alink
        # URL of the personnel information.
        personnelUrl = 'http://ggzyjy.jxsggzy.cn/hygs/huiyuaninfo/pages/pminfo/jxpJtgcSgPmTempForWebListAction.action?cmd=page_Load&DanWeiType=' + compType + '&DanWeiGuid=' + alink + '&isCommondto=true'
        # If the most recently crawled link differs from the link fetched
        # now, the site has published new data.
        # if current_comp_info_href != detailUrl:
        #     if not latest_flag:
        #         # Clear the file's previous contents.
        #         current_comp_info_file.seek(0)
        #         current_comp_info_file.truncate()
        #         # Record the new link in the file.
        #         current_comp_info_file.write(detailUrl)
        #         latest_flag = True
        # else:
        #     # The site has no new data, so stop crawling.
        #     break

        # Progress logging.
        print(i)
        print(pageUrl)
        company.sourceUrl = pageUrl

        # Fetch the session cookies (including the CSRF token) needed by
        # the detail pages, then fill in the company's type and details.
        requestsCookies = getCookies(pageUrl)
        scrfcokie = requestsCookies['_CSRFCOOKIE']
        controls = get_detail_info(requestsCookies, scrfcokie, detailUrl,
                                   compType, alink)
        company = set_company_type(type, compType, company, requestsCookies,
                                   scrfcokie, personnelUrl, alink)
        company = set_company_info(controls, company)
        # Option 1:
        # detailResponse = requests.get(pageUrl, timeout=60)
        # # Match the response encoding to the page's detected encoding so
        # # the text is decoded correctly.
        # detailResponse.encoding = detailResponse.apparent_encoding
        # # html5lib parsing is used here; lxml is usually a better choice.
        # detailSoup = BeautifulSoup(detailResponse.text, 'html5lib')
        # company.contentDetail = str(detailSoup)
        # Option 2:
        # company.contentDetail = getHtmlText(pageUrl)
        # Close the file.
        # current_comp_info_file.close()
        companys.append(company.__dict__)
    return companys
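
A usage sketch: all four parameters are concatenated directly into the query
string, so they must be passed as strings. The values below are illustrative
assumptions, not values taken from the original.

if __name__ == "__main__":
    companies = get_company_info(type="1", compType="96", pageNo="1",
                                 pageSize="20")
    print(len(companies), "companies scraped")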