示例#1
0
def parseWebsite():
    """Parse stored TempLz pages and import the company/website they describe.

    Walks every TempLz row with ``id > 18879`` in ascending id order, one
    page of 5000 rows at a time.  For each row whose ``lzpage`` is not None
    the content is run through ``parserCompanyAndWeb`` and the resulting
    'company' and 'web' entries are turned into objects with ``buildCompany``
    and ``buildWebsite``.  When both objects are built, the website is linked
    to the matching Company row (looked up by ``coname``; the company is
    imported first when no such row exists) and imported with ``impWebsite``.

    Every visited row id is appended to ``../logs/parseweb.log`` and printed,
    whether or not the row produced an import.
    """
    pagesize = 5000  # rows fetched per page

    # Count the same filtered set that is iterated below; counting the whole
    # table would schedule extra pages that can only come back empty.
    count = TempLz.select().where(TempLz.id > 18879).count()
    pagecount = int(math.ceil(float(count) / float(pagesize)))

    # The context manager guarantees the log is flushed and closed even if
    # parsing or importing raises part-way through.
    with open('../logs/parseweb.log', 'a') as log:
        for i in range(pagecount):
            # paginate() is 1-based, hence i + 1.
            datas = (TempLz.select()
                     .where(TempLz.id > 18879)
                     .order_by(TempLz.id)
                     .paginate(i + 1, pagesize))
            for d in datas:
                data = d.lzpage
                if data is not None:
                    parseData = parserCompanyAndWeb(data)
                    com = buildCompany(parseData['company'])
                    web = buildWebsite(parseData['web'])
                    if com is not None and web is not None:
                        c = Company.getOne(Company.coname == com.coname)
                        if c is None:
                            # Unknown company: import it, then re-read the
                            # stored row so the website can reference it.
                            impCompanyInfo(com)
                            c = Company.getOne(Company.regID == com.regID)
                        web.regID = c
                        impWebsite(web)
                # Record progress for every row, imported or not.
                log.write(str(d.id) + "\n")
                print(d.id)
示例#2
0
def fetchLzPage(isLzUrl, lzPath, shortUrl, subTask):
    """Download the page at ``isLzUrl`` to ``lzPath`` and process its content.

    Args:
        isLzUrl: Full URL of the page to fetch; also stored as the task
            remark when the download leaves no file behind.
        lzPath: Local path the page is downloaded to and then read from.
        shortUrl: Passed through unchanged to ``judgeLzResult``.
        subTask: Task record; its ``id`` selects the TaskInfo row to update
            on failure, and it is passed through to ``judgeLzResult``.

    If no file exists at ``lzPath`` after the download attempt, the matching
    TaskInfo row is updated to ``state='5'`` with ``remark=isLzUrl`` and the
    function returns.  Otherwise the file is parsed with
    ``parserCompanyAndWeb`` and the built company/website objects are handed
    to ``judgeLzResult``.  Errors raised while reading or processing the file
    are printed and swallowed (best-effort, as in the original).
    """
    # Fetch using the full URL of the page.
    downloadByPath(isLzUrl, lzPath)

    if not os.path.exists(lzPath):
        print("lzpath: %s" % (lzPath,))
        # Nothing was saved: flag the task and keep the URL for diagnosis.
        # NOTE(review): meaning of state '5' is defined elsewhere - confirm.
        q = TaskInfo.update(state='5', remark=isLzUrl).where(TaskInfo.id == subTask.id)
        q.execute()
        return

    try:
        # Read inside a context manager so the handle is always released.
        with open(lzPath, 'r') as f:
            content = f.read()
        parseData = parserCompanyAndWeb(content)
        com = buildCompany(parseData['company'])
        tempBuildWeb = buildWebsite(parseData['web'])
        judgeLzResult(com, tempBuildWeb, shortUrl, subTask)
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(e)
示例#3
0
    # print TempLz.select().where(TempLz.lzpage.is_null()).count()==0

    # lzpage = TempLz.getOne(TempLz.id==1).lzpage
    # c = TempLz.select(fn.Count(fn.Distinct(TempLz.name))).scalar()
    #TempLz.select().where(TempLz.lzpage.is_null(False)).order_by(TempLz.id).paginate(1, 10)
    #parseWebsite()
    # lzpage = TempLz.select().where(TempLz.name=="上海昊锌科技有限公司")
    # for l in lzpage:
    #     page = l.lzpage
    #     parseData = parserCompanyAndWeb(page)
    #     w =  buildWebsite(parseData['web'])
    #     print parseData['web']
    # str = u'\u57df\u540d:http://www.haoxinkj.com/'
    #
    # arrs = str.split(':')

    # str = u'\u57df\u540d:http://www.haoxinkj.com/'
    # i = str.find(":")+1
    #
    # print str[i:]
    # Ad-hoc check against a single stored row: load the TempLz record with
    # id 5512, parse its lzpage content, and build a website object from the
    # parsed 'web' entry.
    templz = TempLz.getOne(TempLz.id==5512)
    lzpage = templz.lzpage
    parseData = parserCompanyAndWeb(lzpage)
    # NOTE(review): presumably parseData is a dict, so a missing 'web' key
    # gives None here rather than raising - confirm against the parser.
    web = parseData.get('web')
    tw = buildWebsite(web)