예제 #1
0
def impData():
    """Import all batches produced by parsePage(), then print one debug record.

    Each item of every batch is passed through ``parser`` ->
    ``buildCompany`` -> ``impCompanyInfo``. Batches are requested until
    parsePage() returns an empty (or None) result.

    Afterwards the TempLz rows whose ``lzID`` is '20120323163520958' are
    parsed again and every element of each parse result is printed.
    """
    while True:
        # Fetch each batch exactly once. Calling parsePage() in the loop
        # condition and again in the body would discard every other batch
        # and iterate over a result that was never checked for emptiness.
        companyData = parsePage()
        if not companyData:
            break
        for data in companyData:
            # Parse the current item.
            result = parser(data)
            com = buildCompany(result)
            impCompanyInfo(com)

    # Hard-coded sample record, printed for inspection.
    rows = TempLz.select().where(TempLz.lzID == '20120323163520958')
    for d in rows:
        for r in parser(d.lzpage):
            print(r)
예제 #2
0
파일: impdb.py 프로젝트: strongant/gsorglz
def parsePage():
    """Process every unhandled TempLz row, tracking progress in a log file.

    ``../logs/impdbdata.log`` holds one already-processed ``lzID`` per line.
    Each pass reads that file, selects the TempLz rows with a non-empty
    ``lzpage`` whose ``lzID`` is not listed there, runs each row through
    ``parser`` -> ``buildCompany`` -> ``impCompanyInfo`` and appends the
    row's ``lzID`` to the log. Passes repeat until one finds no rows.

    Any exception (including a missing log file, which must exist before
    the first run) is printed and ends the run. The function always
    returns None.
    """
    log_path = '../logs/impdbdata.log'
    try:
        # Keep going until every page has been handled.
        while True:
            # Load the IDs recorded so far. The line terminator must be
            # removed, otherwise the entries ('id\n') never equal the stored
            # lzID values and nothing is excluded by not_in().
            with open(log_path, 'r') as f:
                done_ids = [
                    line.rstrip('\r\n') for line in f if line.rstrip('\r\n')
                ]

            # Rows that still have page content and were not processed yet.
            condition = (TempLz.lzpage != '')
            if done_ids:
                condition = condition & (TempLz.lzID.not_in(done_ids))
            lzs = TempLz.select().where(condition)
            if done_ids:
                print('result count: %s' % lzs.count())

            processed = 0
            with open(log_path, 'a') as log:
                for lz in lzs:
                    result = parser(lz.lzpage)
                    # Build the company record and import it.
                    com = buildCompany(result)
                    impCompanyInfo(com)
                    # Flush after every row so an interrupted run does not
                    # lose track of what was already imported.
                    log.write(lz.lzID + '\n')
                    log.flush()
                    processed += 1

            # Nothing left to do: stop instead of spinning forever.
            if processed == 0:
                break
    except Exception as e:
        # Best-effort: report the failure and stop.
        print(e)