def impData():
    """Import company data page by page, then print one record for inspection.

    Repeatedly asks parsePage() for a batch; each item in the batch is run
    through parser(), turned into a company object with buildCompany() and
    stored with impCompanyInfo(). The loop stops as soon as parsePage()
    returns nothing (None or an empty sequence).

    Afterwards the TempLz row(s) whose lzID equals the hard-coded value
    below are parsed and every parsed item is printed.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source line; confirm that the trailing lookup really belongs inside this
    function.
    """
    while True:
        # Fetch exactly one batch per iteration. Calling parsePage() both in
        # the loop condition and in the body would discard the batch that was
        # tested and process a different one.
        companyData = parsePage()
        # None and [] both mean "nothing to import"; iterating None would
        # raise TypeError.
        if not companyData:
            break
        for data in companyData:
            # Parse the current page content.
            result = parser(data)
            com = buildCompany(result)
            impCompanyInfo(com)

    # Dump the parsed content of one specific record (hard-coded lzID).
    rows = TempLz.select().where(TempLz.lzID == '20120323163520958')
    for d in rows:
        for r in parser(d.lzpage):
            print(r)
def parsePage():
    """Import every TempLz row with a non-empty lzpage that is not yet logged.

    The IDs of rows already handled are kept, one per line, in
    ../logs/impdbdata.log. On each pass the log is read, rows whose lzID is
    not listed there are selected, and each one is parsed with parser(),
    converted with buildCompany(), stored with impCompanyInfo() and then
    appended to the log. Passes repeat until one imports nothing.

    Returns:
        An empty list once a pass finds nothing left to import (the value
        impData()'s loop compares against), or None if an exception
        interrupted the run (the exception is printed, not re-raised).
    """
    logPath = '../logs/impdbdata.log'
    try:
        # Keep going until all pages have been handled.
        while True:
            # Read the IDs recorded so far. Lines are stripped because the
            # trailing newline would otherwise make every ID differ from the
            # lzID values stored in the database, so nothing would ever be
            # excluded.
            with open(logPath, 'r') as f:
                doneIds = [line.strip() for line in f if line.strip()]

            condition = (TempLz.lzpage != '')
            if doneIds:
                condition = condition & (TempLz.lzID.not_in(doneIds))
            lzs = TempLz.select().where(condition)
            if doneIds:
                print('result count: %s' % lzs.count())

            imported = 0
            # Open the append handle once per pass and always close it.
            with open(logPath, 'a') as log:
                for lz in lzs:
                    result = parser(lz.lzpage)
                    # Build the company information.
                    com = buildCompany(result)
                    impCompanyInfo(com)
                    # Record the ID immediately so an interrupted run can
                    # resume without re-importing this row.
                    log.write(lz.lzID + '\n')
                    log.flush()
                    imported += 1

            # Nothing was imported in this pass: every page is done.
            if not imported:
                return []
    except Exception as e:
        print(e)