def _importParsedPage(page):
    """Parse one stored license page and import its company and website."""
    parseData = parserCompanyAndWeb(page)
    com = buildCompany(parseData['company'])
    web = buildWebsite(parseData['web'])
    if com is None or web is None:
        return

    # Reuse the company already stored under this name; otherwise import it
    # first and read it back by registration id.
    owner = Company.getOne(Company.coname == com.coname)
    if owner is None:
        impCompanyInfo(com)
        # NOTE(review): if the import did not persist the company this lookup
        # yields None and the website is imported with regID = None, exactly
        # as before -- confirm that is acceptable for impWebsite.
        owner = Company.getOne(Company.regID == com.regID)
    web.regID = owner
    impWebsite(web)


def parseWebsite(minId=18879, pageSize=5000):
    """Re-parse stored license pages and import the companies/websites found.

    Visits every ``TempLz`` row whose id is greater than ``minId``, in id
    order, ``pageSize`` rows per query.  For each row that has an ``lzpage``
    the page is parsed and imported.  The id of every visited row is printed
    and appended to ``../logs/parseweb.log`` (flushed per row, so the log shows
    how far a crashed run got).

    Args:
        minId: only rows with ``TempLz.id > minId`` are processed.  Defaults
            to the previously hard-coded value.
        pageSize: number of rows fetched per query; must be positive.

    Raises:
        ValueError: if ``pageSize`` is not positive.
    """
    if pageSize <= 0:
        raise ValueError("pageSize must be positive, got %r" % (pageSize,))

    # The context manager closes the log even when parsing or a DB call raises.
    with open('../logs/parseweb.log', 'a') as log:
        # Count the same filtered query that is paginated below, so no
        # trailing empty pages are requested.
        total = TempLz.select().where(TempLz.id > minId).count()
        pageCount = int(math.ceil(float(total) / float(pageSize)))

        for pageNo in range(1, pageCount + 1):
            rows = (TempLz.select()
                    .where(TempLz.id > minId)
                    .order_by(TempLz.id)
                    .paginate(pageNo, pageSize))
            if rows is None:
                continue
            for row in rows:
                if row.lzpage is not None:
                    _importParsedPage(row.lzpage)
                log.write(str(row.id) + "\n")
                print(row.id)
                log.flush()
def fetchLzPage(isLzUrl, lzPath, shortUrl, subTask):
    """Download a license page, parse it and pass the result to judgeLzResult.

    Args:
        isLzUrl: full URL of the license page to download.
        lzPath: local path the page is downloaded to and then read from.
        shortUrl: forwarded unchanged to ``judgeLzResult``.
        subTask: task record; its ``id`` selects the ``TaskInfo`` row updated
            when the download leaves no file behind.  Also forwarded to
            ``judgeLzResult``.

    If no file exists at ``lzPath`` after the download, the matching
    ``TaskInfo`` row is set to state ``'5'`` with the URL as its remark
    (presumably the "page unreachable" state -- confirm against the state
    table).  Errors raised while reading or parsing the file are printed and
    swallowed so that one bad page does not abort the caller.
    """
    # Fetch using the full URL of the license page.  The return value of the
    # download is not inspected; only the presence of the file matters.
    downloadByPath(isLzUrl, lzPath)

    if not os.path.exists(lzPath):
        print("lzpath: %s" % (lzPath,))
        failed = TaskInfo.update(state='5', remark=isLzUrl).where(
            TaskInfo.id == subTask.id)
        failed.execute()
        return

    try:
        # Close the file as soon as it is read instead of leaking the handle.
        with open(lzPath, 'r') as f:
            page = f.read()
        parseData = parserCompanyAndWeb(page)
        com = buildCompany(parseData['company'])
        tempBuildWeb = buildWebsite(parseData['web'])
        judgeLzResult(com, tempBuildWeb, shortUrl, subTask)
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(e)
# print TempLz.select().where(TempLz.lzpage.is_null()).count()==0 # lzpage = TempLz.getOne(TempLz.id==1).lzpage # c = TempLz.select(fn.Count(fn.Distinct(TempLz.name))).scalar() #TempLz.select().where(TempLz.lzpage.is_null(False)).order_by(TempLz.id).paginate(1, 10) #parseWebsite() # lzpage = TempLz.select().where(TempLz.name=="上海昊锌科技有限公司") # for l in lzpage: # page = l.lzpage # parseData = parserCompanyAndWeb(page) # w = buildWebsite(parseData['web']) # print parseData['web'] # str = u'\u57df\u540d:http://www.haoxinkj.com/' # # arrs = str.split(':') # str = u'\u57df\u540d:http://www.haoxinkj.com/' # i = str.find(":")+1 # # print str[i:] templz = TempLz.getOne(TempLz.id==5512) lzpage = templz.lzpage parseData = parserCompanyAndWeb(lzpage) web = parseData.get('web') tw = buildWebsite(web)