# For every zip code in NYCzips, load the pickled list of location XML
# strings from '<zip>results2', keep the locations for which
# testReviewCount(soup, revThreshold) is false, and pickle that list to
# '<zip>reviewslt<revThreshold>'. A per-zip line and a final summary are
# printed.
restCount = 0
# remCount was accumulated below without ever being reset in this script,
# unlike its sibling counters; start it at zero so the summary only reflects
# this run (and does not raise NameError on a fresh interpreter).
remCount = 0
badZipCount = 0
badReviewCount = 0
badTagCount = 0

p = ProgressBar(widgets=[Percentage(), Bar(), Timer()], maxval=len(NYCzips))
p.start()
for index, zipcode in enumerate(NYCzips):
    # Plain open() calls replace the exec-built assignment statements; the
    # with-blocks guarantee the handles are closed even if loading fails.
    # NOTE(review): the text modes 'r'/'w' are kept exactly as before;
    # binary modes would be required for pickle under Python 3 - confirm
    # before porting.
    with open(str(zipcode) + 'results2', 'r') as picklefile:
        locationListstrings = pickle.load(picklefile)

    lowlist = []
    for location in locationListstrings:
        soup = BeautifulSoup(location, 'xml')
        if not testReviewCount(soup, revThreshold):
            badReviewCount += 1
            lowlist.append(location)

    # The output file is only created once the input was read and filtered,
    # so a failure above no longer leaves an empty result file behind.
    with open(str(zipcode) + 'reviewslt' + str(revThreshold), 'w') as output:
        pickle.dump(lowlist, output)

    remCount += len(lowlist)
    restCount += len(locationListstrings)
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print(str(len(lowlist)) + ' locations removed for zip: ' + str(zipcode))
    p.update(index + 1)
p.finish()

print("Restaurants with # of reviews < " + str(revThreshold) + ": " + str(remCount))
print(str(restCount - remCount) + ' restaurants are ok')
restCount = 0 badZipCount = 0 badReviewCount = 0 badTagCount = 0 for zipcode in NYCzips: goodlist = [] exec ("picklefile = open('%s','r')" % (str(zipcode) + "results2")) exec ("output = open('%s','w')" % (str(zipcode) + "results2Filter1")) locationListstrings = pickle.load(picklefile) picklefile.close() for location in locationListstrings: soup = BeautifulSoup(location, "xml") if not testZip(soup, zipcode): badZipCount += 1 elif not testReviewCount(soup, 1): badReviewCount += 1 elif not checkTags(soup): badTagCount += 1 else: goodlist.append(location) pickle.dump(goodlist, output) restCount += len(goodlist) remCount += len(locationListstrings) - len(goodlist) print str(len(locationListstrings) - len(goodlist)) + " locations removed for zip: " + str(zipcode) output.close() print "Summary, post filter 1:" print str(remCount) + " locations removed; " + str(badZipCount) + " zips, " + str(badReviewCount) + " reviews, " + str( badTagCount