class Sniffy(object): def __init__(self): self.db = SpeedyDb() def findBetas(self): sites = self.db.getSites() for site in sites: siteId = site[0] siteName = site[1] urlBits = urlparse(site[2]) sitebit = urlBits.netloc.strip('www') self.checkSite('beta', sitebit) self.checkSite('new', sitebit) # self.checkSite('alpha', sitebit) def checkSite(self, type, sitebit): url = 'http://' + type + sitebit code = self.getSite(url) if (code == 200): print url def getSite(self, url): try: response = urllib2.urlopen(url, timeout=10) return response.code except: return 500
def runmonth(monthid): # stuff... here = os.path.dirname(__file__) folder = os.path.join(here, "../results/{0}/html".format(monthid)) sitecount = 0; con = lite.connect('speedyplus.db') cur = con.cursor() db = SpeedyDb() sites = db.getSites() for site in sites: siteName = site[1] siteFile = "{0}\\{1}.html".format(folder, siteName) print "{0:<3} {1:<25}".format(site[0], site[1]), if os.path.exists(siteFile): sitecount = sitecount + 1 print "{0:25}".format(os.path.split(siteFile)[1]), fo = open(siteFile, 'r') content = fo.read() trendyness = GetTheTrendy(content) linkcount = linkCounter(content) words = CountTheWords(content) fo.close() sql = trendlySql_insert.format(site[0], monthid, trendyness, linkcount, words) # print sql cur.execute(sql) con.commit() print '{0:<2} {1:<4} {2}'.format(trendyness, linkcount, words), print '.' print '' for i in range(len(trends)): print '{0:<30}: {1}\t{2:.0%}'.format(trends[i], trendcounts[i], percentage(trendcounts[i],sitecount)) for word, count in c.most_common(100): print word, count
def loaddata(): db = SpeedyDb() sites = db.getSites() for site in sites: name = site[1] print name, spider_ok = True data = getSiteInfo(name) if not (data is None): print 'Loading....', site[0], name, data['pages'], db.saveLinkInfo(site[0], int(data['pages']), int(data['docs']), int(data['broken']), int(data['queued'])) if int(data["pages"]) == 10000 or int(data['links']) == 20000 or int(data['broken']) == 1000 or int(data['queued']) > 0: spider_ok = False db.setSpiderStatus(site[0], spider_ok) # print spider_ok domains = getDomains(site[0], site[2], name, db)
def main(argv):
    """Entry point: pull the full site list from the DB and hand it to
    the threaded checker ('argv' is accepted but not used here)."""
    site_list = SpeedyDb().getSites()
    CheckSites(site_list, THREAD_COUNT)