Example #1
0
class Sniffy(object):
	def __init__(self):
		self.db = SpeedyDb()
	
	def findBetas(self):
		sites = self.db.getSites()
		
		for site in sites:
			siteId = site[0]
			siteName = site[1]
			urlBits = urlparse(site[2])
		
			sitebit = urlBits.netloc.strip('www')
		
			self.checkSite('beta', sitebit)
			self.checkSite('new', sitebit)
			# self.checkSite('alpha', sitebit)

	def checkSite(self, type, sitebit):
	
		url = 'http://' + type + sitebit
		code = self.getSite(url)
			
		if (code == 200):
			print url
		
			
	def getSite(self, url):
		try:
			response = urllib2.urlopen(url, timeout=10)
			return response.code
		except:
			return 500
Example #2
0
def runmonth(monthid):
	# stuff...	
	here = os.path.dirname(__file__)
	folder = os.path.join(here, "../results/{0}/html".format(monthid))

	sitecount = 0;

	con = lite.connect('speedyplus.db')
	cur = con.cursor()

	db = SpeedyDb()
	sites = db.getSites()
	for site in sites:
		siteName = site[1]
		siteFile = "{0}\\{1}.html".format(folder, siteName)
		print "{0:<3} {1:<25}".format(site[0], site[1]),
		
		if os.path.exists(siteFile):
			sitecount = sitecount + 1 
			print "{0:25}".format(os.path.split(siteFile)[1]),
			fo = open(siteFile, 'r')
			content = fo.read()
			trendyness = GetTheTrendy(content) 
			linkcount = linkCounter(content)
			words = CountTheWords(content)
			fo.close()
			
			sql = trendlySql_insert.format(site[0], monthid, trendyness, linkcount, words)
			
			# print sql 
			cur.execute(sql)
			con.commit()		
			print '{0:<2} {1:<4} {2}'.format(trendyness, linkcount, words),
		
		print '.' 

	print ''
	for i in range(len(trends)):
		print '{0:<30}: {1}\t{2:.0%}'.format(trends[i], trendcounts[i], percentage(trendcounts[i],sitecount))

	for word, count in c.most_common(100):
		print word, count 
def loaddata():
    db = SpeedyDb()
    sites = db.getSites()

    for site in sites:
        name = site[1]
        print name,
        spider_ok = True 
        data = getSiteInfo(name)
        if not (data is None):
            print 'Loading....', site[0], name, data['pages'], 

            db.saveLinkInfo(site[0], int(data['pages']), int(data['docs']), int(data['broken']), int(data['queued']))

            if int(data["pages"]) == 10000 or int(data['links']) == 20000 or int(data['broken']) == 1000 or int(data['queued']) > 0:
                spider_ok = False 
            db.setSpiderStatus(site[0], spider_ok)
            # print spider_ok

        domains = getDomains(site[0], site[2], name, db)
Example #4
0
def main(argv):
    """Entry point: load every site from the database and run the
    threaded site checks across them."""
    store = SpeedyDb()
    CheckSites(store.getSites(), THREAD_COUNT)