示例#1
0
	def getDomainWebContent(self, domain):
		"""
		Collect the web-content values for one domain.

		The url is built as ``domain.value + '.' + domain.tld`` and handed to a
		WebPageInfoGetter. Every step is best-effort: an exception raised by a
		step is appended to ``self.web_exceptions`` instead of propagating, and
		the affected value falls back to ``-1``.

		@param domain: a Domain object containing the domain that you want web_content for.
		@return a dict with the keys "nilsimsa", "image" and "redirects"; a value of
			-1 means that piece of content could not be retrieved.
		"""
		url = domain.value + '.' + domain.tld

		# Bind the name before the try: if the constructor raises, the lookups
		# below are skipped instead of each failing with an UnboundLocalError
		# that would be recorded as three bogus web exceptions.
		wpg = None
		try:
			wpg = WebPageInfoGetter(url)
			wpg.setUpGetter(url)
		except Exception as e:
			self.web_exceptions.append(e)

		def fetch(getter_name):
			# No getter object at all: the construction failure has already
			# been recorded, so there is nothing further to report.
			if wpg is None:
				return -1
			# A getter whose setUpGetter() failed is still queried, exactly
			# as before; any resulting error is recorded here.
			try:
				return getattr(wpg, getter_name)(url, False)
			except Exception as e:
				self.web_exceptions.append(e)
				return -1

		nilsimsa = fetch("getNilsimsaHash")
		image = fetch("getImageHash")
		redirects = fetch("getNumberOfRedirects")
		return {"nilsimsa" : nilsimsa, "image" : image, "redirects" : redirects}
示例#2
0
	def work(self, index, domain):
		"""
		each thread does the work here

		Builds the url ``domain + '.' + self.aTLD`` and gathers one kind of
		content for it, chosen by ``self.web_content``:
		  - truthy: web content (Nilsimsa hash, image hash, number of redirects)
		  - falsy:  whois content (creation date, privacy protection, parking)
		The formatted report is stored in ``self.window[index % self.window_size]``.

		Every lookup is best-effort: a failing lookup yields ``None`` and its
		exception is listed in the report's "Exceptions" field instead of being
		raised.

		@param index: number of this unit of work; appended to the getter's id and
			used to pick the window slot.
		@param domain: the domain name without its TLD.
		"""
		url = domain + '.' + self.aTLD
		exceptions = []

		def attempt(target, method_name, *args):
			# Run a single lookup; on failure record the error and yield None.
			try:
				return getattr(target, method_name)(*args)
			except Exception as e:
				exceptions.append(e)
				return None

		if self.web_content:
			#target webcontent with this thread
			# Bind the name first so a failing constructor cannot leave it
			# unbound (which used to add one UnboundLocalError per lookup).
			wpg = None
			try:
				wpg = WebPageInfoGetter(url)
				wpg.id += str(index)
				wpg.setUpGetter(url)
			except Exception as e:
				exceptions.append(e)
			if wpg is None:
				nilsimsa = image = redirects = None
			else:
				# A getter whose set-up failed is still queried, as before.
				nilsimsa = attempt(wpg, "getNilsimsaHash", url, False)
				image = attempt(wpg, "getImageHash", url, False)
				redirects = attempt(wpg, "getNumberOfRedirects", url, False)
			info = "-Domain: {}\nNilsimsa: {}\nImageHash: {}\nRedirects: {}\nExceptions: {}\n".format(url, nilsimsa, image, redirects, exceptions)
		else:
			#target only the whois content with this thread
			# Without a parser and a server there is nothing to query, so the
			# lookups are skipped rather than failing on unbound names.
			whois_server_found = False
			try:
				whois_parser = Whois_Parser()
				whois_server = whois_parser.server_info['.' + self.aTLD][0]
				whois_server_found = True
			except Exception as e:
				exceptions.append(e)
			if whois_server_found:
				creation_date = attempt(whois_parser, "getCreationDate", url, whois_server)
				privacy_prot = attempt(whois_parser, "isWhoisPrivacyProtected", url, whois_server)
				is_parking = attempt(whois_parser, "isParking", url, whois_server)
			else:
				creation_date = privacy_prot = is_parking = None
			info = "-Domain: {}\nCreationDate: {}\nPrivacy: {}\nParking: {}\nExceptions: {}\n".format(url, creation_date, privacy_prot, is_parking, exceptions)

		self.window[index % self.window_size] = info
示例#3
0
	def _record_domain_info(self, a_domain, a_tld, a_file, switch=True):
		"""
		Record all information for a domain

		Gathers the typo candidates, the web content and the whois content for
		``a_domain`` and appends one record to ``a_file``. Every lookup is
		best-effort: a failing lookup is written as ``None`` and its exception
		is written as an "Exception: ..." line at the end of the record.

		@param a_domain: the domain to inspect; passed unchanged to the web getter
			and the whois parser.
		@param a_tld: the TLD without its leading dot; used to pick the whois server.
		@param a_file: path of the data file the record is appended to.
		@param switch: not used by this method.
		"""
		exceptions = []
		domain_ctypos = self._generate_ctypos_for_domain(a_domain)

		def attempt(target, method_name, *args):
			# Run a single lookup; on failure record the error and yield None.
			try:
				return getattr(target, method_name)(*args)
			except Exception as e:
				exceptions.append(e)
				return None

		#first we grab all the content we can via loading up the url
		# Bind the name first so a failing constructor cannot leave it unbound
		# (which used to add one UnboundLocalError per lookup to the record).
		wpg = None
		try:
			wpg = WebPageInfoGetter(a_domain)
			wpg.setUpGetter(a_domain)
		except Exception as e:
			exceptions.append(e)
		if wpg is None:
			nilsimsa = image = redirects = None
		else:
			# A getter whose set-up failed is still queried, as before.
			nilsimsa = attempt(wpg, "getNilsimsaHash", a_domain, False)
			image = attempt(wpg, "getImageHash", a_domain, False)
			redirects = attempt(wpg, "getNumberOfRedirects", a_domain, False)

		#next we grab all the whois content
		whois_server_found = False
		try:
			whois_parser = Whois_Parser()
			whois_server = whois_parser.server_info['.' + a_tld][0]
			whois_server_found = True
		except Exception as e:
			exceptions.append(e)
		if whois_server_found:
			creation_date = attempt(whois_parser, "getCreationDate", a_domain, whois_server)
			privacy_prot = attempt(whois_parser, "isWhoisPrivacyProtected", a_domain, whois_server)
			is_parking = attempt(whois_parser, "isParking", a_domain, whois_server)
		else:
			# No whois server for this TLD: nothing can be queried.
			creation_date = privacy_prot = is_parking = None

		# NOTE: an Alexa top-list lookup (self.alexa_reader.isDomainInAlexaTop)
		# used to be performed here; it is currently disabled.

		with open(a_file, "a") as data_fp:
			#write out all of our data to the file
			data_fp.write("-Domain: {}\n".format(a_domain))
			data_fp.write("NumberOfCandidates: {}\n".format(len(domain_ctypos)))
			data_fp.write("Candidates: {}\n".format(str(domain_ctypos)))
			data_fp.write("Nilsimsa: {}\n".format(nilsimsa))
			data_fp.write("ImageHash: {}\n".format(image))
			data_fp.write("Redirects: {}\n".format(redirects))
			data_fp.write("CreationDate: {}\n".format(creation_date))
			data_fp.write("Privacy: {}\n".format(privacy_prot))
			data_fp.write("Parking: {}\n".format(is_parking))
			for exception in exceptions:
				data_fp.write("Exception: {}\n".format(exception))
	def navigateZoneFile(self, aGzipFile, aTLD="com", dataDir=None):
		"""
		Method to navigate all the domains -- and their candidates -- in a file

		For every domain of the "current" zone data loaded from ``aGzipFile``
		this collects the typo candidates, the web content and the whois content
		and appends one record to ``<dataDir>/<aTLD>_data/<name>.data``, where
		``<name>`` is the part of ``aGzipFile`` before its first '.'.
		Every lookup is best-effort: a failing lookup is written as ``None`` and
		its exception is written as an "Exception: ..." line in the record.

		@param aGzipFile: the zone file to process.
		@param aTLD: the TLD (without leading dot) of the domains in the file.
		@param dataDir: directory holding the ``<aTLD>_data`` output folders;
			defaults to the original fixed research data directory.
		"""
		if dataDir is None:
			dataDir = "/home/engelsjo/Documents/Research/tld_file_parser/data"
		dataFileName = aGzipFile.split('.')[0]
		dataPath = "{}/{}_data/{}.data".format(dataDir, aTLD, dataFileName)

		#load the appropriate files into memory
		tld_files = self._loadCurrPrevAndNextFromFile(aGzipFile)
		previous = tld_files["previous"]
		current = tld_files["current"]
		following = tld_files["next"]

		def attempt(errors, target, method_name, *args):
			# Run a single lookup; on failure record the error and yield None.
			try:
				return getattr(target, method_name)(*args)
			except Exception as e:
				errors.append(e)
				return None

		for domain in current:
			#STORE ALL INFORMATION FOR THE FILE
			exceptions = []

			#First, now that contents are in memory, go after the candidates:
			#a generated typo that exists in the files in memory is a candidate
			candidates = []
			seen = set()  # O(1) duplicate check; 'candidates' keeps the order
			for typo in self._generate_typos_inhash(domain.lower()):
				if self.isDomainCandidate(typo, previous, current, following) and typo not in seen:
					seen.add(typo)
					candidates.append(typo)

			url = domain + '.' + aTLD

			# Reset per domain: otherwise a failed construction would silently
			# reuse the getter left over from the previous domain and record
			# that domain's state under this one.
			wpg = None
			try:
				wpg = WebPageInfoGetter(url)
				wpg.setUpGetter(url)
			except Exception as e:
				exceptions.append(e)
			if wpg is None:
				nilsimsa = image = redirects = None
			else:
				# A getter whose set-up failed is still queried, as before.
				nilsimsa = attempt(exceptions, wpg, "getNilsimsaHash", url, False)
				image = attempt(exceptions, wpg, "getImageHash", url, False)
				redirects = attempt(exceptions, wpg, "getNumberOfRedirects", url, False)

			#next we grab all the whois content
			# Same reset for the whois side: never query with a parser or
			# server carried over from an earlier iteration.
			whois_server_found = False
			try:
				whois_parser = Whois_Parser()
				whois_server = whois_parser.server_info['.' + aTLD][0]
				whois_server_found = True
			except Exception as e:
				exceptions.append(e)
			if whois_server_found:
				creation_date = attempt(exceptions, whois_parser, "getCreationDate", url, whois_server)
				privacy_prot = attempt(exceptions, whois_parser, "isWhoisPrivacyProtected", url, whois_server)
				is_parking = attempt(exceptions, whois_parser, "isParking", url, whois_server)
			else:
				creation_date = privacy_prot = is_parking = None

			# Append per domain so finished records survive a later failure.
			with open(dataPath, "a") as data_fp:
				#write out all of our data to the file
				data_fp.write("-Domain: {}\n".format(url))
				data_fp.write("NumberOfCandidates: {}\n".format(len(candidates)))
				data_fp.write("Candidates: {}\n".format(str(candidates)))
				data_fp.write("Nilsimsa: {}\n".format(nilsimsa))
				data_fp.write("ImageHash: {}\n".format(image))
				data_fp.write("Redirects: {}\n".format(redirects))
				data_fp.write("CreationDate: {}\n".format(creation_date))
				data_fp.write("Privacy: {}\n".format(privacy_prot))
				data_fp.write("Parking: {}\n".format(is_parking))
				for exception in exceptions:
					data_fp.write("Exception: {}\n".format(exception))
		print("done with file")