Example #1
	def __init__(self, arguments):
		self.arguments = arguments
		self.date = time.strftime("%Y-%m-%d")  # Date in ISO 8601 format
		self.start = time.strftime("%I_%M")  # Start time (12-hour clock, hour_minute)
		self.exec_time = str(time.strftime("%I_%M_%p"))  # Timestamp (hour_minute_AM/PM)
		self.base = Base()
		self.log_dir = self.base.get_log_dir()
		self.main()
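Every constructor in these examples takes a parsed-arguments object. The sketch below shows one way such a namespace might be built with argparse; only the attribute names actually read in the snippets (urls, web_username, web_password, limit, exclude, status, scrape, verify, excel_output) come from the examples, while the flag spellings, defaults and help text are assumptions.

import argparse

# Hypothetical reconstruction of the shared argument namespace; only the
# attribute names read by the classes below are taken from the examples.
parser = argparse.ArgumentParser(description="Scrape, verify and report on URLs")
parser.add_argument("--urls", nargs="+", default=[], help="Target URLs to process")
parser.add_argument("--web-username", dest="web_username", help="Basic-auth username")
parser.add_argument("--web-password", dest="web_password", help="Basic-auth password")
parser.add_argument("--limit", nargs="*", help="Only keep links whose domain is in this list")
parser.add_argument("--exclude", nargs="*", help="Drop links whose domain is in this list")
parser.add_argument("--status", action="store_true", help="Run the status checker")
parser.add_argument("--scrape", action="store_true", help="Run the scraper")
parser.add_argument("--verify", action="store_true", help="Verify scraped links")
parser.add_argument("--excel-output", dest="excel_output", action="store_true", help="Write an .xlsx report")
arguments = parser.parse_args()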
Example #2
	def __init__(self, log, arguments):
		self.arguments = arguments
		self.log = log.copy()
		self.base = Base()
		self.unique_requests = list()
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: " + str(self.arguments.web_username))
			self.session.auth = (self.arguments.web_username, self.arguments.web_password)
		urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Example #3
	def __init__(self, arguments):
		self.arguments = arguments
		self.urls = self.arguments.urls
		self.base = Base()
		self.status_results = dict()
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: " + str(self.arguments.web_username))
			self.session.auth = (self.arguments.web_username, self.arguments.web_password)
		multiprocessing.freeze_support()
		urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Example #4
	def __init__(self, arguments):
		self.arguments = arguments
		self.urls = self.arguments.urls
		self.base = Base()
		self.scrape_results = dict()
		self.sorted_results = dict()
		self.scraped_total = 0
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: " + str(self.arguments.web_username))
			self.session.auth = (self.arguments.web_username, self.arguments.web_password)
		manager = multiprocessing.Manager()
		urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Example #5
class Verify:
	def __init__(self, log, arguments):
		self.arguments = arguments
		self.log = log.copy()
		self.base = Base()
		self.unique_requests = list()
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: "******"Total Target Urls: " + str(counter))
	
	def _worker(self):
		print("Unique Target Urls: " + str(len(self.unique_requests)))
		print("Verifying Unique Targets\n")
		with multiprocessing.Pool(processes=10) as pool:  # Start Multiprocessing pool
			results = pool.map(self._verify, self.unique_requests)
		# queue = dict(pair for d in results for pair in d.items())  # convert the returned list to dictionary
		for result in results:
			# print(result)
			target_url = result[0]
			response_data = result[1]
			for url_key in self.log.keys():  # Loop Through URL Keys
				for element_type in self.log[url_key].keys():  # Loop Through element type keys
					if not element_type.startswith(('ignored_', 'forms')):  # Ignore some keys
						for index, value in self.log[url_key][element_type].items():  # Append data to list
							dict_target_url = value['target_url']
							if target_url == dict_target_url:
								# print([element_url, element_type, element_index, element_data['target_url'], element_data['status']])
								self.log[url_key][element_type][index]['status'] = response_data['status']
								try:
									self.log[url_key][element_type][index]['redirectedURL'] = response_data['redirectedURL']
								except KeyError:
									pass  # redirectedURL is only present when the request was redirected
								self.log[url_key][element_type][index]['message'] = response_data['message']
								self.log[url_key][element_type][index]['pageTitle'] = response_data['pageTitle']
	
	def _verify(self, url):
		response_data, self.session = self.base.session_get_response(self.session, url, False)
		return [url, response_data]
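Verify._worker walks a nested dictionary keyed by page URL, then element type, then index. The hand-built illustration below shows that shape with made-up values, assuming only the structure implied by the loops above.

# Illustrative shape of the `log` dictionary consumed by Verify; the keys
# mirror what _worker reads (target_url) and writes (status, message,
# pageTitle and, for redirects, redirectedURL). All values are examples.
sample_log = {
	"https://example.com/": {
		"links": {
			"1": {
				"target_url": "https://example.com/about",
				"href": "/about",
				"scraped_from": "https://example.com/",
			},
		},
		"ignored_links": {
			1: {"href": "#top", "scraped_from": "https://example.com/"},
		},
	},
}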
Example #6
class Status:
	def __init__(self, arguments):
		self.arguments = arguments
		self.urls = self.arguments.urls
		self.base = Base()
		self.status_results = dict()
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: "******"Checking URL Statuses")
		print("# of Urls Defined: " + str(len(self.urls)))
		self._worker()
		return self.status_results
	
	def _worker(self):
		unique_urls = list()
		malformed_urls = list()
		for url in self.urls:
			valid = self.base.detect_valid_url(url)
			if valid:
				if url not in unique_urls:
					unique_urls.append(url)
			else:
				malformed_urls.append(url)
		print("# of Unique Urls to request: " + str(len(unique_urls)))
		print("# of Malformed URLs: " + str(len(malformed_urls)))
		print(str(malformed_urls) + "\n")
		with multiprocessing.Pool(processes=10) as pool:  # Start Multiprocessing pool
			results = pool.map(self._verify, unique_urls)
		self.status_results = results
		print("\n")
	
	def _verify(self, url):
		response_data, session = self.base.session_get_response(self.session, url, False)
		return {url: response_data}
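A minimal driver for the Status class might look like the sketch below. A SimpleNamespace stands in for the parsed CLI arguments; the attribute names are the ones Status.__init__ and _worker read, and the URLs are placeholders.

from types import SimpleNamespace

# Hypothetical driver; the __main__ guard matters because _worker starts a
# multiprocessing.Pool, which re-imports the module on spawning platforms.
if __name__ == "__main__":
	args = SimpleNamespace(
		urls=["https://example.com/", "https://example.org/missing"],
		web_username=None,
		web_password=None,
	)
	checker = Status(args)
	results = checker.main()  # list of {url: response_data} dictionaries
	for entry in results:
		for url, data in entry.items():
			print(url, data.get("status"))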
Example #7
	def main(self):
		logger = Base()
		start_time = time.time()
		
		if self.arguments.exclude:
			print(self.arguments.exclude)
			print("\n")

		if self.arguments.status:
			from src.modules.status import Status
			url_status = Status(self.arguments)  # Set Variables in status.py
			self.status_log = url_status.main()  # Request all unique urls and get a list of statuses
			if self.arguments.excel_output:
				from src.modules.parse_results import Parse_Excel
				parser = Parse_Excel(self.arguments)
				logger.write_log(self.status_log, 'statusCheck')  # Write Log to json File
				out_file = parser.status_to_excel(self.status_log, 'statusCheck')
				# parser.scraper_to_excel(self.status_log, 'statusCheck')  # Write Excel Output
			else:
				out_file = logger.write_log(self.status_log, 'statusCheck')  # Write Log to json File
			self.open_out_file(out_file)
			
		if self.arguments.scrape:
			from src.modules.scraper import Scrape
			scraper = Scrape(self.arguments)  # Set Variables in scraper.py
			self.scrape_log = scraper.main()  # Scrape content and return dictionary
			
			if self.arguments.verify:
				from src.modules.verifier import Verify
				verifier = Verify(self.scrape_log, self.arguments)  # Define Verifier
				self.verified_log = verifier.main()  # Run Verifier Method
				out_file = logger.write_log(self.verified_log, 'verifiedInfo')  # Write Log to json File
			else:
				out_file = logger.write_log(self.scrape_log, 'scrapedInfo')  # Write Scraped Dictionary to json File
		
			if self.arguments.excel_output:  # Write Scraped / Verified Data to file
				from src.modules.parse_results import Parse_Excel
				parser = Parse_Excel(self.arguments)
				if getattr(self, 'verified_log', None):  # Only set when the verifier ran
					out_file = parser.scraper_to_excel(self.verified_log, 'verifiedInfo')  # Write Log to tsv File
				else:
					out_file = parser.scraper_to_excel(self.scrape_log, 'scrapedInfo')  # Write Scraped Dictionary to tsv File
			
			self.open_out_file(out_file)
		run_time = '{:.2f}'.format(time.time() - start_time)
		print("\nTotal Runtime: " + run_time + " (seconds)\n")
Example #8
class Scrape:
	def __init__(self, arguments):
		self.arguments = arguments
		self.urls = self.arguments.urls
		self.base = Base()
		self.scrape_results = dict()
		self.sorted_results = dict()
		self.scraped_total = 0
		self.session = requests.session()
		if self.arguments.web_username and self.arguments.web_password:
			print("Setting Auth with username: "******"total scraped results: " + str(self.scraped_total) + "\n")
		return self.sorted_results

	def _worker(self, urls):
		element_results = dict()
		with multiprocessing.Pool(processes=10) as pool:  # Start Multiprocessing pool
			results = pool.map(self._scrape, urls)
			# queue = dict(pair for d in results for pair in d.items())  # convert the returned list to dictionary
		for result in results:
			for item in result:
				self.scraped_total += 1
				element_url = str(item['url'])
				element_type = str(item['elementType'])
				element_index = str(item['index'])
				element_data = item['data']
				element_data['htmlTag'] = str(item['htmlTag'])
			
				if element_url not in element_results:  # IF url as key not exist, create it
					element_results[element_url] = {}
				if element_type not in element_results[element_url]:  # If Element Type not exist, create it
					element_results[element_url][element_type] = {element_index: element_data}
				if element_index not in element_results[element_url][element_type]:  # If Element Results not exist, create it
					element_results[element_url][element_type][element_index] = element_data
		self.scrape_results = element_results  # Set Class Log to element_results dictionary
	
	def _scrape(self, url):
		results = list()
		manual = ('java', '#', 'data:')  # Prefixes (javascript:, in-page anchors, data URIs) that cannot be resolved to a target URL
		print("Scraping data from: " + str(url))
		response, page_source, self.session = self.base.session_get_response(self.session, url, True)
		soup = BeautifulSoup(page_source, 'html.parser')
		# print("URL: " + str(url))
		
		for scrape_type in ScrapeRequirements:
			element_type = scrape_type.name.lower()
			# print("Checking " + str(element_type) + " on: " + str(url))
			element_tags = scrape_type.value[0]
			attributes = scrape_type.value[1]
			
			elements = list()
			for tag in element_tags:
				temp = soup.find_all(tag)
				for t in temp:
					elements.append({'tag': str(tag), 'value': t})
			
			# print(str(element_tags) + " tags found: " + str(len(elements)))
			# print(elements)
			for x, item in enumerate(elements):
				tag = item['tag']
				element = item['value']
				element_log = dict()
				for attribute in attributes:
					try:
						# print("scraping " + str(attribute))
						temp = element[attribute]
						if isinstance(temp, list):
							temp = temp[0]
						if attribute in ['href', 'src']:
							if temp.startswith("https://") or temp.startswith("http://"):
								element_log['target_url'] = temp
							elif temp.startswith("//"):
								element_log['target_url'] = self.base.get_protocol(url) + temp
							elif temp.startswith("/"):
								element_log['target_url'] = str(self.base.get_site_root(url)) + temp
							elif temp.startswith(manual):
								pass
							else:
								pass
							if 'target_url' in element_log:
								valid_url = self.base.detect_valid_url(element_log['target_url'])
								element_log['valid_url'] = valid_url
						element_log[str(attribute)] = str(temp)
					except Exception:
						pass  # The attribute is missing on this element; skip it
				element_log['scraped_from'] = str(url)
				result = {'url': str(url),
						'elementType': str(element_type),
						'index': str(x),
						'htmlTag': str(tag),
						'data': element_log}
				if element.contents:
					content = str(element.contents).replace("\\t", "").replace("\\r", "").replace("\\n", ",").strip()  # Remove encoded characters
					new_content = str(re.sub(r"\s{3,}", ",", content))  # Replace 3+ spaces with a comma
					try:
						string = self.base.unicode_to_ascii(new_content)
						result['data']['content'] = string
					except Exception as e:
						result['data']['content'] = new_content
						# print("Content Exception: " + str(e))
						pass
				if element.text:
					text = str(element.text).replace("\\t", "").replace("\\r", "").replace("\\n", "").strip()  # Remove encoded characters
					new_text = str(re.sub(r"\s{3,}", ",", text))
					try:
						string = self.base.unicode_to_ascii(new_text)
						result['data']['text'] = string
					except Exception as e:
						result['data']['text'] = str(new_text)
						# print("Text Exception: " + str(e))
						pass
				
				# Domain URL Filters
				if self.arguments.limit:
					if 'target_url' in result['data']:
						target_domain = self.base.get_site_root(result['data']['target_url'])
						protocol = self.base.get_protocol(target_domain)
						# target_domain = target_domain.replace(protocol, '')
						if target_domain in self.arguments.limit:
							results.append(result)
					else:
						results.append(result)
				elif self.arguments.exclude:
					if 'target_url' in result['data']:
						target_domain = self.base.get_site_root(result['data']['target_url'])
						protocol = self.base.get_protocol(target_domain)
						# target_domain = target_domain.replace(protocol, '')
						if target_domain not in self.arguments.exclude:
							results.append(result)
							# print("Excluding link: " + str(result['data']['target_url']))
					else:
						results.append(result)
				else:
					results.append(result)
		return results
	
	def _sort_dict(self):
		# logger = self.base
		print("Sorting Scraped Results")
		verifiable = ['images', 'links']
		for url_key in self.scrape_results.keys():  # Sort Through URLs dictionary and organize it
			for et_key, et_value in self.scrape_results[url_key].items():  # Sort Through Element Types (images, links, forms, etc)
				ignored_count = 0
				x = 0
				if et_key not in verifiable:  # If not a link or image, skip and add to dictionary
					if url_key not in self.sorted_results:
						self.sorted_results[url_key] = {}
					self.sorted_results[url_key][et_key] = et_value
				else:
					for index, value in self.scrape_results[url_key][et_key].items():  # If Element Type is an image or link
						# print("\nKey: " + str(index) + ":\nValue: " + str(value))
						# If not a verifiable link, add to dictionary under ignored_<key>
						# if ('target_url' not in value) or ('href' in value.keys() and (value['href'].startswith(('java', '#', 'data')))) or \
						# 		('src' in value.keys() and value['src'].startswith(('data:'))):
						if 'target_url' not in value:
							ignored_count += 1
							# Add Item to Ignored Key in New Dictionary
							if url_key not in self.sorted_results:
								self.sorted_results[url_key] = {}
							if "ignored_" + str(et_key) not in self.sorted_results[url_key]:
								self.sorted_results[url_key]['ignored_' + str(et_key)] = {}
							if ignored_count not in self.sorted_results[url_key]['ignored_' + str(et_key)]:
								value['original_scraped_index'] = int(index)
								self.sorted_results[url_key]['ignored_' + str(et_key)][ignored_count] = value
						else:
							x += 1
							# Add item under the element type key in the new dictionary
							if url_key not in self.sorted_results:
								self.sorted_results[url_key] = {}
							if str(et_key) not in self.sorted_results[url_key]:
								self.sorted_results[url_key][str(et_key)] = {}
							if x not in self.sorted_results[url_key][str(et_key)]:
								value['original_scraped_index'] = int(index)
								self.sorted_results[url_key][str(et_key)][x] = value
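_scrape iterates over a ScrapeRequirements enum that is not part of the snippet. Judging by how .value is unpacked, each member pairs a list of HTML tags with a list of attributes to extract. The reconstruction below is hypothetical: the member names LINKS, IMAGES and FORMS are suggested by the rest of the code, while the tag and attribute lists are guesses.

from enum import Enum

# Hypothetical ScrapeRequirements; the shape (value[0] = tags, value[1] =
# attributes) matches how _scrape consumes it, the contents are illustrative.
class ScrapeRequirements(Enum):
	LINKS = (['a'], ['href', 'title', 'target'])
	IMAGES = (['img'], ['src', 'alt'])
	FORMS = (['form'], ['action', 'method', 'name'])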
Example #9
class Parse_Excel:
	def __init__(self, arguments):
		self.arguments = arguments
		self.date = time.strftime("%Y-%m-%d")  # Date in ISO 8601 format
		self.start = time.strftime("%I_%M")  # Start time (12-hour clock, hour_minute)
		self.exec_time = str(time.strftime("%I_%M_%p"))  # Timestamp used in report file names (hour_minute_AM/PM)
		self.base = Base()
		self.log_dir = self.base.get_log_dir()
		self.main()
		
	def main(self):
		if not os.path.isdir(self.log_dir):
			os.makedirs(self.log_dir)

	def scraper_to_excel(self, json_results: dict, filename=''):
		headers = dict()
		total_records = dict()
		
		if filename:
			report_path = self.log_dir + filename + "-" + self.date + "-" + self.exec_time + ".xlsx"
		else:
			report_path = self.log_dir + self.date + "-" + self.exec_time + ".xlsx"
		
		# print(json_results.keys())
		print("\nWriting results to: " + str(report_path))
		
		# Get Unique Headers
		for url, url_data in json_results.items():
			for element_type, type_data in url_data.items():
				if element_type not in headers:
					headers[element_type] = list()
				for index, data in type_data.items():
					# json_results[url][element_type][index]['scraped_from'] = url
					for key, value in data.items():
						if key not in headers[element_type]:
							headers[element_type].append(key)
		# print(headers)
		
		# sort headers
		for element_type in headers.keys():
			if 'scraped_from' in headers[element_type]:
				headers[element_type].insert(0, headers[element_type].pop(headers[element_type].index('scraped_from')))
			if 'text' in headers[element_type]:
				headers[element_type].insert(1, headers[element_type].pop(headers[element_type].index('text')))
			if 'target_url' in headers[element_type]:
				headers[element_type].insert(2, headers[element_type].pop(headers[element_type].index('target_url')))
			
			if 'href' in headers[element_type]:
				headers[element_type].insert(3, headers[element_type].pop(headers[element_type].index('href')))
			elif 'src' in headers[element_type]:
				headers[element_type].insert(3, headers[element_type].pop(headers[element_type].index('src')))
			
			if 'htmlTag' in headers[element_type]:
				headers[element_type].insert(4, headers[element_type].pop(headers[element_type].index('htmlTag')))
			
			if 'status' in headers[element_type]:
				headers[element_type].insert(5, headers[element_type].pop(headers[element_type].index('status')))
				if 'message' in headers[element_type]:
					headers[element_type].insert(6, headers[element_type].pop(headers[element_type].index('message')))
				if 'pageTitle' in headers[element_type]:
					headers[element_type].insert(7, headers[element_type].pop(headers[element_type].index('pageTitle')))
			if 'valid_url' in headers[element_type]:
				headers[element_type].insert(-1, headers[element_type].pop(headers[element_type].index('valid_url')))
				
		# Combine dictionary results by element_type
		for url, url_data in json_results.items():
			for element_type, type_data in url_data.items():
				if element_type not in total_records.keys():
					total_records[element_type] = list()
				for index, data in type_data.items():
					total_records[element_type].append(data)
		
		workbook = xlsxwriter.Workbook(report_path, {'strings_to_urls': False})
		header_cells = workbook.add_format()
		header_cells.set_bold()
		header_cells.set_align('center')
		
		for element_type, type_data in total_records.items():
			row = 0
			column = 0
			worksheet = workbook.add_worksheet(str(element_type))
			for head in headers[element_type]:
				worksheet.write(row, column, head, header_cells)
				column += 1
			row += 1
			for item in type_data:
				for key, value in item.items():
					index = headers[element_type].index(key)
					column = index
					worksheet.write(row, column, str(value))
				row += 1
		workbook.close()
		return report_path
	
	def status_to_excel(self, json_results: dict, filename=''):
		print("Writing json to excel")
		report_path = self.log_dir + filename + "-" + self.date + "-" + self.exec_time + ".xlsx"
		# Append List of dictionary results to a single dictionary
		json_dictionary = {}
		for d in json_results:
			for url, data in d.items():
				json_dictionary[url] = data
		
		# data_frame = pandas.DataFrame(json_normalize(json_dictionary))
		# print(data_frame)
		# print(data_frame.columns.values)
		loop_data = list()
		for url, data in json_dictionary.items():
			# print(url)
			# print(data)
			loop_data.append(data)
			# data_frame = pandas.DataFrame(json_normalize(json_results))
		df = pandas.DataFrame(loop_data)
		columns = list(df.columns.values)
		if 'url' in columns:
			columns.insert(0, columns.pop(columns.index('url')))
		if 'status' in columns:
			columns.insert(1, columns.pop(columns.index('status')))
		if 'pageTitle' in columns:
			columns.insert(2, columns.pop(columns.index('pageTitle')))
		# print(columns)
		# print(df)
		workbook = xlsxwriter.Workbook(str(report_path), {'strings_to_urls': False})
		header_cells = workbook.add_format()
		header_cells.set_bold()
		header_cells.set_align('center')
		workbook.add_worksheet('Status')  # Add Named Sheet to Workbook
		workbook.close()
		writer = pandas.ExcelWriter(str(report_path), engine='xlsxwriter', options={'strings_to_urls': False})  # Write DataFrame to excel
		df[columns].to_excel(writer, sheet_name='Status')  # Write sorted Dataframe
		writer.close()
		return report_path
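scraper_to_excel expects the same url -> element type -> index layout that Scrape._sort_dict produces. A small hedged driver with hand-built data is sketched below; all values are illustrative, and Parse_Excel and its Base dependency are assumed to be importable as in the examples.

from types import SimpleNamespace

# Hypothetical driver for scraper_to_excel; the nested dictionary is
# hand-built to match the url -> element type -> index layout the method reads.
if __name__ == "__main__":
	args = SimpleNamespace(urls=[], web_username=None, web_password=None)
	scraped = {
		"https://example.com/": {
			"links": {
				"1": {
					"scraped_from": "https://example.com/",
					"text": "About us",
					"target_url": "https://example.com/about",
					"href": "/about",
					"htmlTag": "a",
					"valid_url": True,
				},
			},
		},
	}
	parser = Parse_Excel(args)
	report = parser.scraper_to_excel(scraped, "scrapedInfo")
	print("Report written to: " + report)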