def run(self, query):
    self.browser.open("https://www.startpage.com/")
    form = self.browser.get_form(id="search_form")
    form['query'].value = '"' + query + '"'
    form['abp'].value = "true"
    self.browser.submit_form(form)
    parser = self.browser.parsed

    # Prepare tables
    headlines = []
    links = []
    descriptions = []

    # Return number of results from search
    count = parser.find('div', {'id': 'results_content'}).find('p', {'id': 'results_count_p'})
    sanitized_count = str(count)
    first = "About"
    last = "results ("
    self.results['result_count'] = self.extract_string(sanitized_count, first, last)

    # Return first page of results
    page = parser.find('div', {'id': 'results'})
    section = []
    for element in page:
        if element.name == "ol":
            section = element.find_all('li')
            break

    for li in section:
        try:
            headline = li.h3.text
            link = li.a['href']
            description = li.find('p', {'class': 'desc'}).text
            result = [headline, link, description]
            self.results['top_results'].append(result)
        except (AttributeError, KeyError, TypeError):
            # Skip result entries missing a headline, link, or description
            pass

    return self.results
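# The run() methods in this file appear to sit on scraper classes that share
# a RoboBrowser instance, a self.results store, and an extract_string()
# helper, none of which are shown. A minimal sketch of that assumed
# scaffolding follows -- the class name and the extract_string() semantics
# are assumptions inferred from the calls above, not the project's code.
from robobrowser import RoboBrowser


class Scraper(object):
    def __init__(self):
        self.browser = RoboBrowser(parser='html.parser')
        self.results = {'result_count': None, 'top_results': []}

    def extract_string(self, text, first, last):
        # Return the substring of `text` between `first` and `last`, e.g.
        # extract_string("About 1,200 results (0.43s)", "About", "results (")
        # -> "1,200"
        try:
            start = text.index(first) + len(first)
            end = text.index(last, start)
            return text[start:end].strip()
        except ValueError:
            return None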
def run(self, ip):
    results = []
    url_param = ip.replace(".", "/")
    url = "https://www.robtex.com/en/advisory/ip/" + url_param + "/shared.html"
    self.browser.open(url)
    parser = self.browser.parsed
    search = parser.find("span", {"id": "shared_ma"})
    if search is not None:
        # count = self.extract_string(search.text, "(", " shown")
        # if int(count) <= 50:
        for result in search.parent.parent.find("ol", {"class": "xbul"}).findChildren('li'):
            result_value = result.text
            # Robtex appears to render the dots in domains as spaces; restore them
            if ' ' in result_value:
                result_value = re.sub(' ', '.', result_value)
            results.append(result_value)
        # else:
        #     results.append("%s domains identified" % str(count))
    return results
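# Hypothetical invocation of the shared-domain scraper above; the class name
# is an assumption and the output values are illustrative only. Note that
# re.sub(' ', '.', s) here is equivalent to the simpler s.replace(' ', '.').
module = Robtex()
shared = module.run("203.0.113.7")
# e.g. shared == ['example.com', 'example.net'], one entry per <li> in ol.xbul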
def run(self, indicator):
    results = []
    self.browser.open("http://www.threatexpert.com/reports.aspx")
    form = self.browser.get_form(action="reports.aspx")
    form['find'].value = '"' + indicator + '"'
    self.browser.submit_form(form)
    parser = self.browser.parsed

    # Return number of results from search [0] + number of pages of results [1]
    section = parser.find('span', {'id': 'txtResults'}).find_all('table')
    if section:
        if len(section) > 1:
            # Acquire page count from the pagination row
            page_count = len(section[1].find_all('td')) - 1
        else:
            page_count = 1

        # Scrape current page
        data = section[0].find_all('tr')
        page = self.scrape_page(data, indicator)
        results.extend(page)

        # Gather records from subsequent pages
        for x in range(2, page_count + 1):
            url = "http://www.threatexpert.com/reports.aspx?page=%s&find=%s" % (x, indicator)
            self.browser.open(url)
            parser = self.browser.parsed
            section = parser.find('span', {'id': 'txtResults'}).find('table')
            if section:
                data = section.find_all('tr')
                page = self.scrape_page(data, indicator)
                results.extend(page)

    return results
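# run() delegates row parsing to a scrape_page() helper that is not shown in
# this file. A minimal hypothetical stand-in follows; ThreatExpert's actual
# results-table column layout is an assumption, so treat the record shape as
# illustrative rather than the project's real output.
def scrape_page(self, data, indicator):
    records = []
    for tr in data:
        tds = [td.text.strip() for td in tr.find_all('td')]
        link = tr.find('a', href=True)
        if tds and link is not None:
            records.append({
                'indicator': indicator,
                'report': link['href'],
                'fields': tds,
            })
    return records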
def run(self, ip): """ Created by: LNguyen Date: 26January2017 Updated scraping logic because of existing bug that was dependent finding an ID = shared_ma that no longer existed in the Robtex web pages. The new logic finds a list of shared domains located in the tag <ol class:xbul.> :param ip: The ip address to scrape the Robtex web page for :return: A list of domains found for the given ip address """ results = [] url_param = ip.replace(".", "/") url = "https://www.robtex.com/en/advisory/ip/" + url_param + "/shared.html" # print("url:",url) self.browser.open(url) parser = self.browser.parsed search = parser.find("ol", {"class": "xbul"}) # print("search: ", search) total = 0 if search is not None: for result in search.find_all("li"): total += 1 if total > 100: break else: result_value = result.text # print("result_value: ",result.text) if ' ' in result_value: result_value = re.sub(' ', '.', result_value) results.append(result_value) else: results.append(result_value) # print("scraperesults:",results) # print("robtex_total:",total) return results
def scrape_data(self, indicator, query_type):
    passive_table = []

    # search period 7 is "complete history"
    # 0 = Current Day
    # 1 = Past 72 Hours
    # 2 = Past Week
    # 3 = Past Month
    # 4 = Past 3 Months
    # 5 = Past 6 Months
    # 6 = Past Year
    search_period = '7'

    # 0 = Display results on screen
    # 1 = Output to CSV file (Comma separated w/o quotes)
    # 2 = Output to CSV file (Comma separated with quotes)
    # 3 = Output to CSV file (Tab separated w/o quotes)
    # 4 = Output to CSV file (Tab separated with quotes)
    # 5 = Output to CSV file (Pipe separated w/o quotes)
    # 6 = Output to CSV file (Pipe separated with quotes)
    format = '0'

    # queryType
    # A = Query IP Address or CIDR
    # H = Query Hostname
    # X = Query Domain Name for Hosts
    # D = Query Domain for Authoritative Nameservers
    # N = Query Nameserver for Authoritative Domains
    url = ("https://research.iad.internetidentity.com/index.php?period=" + search_period
           + "&format=" + format + "&queryType=" + query_type
           + "&target=" + indicator + "&submit=Submit+Query")

    self.browser.open(url, timeout=20000)
    parser = self.browser.parsed
    search = parser.find("table", {"style": "text-align: left; margin-left: auto; margin-right: auto;"})

    for tr in search.find('tbody'):
        # for tr in parser.find_all('tr')[7:]:
        tds = []
        if tr != "\n":
            for td in tr.find_all('td'):
                tds.append(td.text.strip())
            # check that table data exists
            if len(tds) == 4:
                IID_seen = tds[0]
                IID_host = tds[1]
                IID_qType = tds[2]
                IID_ip = tds[3]
                passive_table.append({
                    'ip': IID_ip,
                    'domain': IID_host,
                    'date': IID_seen,
                    'firstseen': IID_seen,
                    'lastseen': {},
                    'ip_location': {}
                })
            tds[:] = []

    self.results.extend(passive_table)
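# Hypothetical driver for scrape_data(); the class name and the type of
# self.results (a list, inferred from the .extend() call above) are
# assumptions, and the printed fields mirror the dict keys built above.
module = IIDPassiveDNS()
module.results = []
module.scrape_data("example.com", "H")  # H = Query Hostname
for record in module.results:
    print(record['date'], record['domain'], record['ip'])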
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import sqlite3
import dateutil.parser  # dateutil refines the raw date string into the desired value

# html crawling part
url = "http://www.espnfc.com/player/149945/son-heung-min"
response = requests.get(url)
source = response.content

# html parsing part
# The latest game row: the player-appearances <div> => its second <tr> => that row's <td>s
parser = bs(source, "html.parser")
navigator = parser.find("div", {"id": "player-appearances-2018"}).find_all("tr")[1]

# data refining part
dic = {
    "Date": "",
    "League": "",
    "Vs": "",
    "Result": "",
    "Goal": "",
    "Assist": "",
    "Sh": "",
    "ShT": "",
    "Yel": "",
    "Red": "",
    "Appearance": ""
}
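# A hedged sketch of how the row selected above might populate the dict; the
# column order of the ESPN FC appearances table is an assumption, so the
# explicit key list below is illustrative rather than verified.
keys = ["Date", "League", "Vs", "Result", "Goal", "Assist",
        "Sh", "ShT", "Yel", "Red", "Appearance"]
cells = [td.text.strip() for td in navigator.find_all("td")]
for key, value in zip(keys, cells):
    dic[key] = value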
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import dateutil.parser
import re
import sqlite3
import dateutil.relativedelta

url = "https://www.skysports.com/tottenham-hotspur-results"
response = requests.get(url)
source = response.content
parser = bs(source, "html.parser")

# Link to the most recent result (the first fixture item on the page)
objlink = parser.find("div", {"class": "fixres__item"}).find("a", href=True)["href"]

# updating rating data to epl.db
response2 = requests.get(objlink)
source2 = response2.content
parser2 = bs(source2, "html.parser")

# Strip the leading label from the article timestamp, reorder the day and
# month fields so dateutil can parse them, then normalise to "YY.MM.DD"
UpdatedDate = parser2.find("div", {"class": "article__header-details"}).find(
    "p", {"class": "article__header-date-time"}).text[14:]
UpdatedDate = UpdatedDate[3:6] + UpdatedDate[:2] + UpdatedDate[5:]
UpdatedDate = str(dateutil.parser.parse(UpdatedDate))[2:][:8].replace("-", ".")

conn = sqlite3.connect("/home/ubuntu/epl/epl.db")
cur = conn.cursor()
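# The script breaks off after opening the cursor. A hedged sketch of the
# write-back that might follow; the table and column names ("results",
# "updated", "team") are hypothetical, not taken from epl.db's real schema.
cur.execute("UPDATE results SET updated = ? WHERE team = ?",
            (UpdatedDate, "Tottenham Hotspur"))
conn.commit()
conn.close()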