import dateutil.parser
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from warcio.archiveiterator import ArchiveIterator


def read_warc_archive(self, archive_path):
    with open(archive_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            try:
                parser = BeautifulSoup(record.content_stream().read(),
                                       features="html.parser")
            except Exception:
                # Skip records whose payload cannot be parsed as HTML.
                continue
            for link in parser.find_all("a"):
                href = link.attrs.get("href")
                if href is None:
                    continue
                # Keep only absolute links that point at the target domain.
                if self.domain in href and href.startswith("http"):
                    path = urlparse(href).path
                    domain_link = self.proper_domain + path
                    self.data.append({
                        '{0}_link'.format(self.domain_name): domain_link,
                        'reference_link': record.rec_headers.get_header(
                            'WARC-Target-URI'),
                        'warc_date': dateutil.parser.parse(
                            record.rec_headers.get_header('WARC-Date')),
                    })
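# A minimal, hypothetical class context for read_warc_archive above, so the
# snippet can run standalone. The attribute names come from the snippet
# itself; the class name, constructor, and example arguments are assumptions.
class WarcLinkExtractor:
    def __init__(self, domain, domain_name, proper_domain):
        self.domain = domain                # substring an href must contain
        self.domain_name = domain_name      # key prefix for output records
        self.proper_domain = proper_domain  # canonical scheme + host
        self.data = []                      # accumulated link records

    read_warc_archive = read_warc_archive   # bind the function defined above


extractor = WarcLinkExtractor("example.com", "example",
                              "https://www.example.com")
extractor.read_warc_archive("crawl.warc.gz")  # illustrative archive path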
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# tz, stations, base_uri, and base_query are module-level configuration:
# the local timezone, a station-ID lookup table, the timetable endpoint,
# and constant query parameters, respectively.


def get_timetable(origin_id, dest_id):
    today = datetime.now(tz=tz).date().isoformat()
    query_params = {
        "OriginStationId": origin_id,
        "OriginStationName": stations.by_id[origin_id],
        "DestStationId": dest_id,
        "DestStationName": stations.by_id[dest_id],
        "GoingTrainCln": today,
        "ReturnningTrainCln": today,  # spelling matches the remote parameter
    }
    query_params.update(base_query)
    session = requests.Session()
    # The site is not stateless, so the page has to be requested twice:
    # the first request only primes the session.
    session.get(base_uri, params=query_params)
    # Not actually Excel output, but a bare HTML file that is easier to parse.
    query_params['isExcel'] = 'true'
    timetable_html = session.get(base_uri, params=query_params).text
    parser = BeautifulSoup(timetable_html, "lxml")
    # The last table on the page is the timetable.
    timetable = parser.find_all('table')[-1]
    column_names = ['Train#', 'Platform', 'Departure', 'Arrival',
                    'Duration', 'Transfers', 'extra', 'route_type']
    out = []
    for row in timetable.find_all('tr'):
        out.append({col: cell.text.strip()
                    for col, cell in zip(column_names, row.find_all('td'))})
    out.pop(0)  # drop the column-header row
    return out
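# A hedged stand-in for the module-level configuration assumed above. The
# endpoint URL, station IDs, and station names here are purely illustrative,
# not the real service's values.
from types import SimpleNamespace
from zoneinfo import ZoneInfo

tz = ZoneInfo("Asia/Jerusalem")
base_uri = "https://railway.example/timetable"   # placeholder endpoint
base_query = {}                                  # site-specific constants
stations = SimpleNamespace(by_id={100: "Origin Station",
                                  200: "Destination Station"})

rows = get_timetable(100, 200)
for row in rows:
    print(row['Train#'], row['Departure'], '->', row['Arrival'])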
import dateutil.parser
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def process_record(self, record):
    if record.rec_type != 'response':
        return
    content_type = record.http_headers.get_header('content-type', None)
    if content_type is None or 'html' not in content_type:
        # Skip non-HTML or unknown content types.
        return
    try:
        parser = BeautifulSoup(record.content_stream().read(),
                               features="html.parser")
    except Exception:
        # Skip records whose payload cannot be parsed as HTML.
        return
    for link in parser.find_all("a"):
        href = link.attrs.get("href")
        if href is None:
            continue
        # Keep only absolute links that point at the target domain.
        if self.domain in href and href.startswith("http"):
            path = urlparse(href).path
            domain_link = self.proper_domain + path
            # Normalize to a trailing slash so equivalent URLs compare equal.
            if domain_link[-1] != '/':
                domain_link += '/'
            yield [{
                '{0}_link'.format(self.domain_name): domain_link,
                'reference_link': record.rec_headers.get_header(
                    'WARC-Target-URI'),
                'warc_date': dateutil.parser.parse(
                    record.rec_headers.get_header('WARC-Date')),
            }]
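# A hedged driver for process_record above: walk a WARC file and flatten the
# yielded single-item lists into one list of link records. The crawler
# argument is any object with the attributes process_record expects; the
# function name and archive path are assumptions.
from warcio.archiveiterator import ArchiveIterator


def collect_links(crawler, archive_path):
    records = []
    with open(archive_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # process_record is a generator: it yields one single-item list
            # per matching link, and nothing at all for skipped records.
            for link_data in crawler.process_record(record):
                records.extend(link_data)
    return records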
def scrape_data(self, indicator, query_type):
    passive_table = []

    # Search period 7 is "complete history". Other values:
    # 0 = Current Day, 1 = Past 72 Hours, 2 = Past Week, 3 = Past Month,
    # 4 = Past 3 Months, 5 = Past 6 Months, 6 = Past Year
    search_period = '7'

    # Output format 0 displays results on screen. Values 1-6 write CSV
    # (comma, tab, or pipe separated, with or without quotes).
    output_format = '0'

    # queryType:
    #   A = Query IP Address or CIDR
    #   H = Query Hostname
    #   X = Query Domain Name for Hosts
    #   D = Query Domain for Authoritative Nameservers
    #   N = Query Nameserver for Authoritative Domains
    url = ("https://research.iad.internetidentity.com/index.php"
           "?period=" + search_period +
           "&format=" + output_format +
           "&queryType=" + query_type +
           "&target=" + indicator +
           "&submit=Submit+Query")
    self.browser.open(url)
    parser = self.browser.parsed

    # The first seven table rows are page chrome, not results.
    for tr in parser.find_all('tr')[7:]:
        tds = [td.text.strip() for td in tr.find_all('td')]
        # A valid data row has exactly four cells.
        if len(tds) == 4:
            IID_seen, IID_host, IID_qType, IID_ip = tds  # qType is unused
            passive_table.append({
                'ip': IID_ip,
                'domain': IID_host,
                'date': IID_seen,
                'ip_location': {},
            })
    self.results.extend(passive_table)
def check_cookie(self):
    url = "https://research.iad.internetidentity.com"
    # verify=False disables TLS certificate verification for this request.
    self.browser.open(url, verify=False)
    parser = self.browser.parsed
    # Login succeeded if the page contains a logout link.
    login_test = parser.find_all('a', {'href': '/logout.php'})
    return bool(login_test)
def check_cookie(self):
    url = "https://research.iad.internetidentity.com"
    self.browser.open(url)
    parser = self.browser.parsed
    # Login succeeded if the page contains a logout link.
    login_test = parser.find_all('a', {'href': '/logout.php'})
    return bool(login_test)
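# A minimal, hypothetical harness for the scrape_data and check_cookie
# methods above. They assume an object exposing self.browser (API-compatible
# with robobrowser.RoboBrowser: .open() and .parsed) plus a self.results
# list; the class name and the calling code here are assumptions.
from robobrowser import RoboBrowser


class IIDClient:
    def __init__(self):
        self.browser = RoboBrowser(parser="html.parser")
        self.results = []

    check_cookie = check_cookie    # bind the functions defined above
    scrape_data = scrape_data


client = IIDClient()
if client.check_cookie():
    client.scrape_data("example.com", "H")  # H = query hostname
    print(client.results)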
def scrape_data(self, indicator, query_type):
    passive_table = []

    # Search period 5 is the full history. Other values:
    # 1 = Current day, 2 = Current month, 3 = Past 6 months, 4 = Past year
    search_period = '5'

    # Output format 0 displays results on screen. Values 1-6 write CSV
    # (comma, tab, or pipe separated, with or without quotes).
    output_format = '0'

    # queryType:
    #   A = Query IP Address or CIDR
    #   H = Query Hostname
    #   X = Query Domain Name for Hosts
    #   D = Query Domain for Authoritative Nameservers
    #   N = Query Nameserver for Authoritative Domains
    url = ("https://research.iad.internetidentity.com/index.php"
           "?search_period=" + search_period +
           "&format=" + output_format +
           "&queryType=" + query_type +
           "&target=" + indicator +
           "&submit=Submit+Query")
    self.browser.open(url)
    parser = self.browser.parsed

    # The first seven table rows are page chrome, not results.
    for tr in parser.find_all('tr')[7:]:
        tds = [td.text.strip() for td in tr.find_all('td')]
        if len(tds) < 5:
            continue  # not a data row
        # Column order depends on the query type.
        if query_type in ('A', 'X'):
            IID_ip, IID_asn, IID_bgp, IID_seen, IID_host = tds[:5]
        else:
            IID_host, IID_seen, IID_ip, IID_asn, IID_bgp = tds[:5]
        passive_table.append([IID_seen, IID_ip, IID_host])
    self.results.extend(passive_table)
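# A hedged post-processing sketch: this variant collects plain
# [date, ip, host] lists, while the earlier scrape_data variant emits dicts.
# Converting the lists to the same dict shape makes the two interchangeable;
# the helper name is an assumption, the keys come from the earlier variant.
def rows_to_dicts(rows):
    return [
        {'date': seen, 'ip': ip, 'domain': host, 'ip_location': {}}
        for seen, ip, host in rows
    ]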