def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") victim_divs = soup.find("div", class_="row mt-3 mb-3").find_all( "div", recursive=False) for div in victim_divs: # parse all the stuff out of the html parent_div = div.find("div") header_div = parent_div.find("div", class_="header") # get the name from the header h5 = header_div.find("div").find("div", class_="col-8").find("h5") name = h5.text.split("- ")[0].strip() # get the published date from the header published_span = header_div.find("div").find( "div", class_="col-4 text-right").find("span") published_dt = datetime.strptime(published_span.text.strip(), "%d.%m.%Y") # parse out the details link # this is ugly but it works body_div = parent_div.find("div", class_="body") link_div = body_div.find_all("div")[-1] a = body_div.find_all("div") b = a[-1] c = b.find("a") url = c.attrs["href"] logging.debug(f"Found victim: {name}") # check if the org is already seen (search by url because name isn't guarenteed unique) q = self.session.query(Victim).filter_by(url=url, site=self.site) if q.count() == 0: # new org v = Victim(name=name, url=url, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}/rss", headers=self.headers) soup = BeautifulSoup(r.content, features="xml") items = soup.findAll('item') for item in items: name = item.title.text logging.debug(f"Found victim: {name}") publish_dt = datetime.strptime(item.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z") q = self.session.query(Victim).filter_by(site=self.site, name=name) if q.count() == 0: # new victim v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number page_list = soup.find("ul", class_="pages") last_li = page_list.find_all("li")[-1] max_page_num = int(last_li.find("a").attrs["href"].split("/")[2]) # start at the last page and go backwards, in case a new victim was added while running (unlikely but possible) for i in range(max_page_num, 0, -1): r = p.get(f"{self.url}/page/{i}", headers=self.headers) self._handle_page(r.content.decode()) # check one past the last page to see if new orgs were added that caused another page to be added r = p.get(f"{self.url}/page/{max_page_num+1}", headers=self.headers) self._handle_page(r.content.decode()) self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") script_list = soup.find_all("script") # they include the list in javascript code instead of HTML # So we have to parse it javascript_code = "" for script in script_list: script = str(script) if "var post_links = " in script: javascript_code = script break start_index = javascript_code.find("var post_links = ") end_index = javascript_code[start_index:].find("var baseUrl") + start_index javascript_code = javascript_code[start_index:end_index].strip() start_index = javascript_code.find("[") end_index = javascript_code.rfind("]") + 1 javascript_code = javascript_code[start_index:end_index].strip().replace("null", "None") # convert javascript list of dictionary to python's list of dictionary victim_list = list(eval(javascript_code)) for victim in victim_list: victim_name = victim["title"] if "-" in victim_name: victim_name = victim_name[:victim_name.find("-")] published = int(victim["timestamp"]) published_dt = datetime.utcfromtimestamp(published) victim_leak_site = self.url + "/?" + victim["link"] + "/" q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") script_list = soup.find_all("script") # they include the list in javascript code instead of HTML # So we have to parse it js_victims_raw = "" js_marker = "var post_links = " for script in script_list: script = str(script) if js_marker in script: js_victims_raw = script break if not js_victims_raw: raise Exception(f"js victim list not found (tried to locate '{js_marker}')") raw_victim_list = js_victims_raw.split(f"{js_marker}[{{")[1].split( "}]" )[0] victim_list = json.loads(f"[{{{raw_victim_list}}}]") for victim in victim_list: victim_name = victim["title"] if "-" in victim_name: victim_name = victim_name[:victim_name.find("-")] published = int(victim["timestamp"]) published_dt = datetime.utcfromtimestamp(published) victim_leak_site = self.url + "/?" + victim["link"] + "/" q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self):
    with Proxy() as p:
        # there is a pagination-container div right now but nothing in it;
        # once there are multiple pages of victims, this can be updated to
        # support that
        r = p.get(f"{self.url}" + '/archives/', headers=self.headers)

        soup = BeautifulSoup(r.content.decode(), "html.parser")
        self._handle_page(soup)
def scrape_victims(self): with Proxy() as p: url = self.url + '/partners.html' r = p.get(f"{url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="page-header") for victim in victim_list: victim_name = victim.find_all("a")[0].text.strip() published = victim.find_all("span")[1].text.strip() published_dt = None # they use a bunch of different date format... if published == "29/01/21": published_dt = datetime.strptime(published, "%d/%m/%y") elif published[6:8] == "20" and published[8:] != "": published_dt = datetime.strptime(published, "%m/%d/%Y") else: published_dt = datetime.strptime(published, "%m/%d/%y") victim_leak_site = self.url + '/' + victim.find_all( "a")[0].attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") inner_sites = soup.find_all("div", class_="ajax-load-more-wrap default") for site in inner_sites: site_url = site.attrs["data-canonical-url"] r = p.get(site_url, headers=self.headers) self._handle_page(r.content.decode(), p)
def _handle_page(self, body: str, p: Proxy): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("div", class_="list-text") for victim in victim_list: victim_description = victim.find("a").find("p").text.strip().split( " ") # extract company name by getting only the first few words that start with a capitalized letter victim_name = "" for word in victim_description: if word[0].isupper() or word == "and": victim_name += word + " " else: break victim_name = victim_name[:-1] # Delete the last space if victim_name[-2:] == "is": # hard-code this. They forgot to add a space to one name, so I can't properly scrape it victim_name = victim_name[:-2] # they put the published date in the victim's leak page victim_leak_site = victim.find("a").attrs["href"] r = p.get(victim_leak_site, headers=self.headers) published_dt = self.extract_published_date(r.content.decode()) q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def scrape_victims(self): with Proxy() as p: url = self.url + '/partners.html' r = p.get(f"{url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="page-header") for victim in victim_list: victim_name = victim.find_all("a")[0].text.strip() published = victim.find_all("span")[1].text.strip() # they use a bunch of different date format # use a nice dateparsing library to handle them all in an easier manner published_dt = dateparser.parse(published) # sometimes they don't have a timestamp if published_dt is None and len(published) > 0: logging.warning(f"couldn't parse timestamp: {published}") victim_leak_site = self.url + '/' + victim.find_all( "a")[0].attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
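# Quick illustration of why dateparser replaces the hand-rolled format
# branches in the earlier version of this scraper: it copes with mixed
# day-first and month-first strings, and returns None for strings it can't
# parse (which is why the scraper only logs a warning). Sample strings are
# illustrative.
import dateparser

for _raw in ("29/01/21", "01/29/2021", "01/29/21"):
    print(_raw, "->", dateparser.parse(_raw))
print(repr(""), "->", dateparser.parse(""))  # unparseable -> None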
def is_site_up(self) -> bool:
    # can't use the parent class is_site_up() because the / route
    # doesn't exist on the API server
    with Proxy() as p:
        try:
            r = p.get(f"{self.url}/rss", headers=self.headers,
                      timeout=Config["timeout"])
            if r.status_code >= 400:
                return False
        except Exception as e:
            logging.warning(e)
            return False

    self.site.last_up = datetime.utcnow()
    return True
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get the number of pages on the site page_nav = soup.find("ul", class_="pagination") num_pages = max([int(x.text) for x in page_nav.findAll("a")]) for pg_num in range(1,num_pages+1): # scrape each page r = p.get(f"{self.url}/?page={pg_num}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") self._handle_page(soup)
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") page_count = 0 while True: page_nav = soup.find("div", class_="nav-previous") if page_nav is None: break url = page_nav.find("a").attrs["href"] r = p.get(f"{url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") self._handle_page(soup)
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # find all pages page_nav = soup.find_all("a", class_="pagination-link") site_list = [] for page in page_nav: site_list.append(self.url + "/" + page.attrs["href"]) for site in site_list: r = p.get(site, headers=self.headers) self._handle_page(r.content.decode())
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="lot-card row m-0") for victim in victim_list: victim_name = victim.find( "div", class_="text-left text-grey d-block overflow-hidden").find( "a").attrs["href"] published_dt = None victim_leak_site = None q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="card-body") for victim in victim_list: victim_name = victim.find("h5", class_="card-title").text.strip() published = victim.find( "p", class_="card-text mt-3 text-secondary").text[11:21] published_dt = datetime.strptime(published, "%Y-%m-%d") victim_leak_site = self.url + victim.find( "a", class_="btn btn-outline-primary").attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all( "div", class_="blog-post blog-main posts_at_first") for victim in victim_list: print(victim) victim_name = victim.find( "h2", class_="blog-post-title").find("a").text.strip() published = "" victim_leak_site = self.url + victim.find( "h2", class_="blog-post-title").find("a").attr["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="blog-one__single") for victim in victim_list: victim_name = victim.find("h3").text.strip() client_site = victim.find("h3").find("a", title="Visit Client Website").text.strip() victim_name = victim_name.replace(client_site, "").strip() published = victim.find("div", class_="blog-one__meta").text.strip()[:10] published_dt = datetime.strptime( published, "%Y-%m-%d") victim_leak_site = self.url + "/" + victim.find("h3").find("a").attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("a", class_="leak-card p-3") for victim in victim_list: victim_name = victim.find("h5").text.strip() published = victim.find("div", class_="col-auto published") published_dt = datetime.strptime( published.text.strip(), "%Y-%m-%d %H:%M:%S") if victim_name == "Hello world 1" or victim_name == "Mercy, journalists,chonky boi": # skipping news and updates continue victim_leak_site = self.url + victim.attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # find all pages page_nav = soup.find_all("a", class_="page-numbers") site_list = [] site_list.append(self.url) for page in page_nav: # might exist repetition if page.attrs["href"] not in site_list: site_list.append(page.attrs["href"]) for site in site_list: r = p.get(site, headers=self.headers) self._handle_page(r.content.decode())
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find("div", class_="collapse-section").find_all("li") for victim in victim_list: victim_name = victim.find("a").text.strip() if victim_name == "HOME" or victim_name == "HOW TO DOWNLOAD?": continue victim_leak_site = self.url + victim.find("a").attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static") for victim in victim_list: victim_name = victim.find("h3", class_="mb-0").text.strip() victim_name = victim_name[:victim_name.find("\n")] published = victim.find("div", class_="mb-1 text-muted") published_dt = datetime.strptime( published.text.strip(), "%Y-%m-%d") victim_leak_site = self.url + victim.find("a", class_="stretched-link").attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def is_site_up(self) -> bool: """ check if the site is up this might have specific criteria for some sites """ with Proxy() as p: try: r = p.get(self.url, headers=self.headers, timeout=Config["timeout"]) if r.status_code >= 400: return False except Exception as e: return False self.site.last_up = datetime.utcnow() return True
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="col py-3") for victim in victim_list: victim_name = victim.find("h3", class_="mb-3").text.strip() # it's less than ideal that there aren't other properties to search on # but I don't want to store leak data URLs q = self.session.query(Victim).filter_by(site=self.site, name=victim_name) if q.count() == 0: # new victim v = Victim(name=victim_name, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}/v1/companies/disclosed", headers=self.headers) j = r.json() for entry in j: name = entry["title"] logging.debug(f"Found victim: {name}") publish_dt = datetime.strptime(entry["disclosed_at"], "%Y-%m-%dT%H:%M:%SZ") q = self.session.query(Victim).filter_by(site=self.site, name=name) if q.count() == 0: # new victim v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def _handle_page_type(self, url: str):
    with Proxy() as p:
        r = p.get(f"{url}", headers=self.headers)

        soup = BeautifulSoup(r.content.decode(), "html.parser")

        # get max page number
        page_list = soup.find(
            "ul", class_="pagination pagination-sm justify-content-center mb-0")
        last_li = page_list.find_all("li")[-2]
        max_page_num = int(last_li.find("a").text)

        # start at the last page and go backwards, in case a new victim was
        # added while running (unlikely but possible)
        for i in range(max_page_num, 0, -1):
            r = p.get(f"{url}?page={i}", headers=self.headers)
            self._handle_page(r.content.decode())

        # check one past the last page to see if new orgs were added that
        # caused another page to be added
        r = p.get(f"{url}?page={max_page_num+1}", headers=self.headers)
        self._handle_page(r.content.decode())

        # just for good measure
        self.session.commit()
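# The query-then-insert-or-touch block repeated in every scraper above could
# be consolidated into a helper along these lines. This is only a sketch of a
# possible refactor, not part of the original code: the helper name
# _upsert_victim is hypothetical, while Victim, self.session, self.site,
# self.new_victims and self.current_victims come from the scrapers above.
from datetime import datetime

def _upsert_victim(self, name, url=None, published=None):
    # key on the url when the site exposes one, otherwise fall back to the name
    if url is not None:
        q = self.session.query(Victim).filter_by(url=url, site=self.site)
    else:
        q = self.session.query(Victim).filter_by(name=name, site=self.site)

    v = q.first()
    if v is None:
        # new victim
        v = Victim(name=name, url=url, published=published,
                   first_seen=datetime.utcnow(), last_seen=datetime.utcnow(),
                   site=self.site)
        self.session.add(v)
        self.new_victims.append(v)
    else:
        # already seen, update last_seen
        v.last_seen = datetime.utcnow()

    # add the org to our seen list
    self.current_victims.append(v)
    return v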