Example #1
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # each direct child of this row is one victim entry
            victim_divs = soup.find("div", class_="row mt-3 mb-3").find_all(
                "div", recursive=False)

            for div in victim_divs:
                # parse all the stuff out of the html
                parent_div = div.find("div")
                header_div = parent_div.find("div", class_="header")

                # get the name from the header
                h5 = header_div.find("div").find("div",
                                                 class_="col-8").find("h5")
                name = h5.text.split("- ")[0].strip()

                # get the published date from the header
                published_span = header_div.find("div").find(
                    "div", class_="col-4 text-right").find("span")
                published_dt = datetime.strptime(published_span.text.strip(),
                                                 "%d.%m.%Y")

                # parse the details link out of the last div in the body
                body_div = parent_div.find("div", class_="body")
                link_div = body_div.find_all("div")[-1]
                url = link_div.find("a").attrs["href"]

                logging.debug(f"Found victim: {name}")

                # check if the org has already been seen (search by url because the name isn't guaranteed unique)
                q = self.session.query(Victim).filter_by(url=url,
                                                         site=self.site)

                if q.count() == 0:
                    # new org
                    v = Victim(name=name,
                               url=url,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

            self.site.last_scraped = datetime.utcnow()
            self.session.commit()
Example #2
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}/rss", headers=self.headers)

            soup = BeautifulSoup(r.content, features="xml")
            items = soup.find_all("item")

            for item in items:
                name = item.title.text

                logging.debug(f"Found victim: {name}")

                publish_dt = datetime.strptime(item.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")

                q = self.session.query(Victim).filter_by(site=self.site, name=name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

        self.site.last_scraped = datetime.utcnow()
        self.session.commit()
Example #3
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # get max page number
            page_list = soup.find("ul", class_="pages")
            last_li = page_list.find_all("li")[-1]

            # the page number is embedded in the link's href (e.g. /page/N)
            max_page_num = int(last_li.find("a").attrs["href"].split("/")[2])

            # start at the last page and go backwards, in case a new victim was added while running (unlikely but possible)
            for i in range(max_page_num, 0, -1):
                r = p.get(f"{self.url}/page/{i}", headers=self.headers)

                self._handle_page(r.content.decode())

            # check one past the last page to see if new orgs were added that caused another page to be added
            r = p.get(f"{self.url}/page/{max_page_num+1}",
                      headers=self.headers)
            self._handle_page(r.content.decode())

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #4
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")
            
            script_list = soup.find_all("script")
            # the victim list is embedded in JavaScript rather than HTML,
            # so we have to parse it out of the page's scripts
            javascript_code = ""
            for script in script_list:
                script = str(script)
                if "var post_links = " in script:
                    javascript_code = script
                    break
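            # slice out the JavaScript between the post_links assignment and the
            # following "var baseUrl" declaration, i.e. the raw victim array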
            start_index = javascript_code.find("var post_links = ")
            end_index = javascript_code[start_index:].find("var baseUrl") + start_index
            javascript_code = javascript_code[start_index:end_index].strip()
            
            start_index = javascript_code.find("[")
            end_index = javascript_code.rfind("]") + 1
            # JavaScript "null" is not valid Python, so map it to None before evaluating
            javascript_code = javascript_code[start_index:end_index].strip().replace("null", "None")

            # convert the JavaScript array of objects into a Python list of dictionaries
            victim_list = list(eval(javascript_code))

            for victim in victim_list:
                victim_name = victim["title"]
                
                if "-" in victim_name:
                    victim_name = victim_name[:victim_name.find("-")]
                
                published = int(victim["timestamp"])
                published_dt = datetime.utcfromtimestamp(published)

                victim_leak_site = self.url + "/?" + victim["link"] + "/"
                
                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()
                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #5
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")
            
            script_list = soup.find_all("script")
            # the victim list is embedded in JavaScript rather than HTML,
            # so we have to parse it out of the page's scripts
            js_victims_raw = ""
            js_marker = "var post_links = "

            for script in script_list:
                script = str(script)
                if js_marker in script:
                    js_victims_raw = script
                    break

            if not js_victims_raw:
                raise Exception(f"js victim list not found (tried to locate '{js_marker}')")

            # pull the array literal out of the assignment and re-wrap the
            # outer braces so json.loads can parse it
            raw_victim_list = js_victims_raw.split(f"{js_marker}[{{")[1].split(
                "}]"
            )[0]
            victim_list = json.loads(f"[{{{raw_victim_list}}}]")

            for victim in victim_list:
                victim_name = victim["title"]
                
                if "-" in victim_name:
                    victim_name = victim_name[:victim_name.find("-")]
                
                published = int(victim["timestamp"])
                published_dt = datetime.utcfromtimestamp(published)

                victim_leak_site = self.url + "/?" + victim["link"] + "/"
                
                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()
                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #6
    def scrape_victims(self):
        with Proxy() as p:
            # there is a pagination-container div right now but nothing in it
            # once there are multiple pages of victims, this can be updated to
            # support that

            r = p.get(f"{self.url}/archives/", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")
            self._handle_page(soup)
Example #7
    def scrape_victims(self):
        with Proxy() as p:
            url = self.url + '/partners.html'
            r = p.get(f"{url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="page-header")
            for victim in victim_list:
                victim_name = victim.find_all("a")[0].text.strip()

                published = victim.find_all("span")[1].text.strip()

                published_dt = None
                # they use several different date formats...
                if published == "29/01/21":
                    # this one entry is day-first
                    published_dt = datetime.strptime(published, "%d/%m/%y")
                elif published[6:8] == "20" and published[8:] != "":
                    # four-digit year: MM/DD/YYYY
                    published_dt = datetime.strptime(published, "%m/%d/%Y")
                else:
                    published_dt = datetime.strptime(published, "%m/%d/%y")

                victim_leak_site = self.url + '/' + victim.find_all(
                    "a")[0].attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #8
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            inner_sites = soup.find_all("div",
                                        class_="ajax-load-more-wrap default")

            for site in inner_sites:
                site_url = site.attrs["data-canonical-url"]
                r = p.get(site_url, headers=self.headers)
                self._handle_page(r.content.decode(), p)
Example #9
    def _handle_page(self, body: str, p: Proxy):
        soup = BeautifulSoup(body, "html.parser")

        victim_list = soup.find_all("div", class_="list-text")

        for victim in victim_list:
            victim_description = victim.find("a").find("p").text.strip().split(
                " ")

            # extract company name by getting only the first few words that start with a capitalized letter
            victim_name = ""

            for word in victim_description:
                # stop at the first word that is neither capitalized nor "and"
                if word and (word[0].isupper() or word == "and"):
                    victim_name += word + " "
                else:
                    break

            victim_name = victim_name[:-1]  # Delete the last space

            if victim_name[-2:] == "is":
                # hard-code this. They forgot to add a space to one name, so I can't properly scrape it
                victim_name = victim_name[:-2]

            # they put the published date in the victim's leak page
            victim_leak_site = victim.find("a").attrs["href"]

            r = p.get(victim_leak_site, headers=self.headers)
            published_dt = self.extract_published_date(r.content.decode())

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #10
    def scrape_victims(self):
        with Proxy() as p:
            url = self.url + '/partners.html'
            r = p.get(f"{url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="page-header")

            for victim in victim_list:
                victim_name = victim.find_all("a")[0].text.strip()
                published = victim.find_all("span")[1].text.strip()

                # they use several different date formats,
                # so use a date-parsing library to handle them all
                published_dt = dateparser.parse(published)
                # sometimes they don't have a timestamp
                if published_dt is None and len(published) > 0:
                    logging.warning(f"couldn't parse timestamp: {published}")

                victim_leak_site = self.url + '/' + victim.find_all(
                    "a")[0].attrs["href"]
                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #11
    def is_site_up(self) -> bool:
        # can't use the parent class is_site_up() because the / route doesn't exist on the API server
        with Proxy() as p:
            try:
                r = p.get(f"{self.url}/rss", headers=self.headers, timeout=Config["timeout"])

                if r.status_code >= 400:
                    return False
            except Exception as e:
                logging.warning(e)
                return False

        self.site.last_up = datetime.utcnow()

        return True
Example #12
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # get the number of pages on the site
            page_nav = soup.find("ul", class_="pagination")
            num_pages = max([int(x.text) for x in page_nav.find_all("a")])

            for pg_num in range(1, num_pages + 1):
                # scrape each page
                r = p.get(f"{self.url}/?page={pg_num}", headers=self.headers)
                soup = BeautifulSoup(r.content.decode(), "html.parser")
                self._handle_page(soup)
Example #13
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # follow the "nav-previous" link back through the archive until no pages remain
            while True:
                page_nav = soup.find("div", class_="nav-previous")
                if page_nav is None:
                    break

                url = page_nav.find("a").attrs["href"]
                r = p.get(f"{url}", headers=self.headers)
                soup = BeautifulSoup(r.content.decode(), "html.parser")
                self._handle_page(soup)
Example #14
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)
            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # find all pages
            page_nav = soup.find_all("a", class_="pagination-link")

            site_list = []

            for page in page_nav:
                site_list.append(self.url + "/" + page.attrs["href"])

            for site in site_list:
                r = p.get(site, headers=self.headers)
                self._handle_page(r.content.decode())
Example #15
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="lot-card row m-0")

            for victim in victim_list:

                # the card's link target is used as the victim identifier
                victim_name = victim.find(
                    "div",
                    class_="text-left text-grey d-block overflow-hidden").find(
                        "a").attrs["href"]

                # the listing exposes neither a publish date nor a separate leak URL
                published_dt = None
                victim_leak_site = None

                # url is always None here, so dedupe by name instead
                q = self.session.query(Victim).filter_by(name=victim_name,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #16
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim cards
            victim_list = soup.find_all("div", class_="card-body")

            for victim in victim_list:
                victim_name = victim.find("h5",
                                          class_="card-title").text.strip()

                # the YYYY-MM-DD publish date sits at a fixed offset in the card text
                published = victim.find(
                    "p", class_="card-text mt-3 text-secondary").text[11:21]
                published_dt = datetime.strptime(published, "%Y-%m-%d")

                victim_leak_site = self.url + victim.find(
                    "a", class_="btn btn-outline-primary").attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #17
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim posts
            victim_list = soup.find_all(
                "div", class_="blog-post blog-main posts_at_first")

            for victim in victim_list:
                title_link = victim.find(
                    "h2", class_="blog-post-title").find("a")
                victim_name = title_link.text.strip()

                # the listing page does not expose a publish date
                published_dt = None

                victim_leak_site = self.url + title_link.attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #18
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="blog-one__single")

            for victim in victim_list:
                victim_name = victim.find("h3").text.strip()

                client_site = victim.find("h3").find("a", title="Visit Client Website").text.strip()
                victim_name = victim_name.replace(client_site, "").strip()


                published = victim.find("div", class_="blog-one__meta").text.strip()[:10]
                
                published_dt = datetime.strptime(
                    published, "%Y-%m-%d")

                victim_leak_site = self.url + "/" + victim.find("h3").find("a").attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #19
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim cards
            victim_list = soup.find_all("a", class_="leak-card p-3")

            for victim in victim_list:
                victim_name = victim.find("h5").text.strip()

                published = victim.find("div", class_="col-auto published")
                published_dt = datetime.strptime(
                    published.text.strip(), "%Y-%m-%d %H:%M:%S")

                if victim_name == "Hello world 1" or victim_name == "Mercy, journalists,chonky boi":
                    # skipping news and updates
                    continue

                victim_leak_site = self.url + victim.attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #20
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)
            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # find all pages
            page_nav = soup.find_all("a", class_="page-numbers")

            site_list = []
            site_list.append(self.url)

            for page in page_nav:
                # pagination links may repeat, so skip duplicates
                if page.attrs["href"] not in site_list:
                    site_list.append(page.attrs["href"])

            for site in site_list:
                r = p.get(site, headers=self.headers)
                self._handle_page(r.content.decode())
Example #21
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim links from the collapse section
            victim_list = soup.find("div",
                                    class_="collapse-section").find_all("li")
            for victim in victim_list:
                victim_name = victim.find("a").text.strip()

                if victim_name == "HOME" or victim_name == "HOW TO DOWNLOAD?":
                    continue

                victim_leak_site = self.url + victim.find("a").attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=None,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #22
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static")

            for victim in victim_list:
                victim_name = victim.find("h3", class_="mb-0").text.strip()
                # keep only the first line of the heading text
                victim_name = victim_name.split("\n")[0].strip()

                published = victim.find("div", class_="mb-1 text-muted")
                published_dt = datetime.strptime(
                    published.text.strip(), "%Y-%m-%d")

                victim_leak_site = self.url + victim.find("a", class_="stretched-link").attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #23
    def is_site_up(self) -> bool:
        """
        check if the site is up

        this might have specific criteria for some sites
        """

        with Proxy() as p:
            try:
                r = p.get(self.url,
                          headers=self.headers,
                          timeout=Config["timeout"])

                if r.status_code >= 400:
                    return False
            except Exception as e:
                return False

        self.site.last_up = datetime.utcnow()

        return True
Example #24
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries
            victim_list = soup.find_all("div", class_="col py-3")

            for victim in victim_list:
                victim_name = victim.find("h3", class_="mb-3").text.strip()

                # it's less than ideal that there aren't other properties to search on
                # but I don't want to store leak data URLs
                q = self.session.query(Victim).filter_by(site=self.site,
                                                         name=victim_name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               published=None,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #25
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}/v1/companies/disclosed",
                      headers=self.headers)

            j = r.json()

            for entry in j:
                name = entry["title"]

                logging.debug(f"Found victim: {name}")

                publish_dt = datetime.strptime(entry["disclosed_at"],
                                               "%Y-%m-%dT%H:%M:%SZ")

                q = self.session.query(Victim).filter_by(site=self.site,
                                                         name=name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=name,
                               url=None,
                               published=publish_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

        self.site.last_scraped = datetime.utcnow()
        self.session.commit()
Example #26
    def _handle_page_type(self, url: str):
        with Proxy() as p:
            r = p.get(f"{url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # get max page number
            page_list = soup.find("ul", class_="pagination pagination-sm justify-content-center mb-0")
            last_li = page_list.find_all("li")[-2]

            max_page_num = int(last_li.find("a").text)

            # start at the last page and go backwards, in case a new victim was added while running (unlikely but possible)
            for i in range(max_page_num, 0, -1):
                r = p.get(f"{url}?page={i}", headers=self.headers)

                self._handle_page(r.content.decode())
                
            # check one past the last page to see if new orgs were added that caused another page to be added
            r = p.get(f"{url}?page={max_page_num+1}", headers=self.headers)
            self._handle_page(r.content.decode())

        # just for good measure
        self.session.commit()