Example #1
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            victim_divs = soup.find("div", class_="row mt-3 mb-3").find_all(
                "div", recursive=False)

            for div in victim_divs:
                # parse all the stuff out of the html
                parent_div = div.find("div")
                header_div = parent_div.find("div", class_="header")

                # get the name from the header
                h5 = header_div.find("div").find("div",
                                                 class_="col-8").find("h5")
                name = h5.text.split("- ")[0].strip()

                # get the published date from the header
                published_span = header_div.find("div").find(
                    "div", class_="col-4 text-right").find("span")
                published_dt = datetime.strptime(published_span.text.strip(),
                                                 "%d.%m.%Y")

                # parse out the details link from the last div in the body
                body_div = parent_div.find("div", class_="body")
                link_div = body_div.find_all("div")[-1]
                url = link_div.find("a").attrs["href"]

                logging.debug(f"Found victim: {name}")

                # check if the org is already seen (search by url because name isn't guaranteed unique)
                q = self.session.query(Victim).filter_by(url=url,
                                                         site=self.site)

                if q.count() == 0:
                    # new org
                    v = Victim(name=name,
                               url=url,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

            self.site.last_scraped = datetime.utcnow()
            self.session.commit()
Example #2
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}/rss", headers=self.headers)

            soup = BeautifulSoup(r.content, features="xml")
            items = soup.findAll('item')

            for item in items:
                name = item.title.text

                logging.debug(f"Found victim: {name}")

                publish_dt = datetime.strptime(item.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")

                q = self.session.query(Victim).filter_by(site=self.site, name=name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

        self.site.last_scraped = datetime.utcnow()
        self.session.commit()
Example #3
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")

        victim_divs = soup.find_all("div", class_="border-top border-light pt-3 mb-4")

        for div in victim_divs:
            # parse all the stuff out of the html
            name = div.find("h3").text.split("\n")[0].strip()

            url = div.find_all("div")[-1].find("a").attrs["href"]

            logging.debug(f"Found victim: {name}")

            # check if the org is already seen (search by url because name isn't guaranteed unique)
            q = self.session.query(Victim).filter_by(url=url, site=self.site)

            if q.count() == 0:
                # new org
                v = Victim(name=name, url=url, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)

        self.session.commit()
Example #4
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")

        victim_list = soup.find_all("div", class_="post-block")

        for victim in victim_list:
            victim_name = victim.find("div", class_="post-title").text.strip()
            victim_leak_site = victim.find(
                "div", class_="post-block-body").find("a").attrs["href"]
            published_dt = datetime.now()
            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #5
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")

        victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static")

        for victim in victim_list:
            victim_name = victim.find("h3", class_="mb-0").text.strip()
            victim_name = victim_name[:victim_name.find("\n")]

            victim_leak_site = self.url + victim.find("a").attrs["href"]

            published = victim.find("div", class_="mb-1 text-muted").text.strip()
            published_dt = datetime.strptime(
                published, "%Y-%m-%d")
            q = self.session.query(Victim).filter_by(
                url=victim_leak_site, site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                           first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #6
    def _handle_page(self, soup):
        victim_list = soup.find_all("a", class_="post")
        for victim in victim_list:
            victim_name = victim.find("h2", class_="post-title").text.strip()
            published = victim.find("div", class_="time").text.strip()
            published_dt = dateparser.parse(published)
            victim_leak_site = victim['href']

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            self.current_victims.append(v)

        self.session.commit()

        # delay the next request a bit in case the server or proxy relay times out
        time.sleep(1.0)
Example #7
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")
            
            script_list = soup.find_all("script")
            # the victim list is embedded in JavaScript instead of HTML,
            # so we have to parse it out of the script tags
            js_victims_raw = ""
            js_marker = "var post_links = "

            for script in script_list:
                script = str(script)
                if js_marker in script:
                    js_victims_raw = script
                    break

            if not js_victims_raw:
                raise Exception(f"js victim list not found (tried to locate '{js_marker}')")

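            # slice out the JSON array assigned to post_links, then re-add the
            # stripped brackets and braces so json.loads can parse it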
            raw_victim_list = js_victims_raw.split(f"{js_marker}[{{")[1].split(
                "}]"
            )[0]
            victim_list = json.loads(f"[{{{raw_victim_list}}}]")

            for victim in victim_list:
                victim_name = victim["title"]
                
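                # some titles carry extra text after a dash; keep only the leading name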
                if "-" in victim_name:
                    victim_name = victim_name[:victim_name.find("-")]
                
                published = int(victim["timestamp"])
                published_dt = datetime.utcfromtimestamp(published)

                victim_leak_site = self.url + "/?" + victim["link"] + "/"
                
                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()
                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #8
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")
            
            script_list = soup.find_all("script")
            # the victim list is embedded in JavaScript instead of HTML,
            # so we have to parse it out of the script tags
            javascript_code = ""
            for script in script_list:
                script = str(script)
                if "var post_links = " in script:
                    javascript_code = script
                    break
            start_index = javascript_code.find("var post_links = ")
            end_index = javascript_code[start_index:].find("var baseUrl") + start_index
            javascript_code = javascript_code[start_index:end_index].strip()
            
            start_index = javascript_code.find("[")
            end_index = javascript_code.rfind("]") + 1
            javascript_code = javascript_code[start_index:end_index].strip().replace("null", "None")
            
            # convert the JavaScript array literal into a Python list of dictionaries
            victim_list = list(eval(javascript_code))

            for victim in victim_list:
                victim_name = victim["title"]
                
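                # some titles carry extra text after a dash; keep only the leading name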
                if "-" in victim_name:
                    victim_name = victim_name[:victim_name.find("-")]
                
                published = int(victim["timestamp"])
                published_dt = datetime.utcfromtimestamp(published)

                victim_leak_site = self.url + "/?" + victim["link"] + "/"
                
                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()
                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #9
    def scrape_victims(self):
        with Proxy() as p:
            url = self.url + '/partners.html'
            r = p.get(f"{url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="page-header")
            for victim in victim_list:
                victim_name = victim.find_all("a")[0].text.strip()

                published = victim.find_all("span")[1].text.strip()

                published_dt = None
                # they use a bunch of different date formats...
                if published == "29/01/21":
                    published_dt = datetime.strptime(published, "%d/%m/%y")
                elif published[6:8] == "20" and published[8:] != "":
                    published_dt = datetime.strptime(published, "%m/%d/%Y")
                else:
                    published_dt = datetime.strptime(published, "%m/%d/%y")

                victim_leak_site = self.url + '/' + victim.find_all(
                    "a")[0].attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #10
    def _handle_page(self, body: str, p: Proxy):
        soup = BeautifulSoup(body, "html.parser")

        victim_list = soup.find_all("div", class_="list-text")

        for victim in victim_list:
            victim_description = victim.find("a").find("p").text.strip().split(
                " ")

            # extract the company name: keep the leading words that start with a capital letter (or are "and")
            victim_name = ""

            for word in victim_description:
                if word[0].isupper() or word == "and":
                    victim_name += word + " "
                else:
                    break

            victim_name = victim_name[:-1]  # Delete the last space

            if victim_name[-2:] == "is":
                # hard-code this. They forgot to add a space to one name, so I can't properly scrape it
                victim_name = victim_name[:-2]

            # they put the published date in the victim's leak page
            victim_leak_site = victim.find("a").attrs["href"]

            r = p.get(victim_leak_site, headers=self.headers)
            published_dt = self.extract_published_date(r.content.decode())

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #11
    def scrape_victims(self):
        with Proxy() as p:
            url = self.url + '/partners.html'
            r = p.get(f"{url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="page-header")

            for victim in victim_list:
                victim_name = victim.find_all("a")[0].text.strip()
                published = victim.find_all("span")[1].text.strip()

                # they use a bunch of different date format
                # use a nice dateparsing library to handle them all in an easier manner
                published_dt = dateparser.parse(published)
                # sometimes they don't have a timestamp
                if published_dt is None and len(published) > 0:
                    logging.warning(f"couldn't parse timestamp: {published}")

                victim_leak_site = self.url + '/' + victim.find_all(
                    "a")[0].attrs["href"]
                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #12
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="card-body")

            for victim in victim_list:
                victim_name = victim.find("h5",
                                          class_="card-title").text.strip()

                published = victim.find(
                    "p", class_="card-text mt-3 text-secondary").text[11:21]
                published_dt = datetime.strptime(published, "%Y-%m-%d")

                victim_leak_site = self.url + victim.find(
                    "a", class_="btn btn-outline-primary").attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #13
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="lot-card row m-0")

            for victim in victim_list:
                # assumption: the anchor inside the card holds the victim name
                # as its text and the leak page as its href
                victim_a = victim.find(
                    "div",
                    class_="text-left text-grey d-block overflow-hidden").find("a")
                victim_name = victim_a.text.strip()
                victim_leak_site = victim_a.attrs["href"]

                # the listing does not expose a publication date
                published_dt = None

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #14
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all(
                "div", class_="blog-post blog-main posts_at_first")

            for victim in victim_list:
                victim_name = victim.find(
                    "h2", class_="blog-post-title").find("a").text.strip()

                # the page does not show a publish date for the victim
                published_dt = None

                victim_leak_site = self.url + victim.find(
                    "h2", class_="blog-post-title").find("a").attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=published_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #15
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="blog-one__single")

            for victim in victim_list:
                victim_name = victim.find("h3").text.strip()

                client_site = victim.find("h3").find("a", title="Visit Client Website").text.strip()
                victim_name = victim_name.replace(client_site, "").strip()


                published = victim.find("div", class_="blog-one__meta").text.strip()[:10]
                
                published_dt = datetime.strptime(
                    published, "%Y-%m-%d")

                victim_leak_site = self.url + "/" + victim.find("h3").find("a").attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #16
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("a", class_="leak-card p-3")

            for victim in victim_list:
                victim_name = victim.find("h5").text.strip()

                published = victim.find("div", class_="col-auto published")
                published_dt = datetime.strptime(
                    published.text.strip(), "%Y-%m-%d %H:%M:%S")

                if victim_name == "Hello world 1" or victim_name == "Mercy, journalists,chonky boi":
                    # skipping news and updates
                    continue

                victim_leak_site = self.url + victim.attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #17
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find("div",
                                    class_="collapse-section").find_all("li")
            for victim in victim_list:
                victim_name = victim.find("a").text.strip()

                if victim_name == "HOME" or victim_name == "HOW TO DOWNLOAD?":
                    continue

                victim_leak_site = self.url + victim.find("a").attrs["href"]

                q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                         site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               url=victim_leak_site,
                               published=None,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #18
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static")

            for victim in victim_list:
                victim_name = victim.find("h3", class_="mb-0").text.strip()
                victim_name = victim_name[:victim_name.find("\n")]
                

                published = victim.find("div", class_="mb-1 text-muted")
                published_dt = datetime.strptime(
                    published.text.strip(), "%Y-%m-%d")

                victim_leak_site = self.url + victim.find("a", class_="stretched-link").attrs["href"]

                q = self.session.query(Victim).filter_by(
                    url=victim_leak_site, site=self.site)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                               first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #19
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")
        victim_list = soup.find_all("div", {"id": re.compile("comp.*")})

        for victim in victim_list:
            victim_h3 = victim.find("div", class_="panel-heading").find("h3")
            if victim_h3 is None:
                # unpublished victims are in a h4
                continue
            victim_name = victim_h3.text.strip()
            victim_leak_site = self.url + "/#" + victim.get("id")

            if victim.find("span", class_="glyphicon"):
                published = victim.find("span",
                                        class_="glyphicon").next_sibling
                published = published.lstrip()
                published_dt = datetime.strptime(published,
                                                 "Posted %b %d, %Y.")
            else:
                # no publish date shown for this victim
                published_dt = None

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #20
    def _handle_page(self, soup):
        victim_list = soup.find_all("header", class_="entry-header")
        for victim in victim_list:
            victim_title = victim.find("h2", class_="entry-title").text.strip()

            victim_name = victim_title[0:victim_title.find(". Part")]

            meta = victim.find("div", class_="entry-meta")

            published = meta.find("time",
                                  class_="entry-date").attrs["datetime"]
            published_dt = datetime.strptime(published.strip()[:-6],
                                             "%Y-%m-%dT%H:%M:%S")

            victim_leak_site = meta.find(
                "span", class_="posted-on").find("a").attrs["href"]

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()
            self.current_victims.append(v)
        self.session.commit()

        # server was timing out so slows it down a bit
        time.sleep(1.0)
Example #21
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}", headers=self.headers)

            soup = BeautifulSoup(r.content.decode(), "html.parser")

            # collect the victim entries on the page
            victim_list = soup.find_all("div", class_="col py-3")

            for victim in victim_list:
                victim_name = victim.find("h3", class_="mb-3").text.strip()

                # it's less than ideal that there aren't other properties to search on
                # but I don't want to store leak data URLs
                q = self.session.query(Victim).filter_by(site=self.site,
                                                         name=victim_name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=victim_name,
                               published=None,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)
            self.session.commit()

        self.site.last_scraped = datetime.utcnow()

        # just for good measure
        self.session.commit()
Example #22
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")

        victim_list = soup.find_all(
            "header", class_="entry-header has-text-align-center")

        for victim in victim_list:
            victim_name = victim.find(
                "h2", class_="entry-title heading-size-1").text.strip()

            victim_leak_site = victim.find(
                "h2",
                class_="entry-title heading-size-1").find("a").attrs["href"]

            published = victim.find(
                "li", class_="post-date meta-wrapper").find("a").text.strip()
            published_dt = datetime.strptime(published, "%B %d, %Y")

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #23
    def scrape_victims(self):
        with Proxy() as p:
            r = p.get(f"{self.url}/v1/companies/disclosed",
                      headers=self.headers)

            j = r.json()

            for entry in j:
                name = entry["title"]

                logging.debug(f"Found victim: {name}")

                publish_dt = datetime.strptime(entry["disclosed_at"],
                                               "%Y-%m-%dT%H:%M:%SZ")

                q = self.session.query(Victim).filter_by(site=self.site,
                                                         name=name)

                if q.count() == 0:
                    # new victim
                    v = Victim(name=name,
                               url=None,
                               published=publish_dt,
                               first_seen=datetime.utcnow(),
                               last_seen=datetime.utcnow(),
                               site=self.site)
                    self.session.add(v)
                    self.new_victims.append(v)
                else:
                    # already seen, update last_seen
                    v = q.first()
                    v.last_seen = datetime.utcnow()

                # add the org to our seen list
                self.current_victims.append(v)

        self.site.last_scraped = datetime.utcnow()
        self.session.commit()
Example #24
    def _handle_page(self, body: str):
        soup = BeautifulSoup(body, "html.parser")
        victim_list = soup.find_all("article", {"id": re.compile("post.*")})

        for victim in victim_list:
            victim_name = victim.find("h2",
                                      class_="type-list-title").text.strip()
            victim_leak_site = victim.find(
                "h2", class_="type-list-title").find("a").attrs["href"]

            published = victim.find("div",
                                    class_="type-list-date").text.strip()

            published_dt = dateparser.parse(published)
            if published_dt is None and len(published) > 0:
                logging.warning(f"couldn't parse timestamp: {published}")

            q = self.session.query(Victim).filter_by(url=victim_leak_site,
                                                     site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name,
                           url=victim_leak_site,
                           published=published_dt,
                           first_seen=datetime.utcnow(),
                           last_seen=datetime.utcnow(),
                           site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)
        self.session.commit()
Example #25
    def _handle_page(self, soup):
        victim_list = soup.find_all("div", class_="card-body")

        for victim in victim_list:
            victim_name = victim.find("h5", class_="card-title").text.strip()

            published = victim.find("p", class_="card-text mt-3 text-secondary").text[11:21]
            published_dt = datetime.strptime(
                published, "%Y-%m-%d")

            victim_leak_site = self.url + victim.find("a", class_="btn btn-outline-primary").attrs["href"]

            q = self.session.query(Victim).filter_by(
                url=victim_leak_site, site=self.site)

            if q.count() == 0:
                # new victim
                v = Victim(name=victim_name, url=victim_leak_site, published=published_dt,
                           first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
                self.session.add(v)
                self.new_victims.append(v)
            else:
                # already seen, update last_seen
                v = q.first()
                v.last_seen = datetime.utcnow()

            # add the org to our seen list
            self.current_victims.append(v)

        self.session.commit()
        self.site.last_scraped = datetime.utcnow()
        # just for good measure
        self.session.commit()

        # in case server/tor proxy relay times out, slowing down scraping a bit
        time.sleep(1.0)