Example #1
    def return_data(self, **kwargs) -> dict:
        """Returns the Alexa Rank score on a 0-4 scale.
             0 - high
             1 - moderate
             2 - low
             3 - very low
             4 - not indexed
        """
        if self.company_name in self.cache.index:
            # The cache stores the raw rank; map it onto the 0-4 scale so a
            # cache hit returns the same keys as a fresh lookup.
            rank = self.cache.loc[self.company_name].values[0]
            rank_digit = 4 if rank < 0 else int(np.digitize(rank, AlexaRank.BINS))
            return {"AlexaRank": rank_digit, "AlexaRankScore": rank}

        found = []
        found_full = []
        for webpage in self.webpages:
            page = WebpageResolver.get_html(AlexaRank.ALEXA_ROOT+webpage, stash=False)

            try:
                soup = bs4.BeautifulSoup(page, features="lxml")
                rank = soup.find_all("div", class_="rankmini-rank")[0].text.strip()
                rank = int(rank.lstrip("#").replace(",",""))

                rank_digit = np.digitize(rank, AlexaRank.BINS)
                found.append(rank_digit)
                found_full.append(rank)
            except IndexError:
                # The page is so small that it's not even indexed in Alexa
                found.append(4)
                found_full.append(-1)

        # The best (lowest) rank across all webpages wins; -1 is only a
        # sentinel for pages Alexa does not index, so it is excluded here.
        rank_digit = min(found)
        indexed = [r for r in found_full if r >= 0]
        rank = min(indexed) if indexed else -1
        self.cache.loc[self.company_name] = rank
        self.cache.to_csv(AlexaRank.LOC + "cache.tsv", sep='\t')
        return {"AlexaRank": rank_digit, "AlexaRankScore": rank}
Example #2
    def check_if_polish_text(self):
        def tag_visible(element):
            if element.parent.name in [
                    'style', 'script', 'head', 'title', 'meta', '[document]'
            ]:
                return False
            if isinstance(element, bs4.element.Comment):
                return False
            return True

        def text_from_html(body):
            # Collect only the text nodes a visitor would actually see.
            soup = BeautifulSoup(body, 'html.parser')
            texts = soup.find_all(string=True)
            visible_texts = filter(tag_visible, texts)
            return " ".join(t.strip() for t in visible_texts)

        for website in self.websites:
            try:
                text = text_from_html(WebpageResolver.get_html(website))
                ld = LanguageDetection()
                langs = ld.return_data(text=text)
            except Exception:
                # The page could not be fetched or parsed; try the next one.
                continue
            if 'pl' in langs and langs['pl'] > 0.25:
                return True
        return False
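
`LanguageDetection.return_data` is not shown here; the loop only relies on it returning a mapping from language codes to scores so that `langs['pl'] > 0.25` makes sense. Below is a minimal stand-in built on the langdetect package, offered purely as an assumption about that interface rather than the project's actual detector.

from langdetect import DetectorFactory, detect_langs

class LanguageDetection:
    def __init__(self):
        DetectorFactory.seed = 0  # make langdetect's output deterministic

    def return_data(self, text: str) -> dict:
        """Return {language code: probability}, e.g. {'pl': 0.87, 'en': 0.13}."""
        try:
            return {result.lang: result.prob for result in detect_langs(text)}
        except Exception:
            # Empty or undetectable text: report no languages at all.
            return {}

Under that interface, the 0.25 threshold in check_if_polish_text reads as "at least a quarter of the detected language probability mass is Polish".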