def __init__(self, company_name):
    super().__init__(None)
    self.tax_havens = TaxHeaven().return_data()['tax_heaven']
    self.cache = Cache('modules/WHO_IS/cache')
    resolv = WebpageResolver(company_name)
    self.company_name = resolv.company_name
    try:
        res = resolv.return_data()['webpage']
        self.webpages = list(set(res))
    except IndexError as e:
        print("WEBPAGE NOT FOUND")
        raise e
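# The Cache helper used here (and in BuiltWith below) is not shown in this
# section. This is a minimal sketch of its interface, reconstructed from the
# check_cache/append call sites; the pickle-backed storage is an assumption,
# not the repository's actual implementation.
import pickle

class Cache:
    def __init__(self, path):
        self.path = path  # backing file for the cached rows (assumed layout)
        try:
            with open(self.path, 'rb') as f:
                self.rows = dict(pickle.load(f))
        except FileNotFoundError:
            self.rows = {}

    def check_cache(self, key):
        # Return the cached value for key, or None on a miss,
        # matching how BuiltWith.return_data tests the result.
        return self.rows.get(key)

    def append(self, row):
        # row is a [key, value] pair, matching the call sites below.
        key, value = row
        self.rows[key] = value
        with open(self.path, 'wb') as f:
            pickle.dump(list(self.rows.items()), f)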
def return_data(self, **kwargs) -> dict:
    """
    Returns the Alexa Rank score on a 0-4 scale:
        0 - high
        1 - moderate
        2 - low
        3 - very low
        4 - not indexed
    """
    if self.company_name in self.cache.index:
        # The cache stores the raw rank; rebuild the digit so cache hits
        # and misses return the same keys (-1 marks "not indexed").
        rank = int(self.cache.loc[self.company_name].values[0])
        rank_digit = 4 if rank == -1 else int(np.digitize(rank, AlexaRank.BINS))
        return {"AlexaRank": rank_digit, "AlexaRankScore": rank}
    found = []
    found_full = []
    for webpage in self.webpages:
        page = WebpageResolver.get_html(AlexaRank.ALEXA_ROOT + webpage, stash=False)
        try:
            soup = bs4.BeautifulSoup(page, features="lxml")
            rank = soup.find_all("div", class_="rankmini-rank")[0].text.strip()
            rank = int(rank.lstrip("#").replace(",", ""))
            found.append(int(np.digitize(rank, AlexaRank.BINS)))
            found_full.append(rank)
        except IndexError:
            # The page is so small that it is not even indexed by Alexa.
            found.append(4)
            found_full.append(-1)
    rank_digit = min(found)
    # Ignore the -1 "not indexed" sentinel when picking the best raw rank,
    # otherwise a single unindexed page would mask a real rank.
    indexed = [r for r in found_full if r >= 0]
    rank = min(indexed) if indexed else -1
    self.cache.loc[self.company_name] = rank
    self.cache.to_csv(AlexaRank.LOC + "cache.tsv", sep='\t')
    return {"AlexaRank": rank_digit, "AlexaRankScore": rank}
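# A minimal sketch of how np.digitize buckets a raw Alexa rank into the 0-3
# range (4 is reserved for "not indexed"). AlexaRank.BINS is not shown in
# this section; [5000, 30000, 70000] is an assumed example, borrowed from
# the identical BINS on the Scamwatcher class below.
import numpy as np

BINS = [5000, 30000, 70000]
for raw in (1200, 15000, 50000, 250000):
    print(raw, "->", int(np.digitize(raw, BINS)))
# 1200 -> 0 (high), 15000 -> 1 (moderate), 50000 -> 2 (low), 250000 -> 3 (very low)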
def check_if_polish_text(self):
    # Note: the original signature took an unused `website` parameter that
    # was shadowed by the loop variable below; it has been dropped.
    def tag_visible(element):
        if element.parent.name in [
                'style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, bs4.element.Comment):
            return False
        return True

    def text_from_html(body):
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.find_all(string=True)
        visible_texts = filter(tag_visible, texts)
        return " ".join(t.strip() for t in visible_texts)

    for website in self.websites:
        try:
            text = text_from_html(WebpageResolver.get_html(website))
            ld = LanguageDetection()
            langs = ld.return_data(text=text)
        except Exception:
            continue
        if 'pl' in langs and langs['pl'] > 0.25:
            return True
    return False
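# A standalone sketch of the visible-text extraction used above, assuming
# only bs4 is installed; the filter mirrors tag_visible but is inlined.
import bs4
from bs4 import BeautifulSoup

html = "<html><head><title>t</title></head><body><p>Good morning</p></body></html>"
soup = BeautifulSoup(html, 'html.parser')
visible = [t.strip() for t in soup.find_all(string=True)
           if t.parent.name not in ('style', 'script', 'head', 'title', 'meta', '[document]')
           and not isinstance(t, bs4.element.Comment)]
print(" ".join(visible))  # -> "Good morning"; the <title> text is filtered out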
def __init__(self, company_name):
    super().__init__(company_name)
    self.websites = WebpageResolver(company_name).return_data()['webpage']
    try:
        self.cache = pd.read_csv(PolandCheck.LOC + "cache.tsv",
                                 sep='\t', index_col='company')
    except FileNotFoundError:
        # set_index returns a new frame; the original discarded the result.
        self.cache = pd.DataFrame(columns=['company', 'rank']).set_index('company')
class Scamwatcher(DataSource):
    LOC = "modules/SCAMWATCHER/"
    PAGE_ROOT = "https://www.scamwatcher.org/{0}-review/"
    BINS = [5000, 30000, 70000]

    def __init__(self, company_name):
        super().__init__(company_name)
        self.company_name = WebpageResolver(company_name).company_name
        try:
            self.cache = pd.read_csv(
                Scamwatcher.LOC + "cache.tsv", sep='\t', index_col='company')
        except FileNotFoundError:
            # set_index returns a new frame; the original discarded the result.
            self.cache = pd.DataFrame(columns=['company', 'rank']).set_index('company')

    def return_data(self, **kwargs) -> dict:
        """
        Key: Scamwatcher
        """
        if self.company_name in self.cache.index:
            data = self.cache.loc[self.company_name, 'rank']
            return {"Scamwatcher": str(data)}
        # Try the name as-is, then without its last token,
        # then with "ltd" expanded to "limited".
        page = Scamwatcher.PAGE_ROOT.format(self.company_name).replace(" ", "-")
        res = requests.get(page)
        found = "Oops! That page" not in res.text
        if not found:
            page = Scamwatcher.PAGE_ROOT.format(
                ' '.join(self.company_name.split()[:-1])).replace(" ", "-")
            res = requests.get(page)
            found = "Oops! That page" not in res.text
        if not found:
            page = Scamwatcher.PAGE_ROOT.format(
                self.company_name.lower().replace("ltd", "limited")).replace(" ", "-")
            res = requests.get(page)
            found = "Oops! That page" not in res.text
        self.cache.loc[self.company_name, 'rank'] = found
        self.cache.to_csv(Scamwatcher.LOC + "cache.tsv", sep='\t')
        return {"Scamwatcher": str(bool(found))}
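# A minimal usage sketch; "Example Trading Ltd" is a hypothetical name, and
# the call hits scamwatcher.org unless the company is already cached.
checker = Scamwatcher("Example Trading Ltd")
print(checker.return_data())  # e.g. {'Scamwatcher': 'False'}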
def __init__(self, company_name):
    super().__init__(company_name)
    try:
        self.cache = pd.read_csv(
            AlexaRank.LOC + "cache.tsv", sep='\t', index_col='company')
    except FileNotFoundError:
        # set_index returns a new frame; the original discarded the result.
        self.cache = pd.DataFrame(columns=['company', 'rank']).set_index('company')
    try:
        self.webpages = WebpageResolver(company_name).return_data()['webpage']
    except IndexError as e:
        print("WEBPAGE NOT FOUND")
        raise e
def run_scrapper():
    df = pd.read_csv(MAIN_DATA, sep='\t', quotechar="'",
                     error_bad_lines=False,  # pre-pandas-1.3 spelling of on_bad_lines='skip'
                     quoting=csv.QUOTE_NONE)
    # Hard-coded resume offset: presumably skips rows already scraped in
    # earlier runs of this function.
    with tqdm(df['name'].iloc[667 + 753 + 4099:]) as t:
        for company_name in t:
            t.set_postfix(company_name=company_name)
            try:
                res = WebpageResolver(company_name).return_data()['webpage']
            except (UnicodeError,
                    requests.exceptions.InvalidURL,
                    requests.exceptions.MissingSchema,
                    AttributeError,
                    requests.exceptions.ConnectionError):
                continue
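# A hedged alternative to the hard-coded offset above: resume from the last
# processed name instead. 'Last Company Ltd' is a hypothetical placeholder,
# and this assumes df keeps its default RangeIndex.
last_done = 'Last Company Ltd'
start = int(df.index[df['name'] == last_done][0]) + 1
for company_name in tqdm(df['name'].iloc[start:]):
    pass  # same scraping body as above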
class BuiltWith(DataSource):
    def __init__(self, company_name):
        super().__init__(company_name)
        self.cache = Cache("modules/BUILTWITH/cache")
        self.resolv = WebpageResolver(company_name)
        self.company_name = self.resolv.company_name

    def return_data(self):
        temp_cache = self.cache.check_cache(self.company_name)
        if temp_cache is not None:
            return {"BuiltWith": temp_cache}
        out = []
        for link in self.resolv.return_data()['webpage']:
            try:
                res = builtwith.builtwith(link)
                if res not in out:
                    out.append(res)
            except Exception as e:
                print(e, "so what")
        self.cache.append([self.company_name, out])
        return {"BuiltWith": out}
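# A minimal usage sketch ("Example Corp" is a hypothetical name). The
# builtwith library returns one dict per site, mapping technology
# categories to lists, e.g. {'web-servers': ['Nginx']}.
bw = BuiltWith("Example Corp")
print(bw.return_data()["BuiltWith"])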
def __init__(self, company_name):
    super().__init__(company_name)
    resolv = WebpageResolver(company_name)
    self.company_name = resolv.company_name
    self.data_sources = {"Webpages": resolv.cache}
from .network import Network
from modules import WebpageResolver
import numpy as np

# WORKING EXAMPLES: Chinatsu and Partners
cache = WebpageResolver('Mango').cache.index
to_search = np.random.choice(cache, size=10)
for i in to_search:
    print(i)
    module = Network(i)
    print(i, module.return_data())
    print(module.find_company("bitcoin"))
def get_WebpageResolver():
    res = WebpageResolver(request.args['name'].lower()).return_data()
    res = {i: str(j) for i, j in res.items()}
    return jsonify(res)
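# A hedged usage sketch: this handler is presumably registered on a Flask
# app with a route decorator not shown here (the path /WebpageResolver and
# port 5000 are assumptions), so a query would look like:
#   curl "http://localhost:5000/WebpageResolver?name=example%20corp"
# and return the resolver's fields JSON-encoded as strings.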