def _reviews(self, domain, api_key="", name=""):
    """Scrape the employee reviews listed for the company called ``name``.

    The company is looked up with ``Google().search``; the first hit is
    fetched through ``Crawlera``; the "sort by date" link on that page is
    then rendered by the local ``render.html`` endpoint (port 8950) and
    every ``li.empReview`` element of the rendered page becomes one row.

    Args:
        domain: Unused; kept so existing callers keep working.
        api_key: Unused; kept so existing callers keep working.
        name: Company name interpolated into the search query.

    Returns:
        A ``pandas.DataFrame`` holding ``pros``, ``cons``, ``extra``,
        ``summary``, ``date`` and ``timestamp`` for each review (an empty
        frame when the page lists no reviews), or ``None`` when the search
        returns nothing or the landing page has no "sort by date" link.
    """
    df = Google().search(' {0}'.format(name))
    if df.empty:
        return None

    # The frame is known to be non-empty here, so take the first row by
    # position instead of going through the deprecated ``.ix`` indexer.
    landing = BeautifulSoup(Crawlera().get(df.iloc[0].link).text)

    sort_link = landing.find("a", {"class": "sortByDate"})
    if not sort_link:
        return None
    url = "" + sort_link["href"]
    print(url)

    # Let requests percent-encode the target: pasting it raw into the query
    # string lets any "&" or "#" inside it be read as part of the render
    # request rather than of the page address.
    rendered = requests.get("http://localhost:8950/render.html",
                            params={"url": url})
    page = BeautifulSoup(rendered.text)

    # TODO - awards; the overall rating (div.ratingNum) is not returned yet.

    # (column, tag, css class) for each piece of text taken from a review.
    fields = (
        ("pros", "p", "pros"),
        ("cons", "p", "cons"),
        ("extra", "p", "notranslate"),
        ("summary", "span", "summary"),
        ("date", "time", "date"),
    )

    # Collect plain dicts and build the frame once; appending to a
    # DataFrame inside the loop copies the whole frame on every review.
    rows = []
    for review in page.find_all('li', {'class': 'empReview'}):
        data = {}
        for column, tag, css_class in fields:
            node = review.find(tag, {'class': css_class})
            data[column] = node.text.strip() if node else ""
        data["timestamp"] = Helper()._str_to_timestamp(data["date"])
        rows.append(data)
    return pd.DataFrame(rows)
def _html_to_dict(self, url):
    """Scrape a company profile page into a dict.

    Args:
        url: Address of the profile page; fetched through ``Crawlera``.

    Returns:
        ``{}`` when the page has no ``h1#company_name``. Otherwise a dict
        with ``name`` and ``description`` (``""`` when ``span#desc_short``
        is absent), plus ``website`` and ``domain`` when a link whose text
        contains "website" is found inside a ``td.metadata_content`` cell.
        If several such links exist, the last one wins.
    """
    # Parse the document once and reuse the tree for every lookup.
    soup = BeautifulSoup(Crawlera().get(url).text)

    heading = soup.find('h1', {'id': 'company_name'})
    if heading is None:
        return {}

    desc = soup.find('span', {'id': 'desc_short'})
    data = {'name': heading.text, 'description': desc.text if desc else ""}

    for cell in soup.find_all('td', {'class': 'metadata_content'}):
        for link in cell.find_all('a'):
            if "website" not in link.text:
                continue
            href = link.get('href')
            if not href:
                continue
            print(href)
            # The real site is carried as a query-string value of the href;
            # a link without "=" has no such value, so skip it rather than
            # fail on the index lookup.
            parts = urllib.unquote(href).split('=')
            if len(parts) < 2:
                continue
            website = parts[1].split('?')[0].split('&')[0]
            extracted = tldextract.extract(website)
            data["website"] = website
            data["domain"] = "{}.{}".format(extracted.domain, extracted.tld)
    return data
def _search(self, qry, page, location='', country=None):
    '''
    Fetch one page of date-sorted Indeed results for a job-title query.

    Input  : qry      - job title to search for
             page     - zero-based page index (50 results per page)
             location - value sent as the "l" parameter
             country  - "Canada", "USA", or anything else for both
    Output : List with the raw html of every page that was requested
             (one entry for a single country, two otherwise)
    '''
    params = {
        'q': '{0}'.format(qry),
        'sort': 'date',
        'start': page * 50,
        'limit': 50,
        'l': location,
    }
    print("INDEED LOCALE %s %s" % (location, country))

    encoded = urllib.urlencode(params)
    canadian_url = "" + encoded
    american_url = "" + encoded

    if country == "Canada":
        urls = [canadian_url]
    elif country == "USA":
        urls = [american_url]
    else:
        # Unknown or missing country: query both sites, Canadian first.
        urls = [canadian_url, american_url]

    pages = []
    for url in urls:
        pages.append(Crawlera().get(url).text)
    return pages
def _html_to_dict(self, url):
    """Scrape a business listing page into a company dict.

    Args:
        url: Address of the listing; fetched through ``Crawlera`` and also
            stored in the result as ``handle``.

    Returns:
        ``{"handle": url}`` when the page has no ``h1[itemprop=name]`` with
        a ``<strong>`` inside it. Otherwise a dict with ``name``,
        ``address``, ``city``, ``state``, ``postal_code``, ``description``,
        ``logo``, ``website``, ``domain``, ``numbers`` (label -> phone
        number, plus the ``main`` entry) and ``handle``.

    Raises:
        AttributeError, TypeError: If the name is present but one of the
            other expected elements is missing from the page.
    """
    html = Crawlera().get(url).text
    print(url)
    # Build the tree once; every lookup below reads the same document.
    soup = BeautifulSoup(html)

    heading = soup.find('h1', {'itemprop': 'name'})
    try:
        company_name = heading.find('strong').text
    except AttributeError:
        # Either the heading or its <strong> is missing: not a listing
        # page this scraper understands.
        return {"handle": url}

    address = heading.find('span').text
    city = soup.find('span', {'itemprop': 'addressLocality'}).text
    state = soup.find('span', {'itemprop': 'addressRegion'}).text
    postal_code = soup.find('span', {'itemprop': 'postalCode'}).text
    description = soup.find('article', {'itemprop': 'description'}).text
    description = description.strip().replace('\nMore...', '')
    logo = soup.find('figure').find('img')['src']
    # Keep only what follows the "gourl?" marker in the link target.
    website = soup.find('li', {'class': 'website'}).find('a')['href']
    website = website.split('gourl?')[-1]
    extracted = tldextract.extract(website)
    domain = "{}.{}".format(extracted.domain, extracted.tld)

    # Phone
    phone_block = soup.find('li', {'class': 'phone'})
    main = phone_block.find('strong', {'class': 'primary'}).text
    numbers = {}
    for entry in phone_block.findAll('li'):
        number = entry.find('span').text
        # The label is whatever text precedes the number in the entry.
        label = entry.text.split(number)[0]
        numbers[label] = number
    numbers['main'] = main

    company = {
        "name": company_name,
        "address": address,
        "city": city,
        "state": state,
        "postal_code": postal_code,
        "description": description,
        "logo": logo,
        "website": website,
        "domain": domain,
    }
    company["numbers"] = numbers
    company["handle"] = url
    return company
def _html_to_dict(self, url):
    """Scrape a profile page whose facts live in a ``div.ataglanz`` block.

    Each line of that block containing a colon is read as ``key: value``.

    Args:
        url: Address of the profile; fetched through ``Crawlera`` and also
            stored in the result as ``handle``.

    Returns:
        ``{}`` when the page has no ``div.ataglanz``. Otherwise the parsed
        key/value pairs plus ``logo``, ``description``, ``name`` and
        ``handle``.

    Raises:
        AttributeError, TypeError: If the facts block exists but the logo,
            ``p#bio`` or ``hgroup`` element is missing.
    """
    bs = BeautifulSoup(Crawlera().get(url).text)

    glance = bs.find('div', {'class': 'ataglanz'})
    if not glance:
        return {}

    info = {}
    for line in glance.text.split('\n'):
        # Split on the first colon only. Splitting on every ": " produced
        # one or three pieces for lines such as "Key:Value" or
        # "Hours: 9: 30", which made the dict() construction fail.
        key, found, value = line.partition(':')
        if not found:
            continue
        info[key.strip()] = value.strip()

    info['logo'] = bs.find('div', {'class': 'profileLeft'}).find('img')['src']
    info['description'] = bs.find('p', {'id': 'bio'}).text
    info['name'] = bs.find('hgroup').text
    info['handle'] = url
    return info
def _html(self, qry, page=1, location="", country=None):
    """Fetch one page of search results and return it parsed.

    Args:
        qry: Search phrase, sent as the ``search`` parameter.
        page: Page number, sent as the ``page`` parameter.
        location: Sent as the ``location`` parameter.
        country: When truthy, appended to ``location`` after a space.

    Returns:
        A ``BeautifulSoup`` tree of the page fetched through ``Crawlera``.
    """
    if country:
        location += " " + country

    params = {
        'search': '{0}'.format(qry),
        'page': page,
        'location': location,
        # NOTE(review): presumably restricts results by age in days -
        # confirm against the target site.
        'days': 1,
    }
    search_url = "{0}".format(urllib.urlencode(params))
    print(search_url)

    response = Crawlera().get(search_url)
    return BeautifulSoup(response.text)
def _parse_article_html(self, objectId, url, industry_press=None):
    """Scrape a press release and store its text on a Parse object.

    The article body is the text of ``div#ReleaseContent``. The company
    name is whatever follows "SOURCE " in the first paragraph containing
    that marker.

    Args:
        objectId: Id of the Parse object to update.
        url: Address of the press release; fetched through ``Crawlera``.
        industry_press: When truthy the "IndustryPress" class is updated,
            otherwise "Press".

    Returns:
        None. The JSON body of the Parse response is printed.

    Raises:
        AttributeError: If the page has no ``div#ReleaseContent``.
    """
    # Parse once. The paragraph scan used to read ``r.text`` before ``r``
    # was assigned, which raised UnboundLocalError on every call.
    soup = BeautifulSoup(Crawlera().get(url).text)

    article = soup.find("div", {"id": "ReleaseContent"}).text

    sources = [
        p.text.split("SOURCE ")[-1]
        for p in soup.find_all("p")
        if "SOURCE " in p.text
    ]
    # Not every release carries a SOURCE line; store an empty name then.
    company_name = sources[0] if sources else ""
    #q.enqueue(ClearSpark()._bulk_company_info, company_name)

    data = {"article": article, "company_name": company_name}
    class_name = "IndustryPress" if industry_press else "Press"
    r = Parse().update(class_name, objectId, data)
    print(r.json())
def _html_to_dict(self, url):
    """Scrape a company page marked up with ``itemprop`` attributes.

    Args:
        url: Address of the page; fetched through ``Crawlera`` and also
            stored in the result as ``handle``.

    Returns:
        A dict with ``name``, ``description``, ``address``, ``phone`` and
        ``website`` (each ``""`` when not found) plus ``handle``. The
        website lookup is currently disabled, so ``website`` is always
        ``""`` and ``domain`` is never added.
    """
    page = BeautifulSoup(Crawlera().get(url).text)

    website = ""  #co.find('div',{'id':'detailsContainer'}).find('a')
    # TODO - figure out why this is not working

    found = [
        ("name", page.find('span', {'itemprop': 'name'})),
        ("description", page.find('p', {'itemprop': 'description'})),
        ("address", page.find('div', {'itemprop': 'address'})),
        ("phone", page.find('div', {'itemprop': 'telephone'})),
        ("website", website),
    ]

    data = {}
    for label, node in found:
        if node:
            data[label] = node.text.strip()
        else:
            data[label] = ""
    print(website)

    if data["website"] != "":
        parts = tldextract.extract(data["website"])
        data['domain'] = "{}.{}".format(parts.domain, parts.tld)
    data['handle'] = url
    return data
def _html_to_dict(self, url):
    """Scrape a business page into a company dict.

    Args:
        url: Address of the page; fetched through ``Crawlera`` and also
            stored in the result as ``handle``.

    Returns:
        A dict with ``name``, ``address``, ``phone`` and ``website`` (each
        ``""`` when not found), ``industry`` as a one-element list holding
        the category text, ``handle``, and ``domain`` when a website was
        found.
    """
    # One parse serves every lookup; the markup does not change between
    # them, so re-parsing per field only cost time.
    soup = BeautifulSoup(Crawlera().get(url).text)

    company_name = soup.find('h1', {'class': 'biz-page-title'})
    industry = soup.find('span', {'class': 'category-str-list'})
    address = soup.find('address', {'itemprop': 'address'})
    phone = soup.find('span', {'itemprop': 'telephone'})
    website = soup.find('div', {'class': 'biz-website'})
    # The site itself is the text of the link inside the website block.
    website = website.find('a') if website else None

    labels = ["name", "industry", "address", "phone", "website"]
    nodes = [company_name, industry, address, phone, website]
    data = dict(
        (label, node.text.strip() if node else "")
        for label, node in zip(labels, nodes)
    )
    data["industry"] = [data["industry"]]
    print(data)

    if data["website"] != "":
        tld = tldextract.extract(self._remove_non_ascii(data["website"]))
        data['domain'] = "{}.{}".format(tld.domain, tld.tld)
    data["handle"] = url
    return data
def _html_to_dict(self, _url):
    """Scrape a company page whose address paragraph also holds the link.

    Args:
        _url: Address of the page; fetched through ``Crawlera`` and also
            stored in the result as ``handle``.

    Returns:
        A dict with ``name`` (heading text up to "Company "), ``phone``,
        ``address`` and ``website`` (each ``""`` when not found), plus
        ``handle``. ``domain`` is added only when a website was found.
        ``address`` is the ``p[itemprop=address]`` text up to the phone
        number, or the whole paragraph text when no phone number is
        present.
    """
    soup = BeautifulSoup(Crawlera().get(_url).text)

    name = soup.find('h1', {'itemprop': 'name'})
    name = name.text.split('Company ')[0] if name else ""

    telephone = soup.find('span', {'itemprop': 'telephone'})
    telephone = telephone.text if telephone else ""

    address = ""
    website = ""
    address_node = soup.find('p', {'itemprop': 'address'})
    if address_node:
        address = address_node.text
        if telephone:
            # Only cut at the phone number when there is one: splitting on
            # an empty string raises ValueError, which used to be swallowed
            # and threw the address away.
            address = address.split(telephone)[0]
        address = address.strip()
        link = address_node.find('a')
        website = link.text if link else ""

    info = {
        "name": name,
        "phone": telephone,
        "address": address,
        "website": website,
    }
    info['handle'] = _url

    # The key is always present, so test the value: an empty website would
    # otherwise produce the meaningless domain ".".
    if info["website"]:
        tld = tldextract.extract(info["website"])
        info['domain'] = "{}.{}".format(tld.domain, tld.tld)
    return info
def _search(self, qry):
    """Search for tweets matching ``qry`` and store them in Parse.

    Spaces in ``qry`` are replaced with ``%20`` before the results page is
    fetched with ``Crawlera()._get``. Every ``div.tweet`` that has a
    ``p.tweet-text`` becomes one record; tweets without text are skipped.

    Args:
        qry: Search phrase.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet and the columns
        ``text``, ``hashtags``, ``mentions``, ``links``, ``photos``,
        ``name``, ``handle``, ``profile_pic``, ``timestamp`` and
        ``time_ago``. The same frame is passed to
        ``Parse()._batch_df_create("Tweet", ...)`` before returning.
    """
    search_url = "{0}&src=typd".format(qry.replace(" ", "%20"))
    page = BeautifulSoup(Crawlera()._get(search_url).text)

    records = []
    for node in page.find_all("div", {"class": "tweet"}):
        body = node.find("p", {"class": "tweet-text"})
        if not body:
            continue

        hashtag_links = node.find_all("a", {"class": "twitter-hashtag"})
        reply_links = node.find_all("a", {"class": "twitter-atreply"})
        timeline_links = node.find_all("a", {"class": "twitter-timeline-link"})
        photo_imgs = node.find_all("img", {"class": "TwitterPhoto-mediaSource"})
        # Both the epoch value and the relative label come from this span.
        stamp = node.find("span", {"class": "_timestamp"})

        records.append({
            "text": body.text,
            "hashtags": [a["href"] for a in hashtag_links],
            "mentions": ["" + a["href"] for a in reply_links],
            "links": [a["href"] for a in timeline_links],
            "photos": [img["src"] for img in photo_imgs],
            "name": node.find("strong", {"class": "fullname"}).text,
            "handle": node.find("span", {"class": "username"}).text,
            "profile_pic": node.find("img", {"class": "avatar"})["src"],
            "timestamp": stamp["data-time"],
            "time_ago": stamp.text,
        })
        #CompanyExtraInfoCrawl()._persist(tweet, "tweets")

    tweets = pd.DataFrame(records)
    Parse()._batch_df_create("Tweet", tweets)
    # TODO - find company_name + title from twitter
    return tweets