def test(self, company_name):
    job = rq.get_current_job()
    print job.meta.keys()
    if "queue_name" in job.meta.keys():
        print RQueue()._has_completed(job.meta["queue_name"])
        print RQueue()._has_completed("queue_name")
        if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
def search_sources(self, domain, api_key, name=""):
    # If toofr already knows the pattern, persist it and return early.
    pattern = Toofr().get(domain)
    if pattern:
        ptn = {"domain": domain,
               "company_email_pattern": [{"source": "toofr", "pattern": pattern}]}
        self._find_if_object_exists('EmailPattern', 'domain', domain, ptn)
        Webhook()._update_company_email_pattern(ptn)
        return pattern
    # synchronous jigsaw search
    # job_5 = q.enqueue(Sources()._jigsaw_search, domain)
    job_1 = q.enqueue(Sources()._whois_search, domain)
    job_2 = q.enqueue(Sources()._google_span_search, domain)
    job_3 = q.enqueue(Sources()._press_search, domain, api_key)
    job_4 = q.enqueue(Sources()._zoominfo_search, domain)
    jobs = [job_1, job_2, job_3, job_4]
    if name != "":
        job_5 = q.enqueue(Sources()._mx_server_check, name, domain)
        job_6 = q.enqueue(Sources()._linkedin_login_search, name, domain)
        jobs = jobs + [job_5, job_6]
    for job in jobs:
        RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
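# Illustrative sketch (not part of the pipeline; placeholder values): every job
# enqueued above is tagged by RQueue()._meta with the queue name
# "<domain>_<api_key>", so a later worker can tell when the whole fan-out has
# finished, e.g.:
#
#   queue_name = "{0}_{1}".format("example.com", "demo-api-key")
#   if RQueue()._has_completed(queue_name):
#       # all of job_1..job_6 for this domain/api_key pair are done
#       pass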
def _email_pattern(self, domain, api_key=""):
    ''' Score email pattern based on number of occurrences '''
    qry = {'where': json.dumps({'domain': domain}), 'limit': 1000}
    crawls = Parse().get('CompanyEmailPatternCrawl', qry)
    crawls = pd.DataFrame(crawls.json()['results'])
    df = crawls[crawls.pattern.notnull()].drop_duplicates('email')
    _df = df[df.crawl_source != "mx_check"]
    df = df[df.crawl_source == "mx_check"].drop_duplicates('pattern')
    # keep mx_check results only when they agree on at most two patterns
    if len(df.pattern) > 2:
        df = df[df.crawl_source != "mx_check"]
    df = _df.append(df)
    df = df.pattern.value_counts()
    score = pd.DataFrame()
    score['pattern'], score['freq'] = df.index, df.values
    score['score'] = [freq / float(score.freq.sum()) for freq in score['freq']]
    score['source'], score['tried'] = 'clearspark', False
    score = score.fillna("")
    score = score.to_dict('records')
    #print score, api_key
    print "SCORE"
    print score
    score = {'domain': domain, 'company_email_pattern': score}
    self._find_if_object_exists('EmailPattern', 'domain', domain, score)
    # TODO - add date crawled to score
    if RQueue()._has_completed("{0}_{1}".format(domain, api_key)):
        if score['company_email_pattern'] == []:
            score['email_guess'] = EmailGuess()._random()
            #q.enqueue(Sources()._jigsaw_search, domain)
        Webhook()._update_company_email_pattern(score)
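# Worked example (illustrative values only): score is each pattern's share of
# all observed occurrences. If value_counts() yields
#   {first}.{last}  6
#   {f}{last}       2
# then freq.sum() == 8, so the scores are 6/8 = 0.75 and 2/8 = 0.25.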
def employee_webhook(self, company_name, company_list, qry="", limit=5,
                     list_id="", _report=""):
    _user, _company = company_list['user'], company_list['company']
    employees = Companies()._employees(company_name, qry)
    company = Companies()._get_info(company_name)
    _company_list = company_list['objectId']
    for index, row in employees.iterrows():
        data = row.to_dict()
        company['user'], company['company'] = _user, _company
        # prospect aliases company, so prospect fields are written onto company
        prospect = company
        prospect['name'], prospect['pos'] = row['name'], row['title']
        prospect['city'] = row['locale']
        prospect['linkedin_url'] = row['linkedin_url']
        prospect['lists'] = [Parse()._pointer('ProspectList', list_id)]
        if type(company['industry']) is list:
            company['industry'] = company['industry'][0]
        prospect['company_profile'] = company_list['profile']
        r = Prospecter().create('Prospect', company)
        print "prospect_create_result", r.json()
    if RQueue()._has_completed("{0}_{1}".format(_company_list, list_id)):
        data = {'done': arrow.now().timestamp}
        r = Prospecter().update("SignalReport/" + _report, data)
        print "employee_webhook_has_completed -->", r.json()
def _press_search(self, domain, api_key):
    pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
    bw = Google().search('"{0}" site:businesswire.com'.format(domain))
    #job_queue_lol = objectId+str(arrow.now().timestamp)
    print bw, pw
    pw = pw if not pw.empty else pd.DataFrame(columns=["link"])
    bw = bw if not bw.empty else pd.DataFrame(columns=["link"])
    queue = "press-check-" + domain
    for link in pw.link:
        job = q.enqueue(PRNewsWire()._email, domain, link, timeout=3600)
        RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
    for link in bw.link:
        job = q.enqueue(BusinessWire()._email, domain, link, timeout=3600)
        RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
def _domain_research(self, domain, api_key="", name="", prospect_name=""):
    # Primary Research
    if name == "":
        name = domain
    x = 6000
    j1 = q.enqueue(Zoominfo()._domain_search, domain, api_key, name, timeout=x)
    j2 = q.enqueue(Linkedin()._domain_search, domain, api_key, name, timeout=x)
    j3 = q.enqueue(YellowPages()._domain_search, domain, api_key, name, timeout=x)
    j4 = q.enqueue(Yelp()._domain_search, domain, api_key, name, timeout=x)
    j5 = q.enqueue(Forbes()._domain_search, domain, api_key, name, timeout=x)
    j6 = q.enqueue(GlassDoor()._domain_search, domain, api_key, name, timeout=x)
    j7 = q.enqueue(Hoovers()._domain_search, domain, api_key, name, timeout=x)
    j8 = q.enqueue(Crunchbase()._domain_search, domain, api_key, name, timeout=x)
    j9 = q.enqueue(Facebook()._domain_search, domain, api_key, name, timeout=x)
    j10 = q.enqueue(Twitter()._domain_search, domain, api_key, name, timeout=x)
    j11 = q.enqueue(Indeed()._domain_search, domain, api_key, name, timeout=x)
    jobs = [j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11]
    for job in jobs:
        RQueue()._meta(job, "{0}_{1}".format(name, api_key), prospect_name)
def parse(self, url, company_name):
    cache = Google().cache(url)
    soup = BeautifulSoup(cache)
    p = []
    for i in soup.find_all("div", {"class": "entityblock"}):
        try:
            img = i.find("img")["data-delayed-url"]
        except:
            img = i.find("img")["src"]
        profile = i.find("a")["href"]
        name = i.find("h3", {"class": "name"})
        name = name.text if name else ""
        title = i.find("p", {"class": "headline"})
        title = title.text if title else ""
        company = title.split("at ")[-1]
        title = title.split(" at ")[0]
        city = i.find("dd")
        city = city.text if city else ""
        cols = ["img", "profile", "name", "title", "city", "company"]
        vals = [img, profile, name, title, city, company]
        print vals
        p.append(dict(zip(cols, vals)))
    print p
    results = pd.DataFrame(p)
    if " " in company_name:
        results['company_score'] = [fuzz.partial_ratio(company_name, company)
                                    for company in results.company]
    else:
        results['company_score'] = [fuzz.ratio(company_name, company)
                                    for company in results.company]
    results = results[(results.company_score > 64)]
    data = {'data': results.to_dict("r"), 'company_name': company_name}
    CompanyExtraInfoCrawl()._persist(data, "employees", "")
    job = rq.get_current_job()
    print job.meta.keys()
    if "queue_name" in job.meta.keys():
        if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
    return p
def _employees(self, domain, api_key="", company_name="", keyword=""):
    ''' Linkedin Scrape '''
    # TODO - add linkedin directory search
    args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
    args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
    qry = '"at {0}" {1} {2} site:linkedin.com'
    results = Google().search(qry.format(company_name, args, keyword), 10)
    if results.empty:
        if domain == "":
            ''' return results '''
        else:
            results = Google().search(qry.format(domain, args, keyword))
    results = results.dropna()
    results = Google()._google_df_to_linkedin_df(results)
    _name = '(?i){0}'.format(company_name)
    if " " in company_name:
        results['company_score'] = [fuzz.partial_ratio(_name, company)
                                    for company in results.company]
    else:
        results['company_score'] = [fuzz.ratio(_name, company)
                                    for company in results.company]
    if keyword != "":
        results['score'] = [fuzz.ratio(keyword, title) for title in results.title]
        results = results[results.score > 75]
    results = results[results.company_score > 64]
    results = results.drop_duplicates()
    data = {'data': results.to_dict('r'), 'company_name': company_name}
    data["domain"] = domain
    CompanyExtraInfoCrawl()._persist(data, "employees", api_key)
    job = rq.get_current_job()
    if "queue_name" in job.meta.keys():
        if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
    return results
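# Illustrative example (placeholder names, not real data) of the query string
# this method sends to Google for company_name="Acme Corp", keyword="engineer":
#
#   "at Acme Corp" -inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"
#   -inurl:"job" -inurl:"jobs2" -inurl:"company" engineer site:linkedin.com
#
# i.e. public LinkedIn profiles mentioning "at <company>", with directory,
# job, and company pages excluded.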
def _daily_secondary_research(self, name, domain, api_key=""):
    # Secondary Research - sometimes require location or domain
    if name == "":
        name = domain
    x = 6000
    j0 = q.enqueue(Companies()._company_blog, domain, api_key, name, timeout=x)
    j2 = q.enqueue(GlassDoor()._reviews, domain, api_key, name, timeout=x)
    j3 = q.enqueue(Companies()._press_releases, domain, api_key, name, timeout=x)
    j4 = q.enqueue(Companies()._news, domain, api_key, name, timeout=x)
    j5 = q.enqueue(Companies()._hiring, domain, api_key, name, timeout=x)
    j6 = q.enqueue(Twitter()._daily_news, domain, api_key, name, timeout=x)
    j7 = q.enqueue(Facebook()._daily_news, domain, api_key, name, timeout=x)
    j8 = q.enqueue(Linkedin()._daily_news, domain, api_key, name, timeout=x)
    # TODO - general pages on their site
    jobs = [j0, j2, j3, j4, j5, j6, j7, j8]
    for job in jobs:
        RQueue()._meta(job, "{0}_{1}".format(name, api_key))
def _research(self, name, api_key="", prospect_name=""):
    # Primary Research
    j9 = q.enqueue(Facebook()._company_profile, name, api_key, timeout=6000)
    j10 = q.enqueue(Twitter()._company_profile, name, api_key, timeout=6000)
    j11 = q.enqueue(Indeed()._company_profile, name, api_key, timeout=6000)
    j0 = q.enqueue(BusinessWeek()._company_profile, name, api_key, timeout=6000)
    j1 = q.enqueue(Zoominfo()._company_profile, name, api_key, timeout=6000)
    j2 = q.enqueue(Linkedin()._company_profile, name, api_key, timeout=6000)
    j3 = q.enqueue(YellowPages()._company_profile, name, api_key, timeout=6000)
    j4 = q.enqueue(Yelp()._company_profile, name, api_key, timeout=6000)
    j5 = q.enqueue(Forbes()._company_profile, name, api_key, timeout=6000)
    j6 = q.enqueue(GlassDoor()._company_profile, name, api_key, timeout=6000)
    j7 = q.enqueue(Hoovers()._company_profile, name, api_key, timeout=6000)
    j8 = q.enqueue(Crunchbase()._company_profile, name, api_key, timeout=6000)
    jobs = [j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11]
    for job in jobs:
        RQueue()._meta(job, "{0}_{1}".format(name, api_key), prospect_name)
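# Note (sketch, based only on the tagging shown above): each _company_profile
# job is tagged with the queue name "{0}_{1}".format(name, api_key), so
# _company_info below can call RQueue()._has_completed("<name>_<api_key>") to
# tell when every source crawl for the company has finished before firing the
# webhook.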
def _company_info(self, company_name, api_key=""):
    # TODO - company_name = self._remove_non_ascii(company_name) add to save
    qry = {'where': json.dumps({'company_name': company_name}), 'limit': 1000}
    qry['order'] = '-createdAt'
    crawls = Parse().get('CompanyInfoCrawl', qry).json()['results']
    if not crawls:
        # start crawls
        return company_name
    crawls = self._source_score(pd.DataFrame(crawls))
    crawls = self._logo_score(crawls)
    #crawls = crawls[crawls.api_key == api_key]
    crawls['name_score'] = [fuzz.token_sort_ratio(row['name'], row.company_name)
                            for index, row in crawls.iterrows()]
    crawls = crawls[crawls.name_score > 70].append(crawls[crawls.name.isnull()])
    logo = crawls.sort("logo_score", ascending=False)
    #logo = logo[(logo.logo != "") & (logo.logo.notnull())][["source", "logo"]]
    logo = logo[(logo.logo != "") & (logo.logo.notnull())].logo.tolist()
    logo = logo[0] if logo else ""
    #crawls = crawls[["press", 'source_score', 'source', 'createdAt', 'domain']]
    final = {}
    #print crawls.press.dropna()
    # For each column, pick one value: dedupe by value (keeping the newest
    # createdAt), then take the value from the highest source_score.
    for col in crawls.columns:
        if col in ['source_score', 'source', 'createdAt']:
            continue
        df = crawls[[col, 'source_score', 'source', 'createdAt']]
        if df[col].dropna().empty:
            continue
        if type(list(df[col].dropna())[0]) == list:
            df[col] = df[col].dropna().apply(tuple)
        try:
            df = df[df[col] != ""]
        except:
            "lol"
        try:
            df = df[df[col].notnull()]
            df = [source[1].sort('createdAt').drop_duplicates(col, True)
                  for source in df.groupby(col)]
            df = [_df for _df in df if _df is not None]
            df = [pd.DataFrame(columns=['source_score', col])] if len(df) == 0 else df
            df = pd.concat(df).sort('source_score')[col]
            if list(df):
                final[col] = list(df)[-1]
        except:
            "lol"
    if 'industry' in final.keys():
        try:
            final['industry'] = final['industry'][0]
        except:
            final["industry"] = ""
        try:
            final['industry_keywords'] = list(set(crawls.industry.dropna().sum()))
        except:
            final['industry_keywords'] = []
    if 'address' in final.keys():
        final['address'] = FullContact()._normalize_location(final['address'])
    try:
        final['handles'] = crawls[['source', 'handle']].dropna()
        final['handles'] = final['handles'].drop_duplicates().to_dict('r')
    except:
        "lol"
    tmp = crawls[['source', 'logo']].dropna()
    #print tmp
    #print "THE LOGO", logo
    final["logo"] = logo
    final['logos'] = tmp.drop_duplicates().to_dict('r')
    try:
        tmp = crawls[['source', 'phone']].dropna()
        final['phones'] = tmp.drop_duplicates().to_dict('r')
    except:
        """ """
    # TODO - if company_name exists update
    # TODO - find if domain exists under different company_name then update
    final = self._prettify_fields(final)
    if "name_score" in final.keys():
        del final["name_score"]
    #print json.dumps(final)
    self._add_to_clearspark_db('Company', 'company_name', company_name, final)
    # TODO - find main domain from domain -> ie canon.ca should be canon.com
    # clean data - ie titleify fields, and lowercase domain
    # TODO - start a domain search with the deduced domain and the company_name
    #print "RQUEUE CHECK"
    if "domain" in final.keys():
        domain = final["domain"]
    '''
    if len(RQueue()._results("{0}_{1}".format(company_name, api_key))) == 1:
        q.enqueue(Companies()._domain_research, domain, api_key, company_name)
        q.enqueue(Companies()._secondary_research, company_name, domain, api_key)
    '''
    if RQueue()._has_completed("{0}_{1}".format(company_name, api_key)):
        #q.enqueue(Companies()._domain_research, domain, api_key, company_name)
        #q.enqueue(Companies()._secondary_research, company_name, domain, api_key)
        print "WEBHOOK <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
        if "company_name" in final.keys():
            Webhook()._update_company_info(final)
        '''
        job = q.enqueue(EmailGuess().search_sources, final["domain"], api_key, "")
        job.meta["{0}_{1}".format(company_name, api_key)] = True
        job.save()
        for domain in crawls.domain.dropna().drop_duplicates():
            job = q.enqueue(EmailGuess().search_sources, domain, api_key, "")
            RQueue()._meta(job, "{0}_{1}".format(company_name, api_key))
        '''
    return final
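# How the per-field merge in _company_info resolves conflicts (illustrative
# values only): within each column the rows are deduped by value (keeping the
# most recent createdAt) and sorted by source_score, and final[col] takes the
# last element, i.e. the value reported by the highest-scoring source. For
# example, if 'phone' were "555-0100" from a source with source_score 3 and
# "555-0199" from a source with source_score 7, final['phone'] would be
# "555-0199".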