def _bulk_upload(self, data, user):
    # data and user arrive as JSON strings; decode before use
    print user
    data, user = json.loads(data), json.loads(user)
    _data = pd.DataFrame(data)[["company_name"]]
    _user = Parse()._pointer("_User", user["objectId"])
    _data["user"] = [_user for i in _data.index]
    _data["user_company"] = [user["user_company"] for i in _data.index]
    # create a dated upload list to hold every company in this batch
    _list = {"user": _user,
             "user_company": user["user_company"],
             "list_type": "upload",
             "name": "Upload - " + arrow.utcnow().format("DD-MM-YYYY")}
    _list = Prospecter().create("CompanyProspectList", _list).json()
    print _list
    _list = Parse()._pointer("CompanyProspectList", _list["objectId"])
    _data["lists"] = [[_list] for i in _data.index]
    Prospecter()._batch_df_create("CompanyProspect", _data)
    # kick off research for each uploaded company
    for i in data:
        #q.enqueue(Companies()._bulk, i["company_name"])
        r = requests.get("https://clear-spark.herokuapp.com/v1/companies/research",
                         params={"bulk": "bulk",
                                 "api_key": "9a31a1defcdc87a618e12970435fd44741d7b88794f7396cbec486b8",
                                 "company_name": i["company_name"]})
        print r.text
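# Example call (a sketch, not part of the production flow): _bulk_upload
# expects both arguments as JSON strings, e.g. straight from a webhook
# payload. The user dict is assumed to carry objectId and a user_company
# pointer; the owning class of these methods is not shown in this section.
#
#   data = json.dumps([{"company_name": "Acme Inc"},
#                      {"company_name": "Initech"}])
#   user = json.dumps({"objectId": "abc123",
#                      "user_company": {"__type": "Pointer",
#                                       "className": "UserCompany",
#                                       "objectId": "uc456"}})
#   self._bulk_upload(data, user)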
def _google_contact_import(self, access_token, user, user_company):
    print access_token, user, user_company
    GOOGLE_CLIENT_ID = "1949492796-qq27u1gnqoct2n6p3hctb0cto58qel5i.apps.googleusercontent.com"
    GOOGLE_CLIENT_SECRET = "GpZlpLB66sU5v9SDPnPf-Ov1"
    # GData client authorized with the OAuth2 access token
    token = gdata.gauth.OAuth2Token(client_id=GOOGLE_CLIENT_ID,
                                    client_secret=GOOGLE_CLIENT_SECRET,
                                    scope='https://www.google.com/m8/feeds',
                                    user_agent='app.testing',
                                    access_token=access_token)
    contact_client = gdata.contacts.client.ContactsClient()
    token.authorize(contact_client)
    feed = contact_client.GetContacts()
    # walk the GData feed (values are not collected here)
    for entry in feed.entry:
        entry.title.text
        for e in entry.email:
            e.address
    # paginate the JSON contacts feed, 50 entries per page
    contacts = []
    for i in range(0, 10):
        # start-index is 1-based, so page i starts at i*50 + 1
        index = i * 50 + 1
        url = ('https://www.google.com/m8/feeds/contacts/default/full'
               '?access_token={0}&alt=json&max-results=50&start-index={1}')
        url = url.format(access_token, index)
        res = requests.get(url).text
        data = json.loads(res)
        if "entry" not in data["feed"].keys():
            break
        contacts = contacts + data["feed"]["entry"]
    print len(contacts)
    contacts_ = []
    for i, row in pd.DataFrame(contacts)[["gd$email", "title"]].iterrows():
        contacts_.append({"email": row["gd$email"][0]["address"],
                          "name": row["title"]["$t"],
                          "domain": row["gd$email"][0]["address"].split("@")[-1]})
    contacts_ = pd.DataFrame(contacts_)
    contacts_["source"] = "gmail"
    contacts_["db_type"] = "inbox"
    contacts_["user"] = [Parse()._pointer("_User", user) for i in contacts_.index]
    contacts_["user_company"] = [Parse()._pointer("UserCompany", user_company)
                                 for i in contacts_.index]
    Parse()._batch_df_create("UserContact", contacts_)
    Prospecter()._batch_df_create("UserContact", contacts_)
    print Prospecter().update("_User/" + user,
                              {"google_integration": arrow.utcnow().timestamp,
                               "google_token": access_token}).json()
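# A minimal pagination sketch (assuming the same m8 JSON feed as above) that
# walks pages until the feed is exhausted instead of stopping after 10 pages.
# _all_google_contacts is hypothetical; the method above caps imports at 500
# contacts (10 pages of 50).
#
#   def _all_google_contacts(access_token, page_size=50):
#       contacts, start = [], 1
#       while True:
#           url = ('https://www.google.com/m8/feeds/contacts/default/full'
#                  '?access_token={0}&alt=json&max-results={1}&start-index={2}')
#           res = requests.get(url.format(access_token, page_size, start))
#           feed = json.loads(res.text)["feed"]
#           if "entry" not in feed:
#               break
#           contacts += feed["entry"]
#           start += page_size
#       return contacts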
def _add_reports(self, list_name, companies, company_list, _profile):
    company_list_id = company_list['objectId']
    _user, _company = companies[0]['user'], companies[0]['company']
    data = {'name': list_name, 'user': _user, 'company': _company}
    _company_list = Parse()._pointer('CompanyProspectList', company_list_id)
    data['parent_list'], data['list_type'] = _company_list, 'mining_job'
    _report = {'report_type': 'company_employee_mining_job',
               'profile': _profile}
    signal_report = Parse().create('SignalReport', _report).json()
    _report = Parse()._pointer('SignalReport', signal_report['objectId'])
    _list_id = Prospecter().create('ProspectList', data).json()['objectId']
    _prospect_list = Parse()._pointer('ProspectList', _list_id)
    _report = {'reports': {'__op': 'AddUnique', 'objects': [_report]}}
    _list = {'prospect_lists': {'__op': 'AddUnique',
                                'objects': [_prospect_list]}}
    r = Prospecter().update('CompanyProspectList/' + company_list_id, _list)
    rr = Prospecter().update('ProspectProfile/' + _profile['objectId'], _report)
    print r.json(), rr.json()
    return (signal_report['objectId'], _list_id)
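# Parse's array operators are used above to link objects without clobbering
# existing relations. For reference, an AddUnique update body has this shape
# (the objectId is illustrative):
#
#   update_body = {
#       "prospect_lists": {
#           "__op": "AddUnique",
#           "objects": [{"__type": "Pointer",
#                        "className": "ProspectList",
#                        "objectId": "xyz789"}]
#       }
#   }
#   # Prospecter().update('CompanyProspectList/<id>', update_body)
#
# AddUnique appends to the array field only when the object is not already
# present, so re-running _add_reports will not duplicate list pointers.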
def _score_report(self, _report):
    _report = Parse()._pointer("SignalReport", _report)
    qry = {"where": json.dumps({"report": _report})}
    qry["limit"] = 1000
    # TODO - where companies are null / undefined
    signals = Prospecter().get("CompanySignal", qry).json()["results"]
    api_key = "9a31a1defcdc87a618e12970435fd44741d7b88794f7396cbec486b8"
    for company in signals:
        company_name = company["company_name"]
        q.enqueue(CompanyScore()._company_info, company_name)
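# The TODO above could be served by Parse's $exists operator, which matches
# rows where a field is unset. A sketch (assuming CompanySignal rows link
# their companies through a "company" pointer field):
#
#   qry = {"where": json.dumps({"report": _report,
#                               "company": {"$exists": False}}),
#          "limit": 1000}
#   signals = Prospecter().get("CompanySignal", qry).json()["results"]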
def employee_webhook(self, company_name, company_list, qry="", limit=5,
                     list_id="", _report=""):
    _user, _company = company_list['user'], company_list['company']
    employees = Companies()._employees(company_name, qry)
    company = Companies()._get_info(company_name)
    _company_list = company_list['objectId']
    for index, row in employees.iterrows():
        company['user'], company['company'] = _user, _company
        # prospect aliases company: the base company info is reused for
        # every employee, with the person fields overwritten per row
        prospect = company
        prospect['name'], prospect['pos'] = row['name'], row['title']
        prospect['city'] = row['locale']
        prospect['linkedin_url'] = row['linkedin_url']
        prospect['lists'] = [Parse()._pointer('ProspectList', list_id)]
        if type(company['industry']) is list:
            company['industry'] = company['industry'][0]
        prospect['company_profile'] = company_list['profile']
        r = Prospecter().create('Prospect', company)
        print "prospect_create_result", r.json()
    # once every queued company for this list has finished, stamp the report
    if RQueue()._has_completed("{0}_{1}".format(_company_list, list_id)):
        data = {'done': arrow.now().timestamp}
        r = Prospecter().update("SignalReport/" + _report, data)
        print "employee_webhook_has_completed -->", r.json()
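# Note: prospect = company binds the same dict, so every Prospect row mutates
# the shared company payload. If independent records were wanted, a shallow
# copy per row would isolate them (a sketch, not the current behavior):
#
#   prospect = dict(company)   # per-row copy
#   prospect.update({'name': row['name'], 'pos': row['title']})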
def _random(self):
    qry = {'order': '-createdAt'}
    patterns = Prospecter().get('EmailPattern', qry).json()['results']
    email_guesses = []
    for pattern in patterns:
        email_guesses.append({'pattern': pattern['pattern'],
                              'tried': False,
                              'source': 'random_guess'})
    random.shuffle(email_guesses)
    return email_guesses
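# _random returns untried pattern guesses in random order, e.g. (the owning
# class is not shown in this section, so self stands in for it):
#
#   guesses = self._random()
#   for guess in guesses:
#       print guess['pattern'], guess['tried']   # pattern string, False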
def _company_list_employees(self, company_list_id, list_name, title, limit):
    company_list = Prospecter().get('CompanyProspectList/' + company_list_id).json()
    print company_list
    _profile = company_list['profile']
    qry = {"lists": Prospecter()._pointer("CompanyProspectList", company_list_id)}
    qry = {'where': json.dumps(qry), 'order': '-createdAt'}
    companies = Prospecter().get('CompanyProspect', qry).json()['results']
    _report, _list = self._add_reports(list_name, companies, company_list,
                                       _profile)
    # tag each job with the queue name so RQueue()._has_completed can tell
    # when the whole list has been mined
    queue_name = "{0}_{1}".format(company_list_id, _list)
    for company in companies:
        job = q.enqueue(self.employee_webhook, company['name'], company_list,
                        title, limit, _list, _report)
        job.meta[queue_name] = True
        job.save()
    return {'started': True}
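# RQueue()._has_completed is not defined in this section; a plausible sketch
# of such a check over RQ job metadata (an assumption, not the actual
# implementation) would scan the queue for jobs still carrying the tag:
#
#   from redis import Redis
#   from rq import Queue
#
#   def has_completed(queue_name):
#       q = Queue(connection=Redis())
#       pending = [j for j in q.jobs if j.meta.get(queue_name)]
#       return len(pending) == 0   # no tagged jobs left in the queue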
def _salesforce_import(self, session_id, instance, user, user_company):
    print instance
    print instance.replace("https://", "")
    sf = Salesforce(instance=instance.replace("https://", ""),
                    session_id=session_id)
    records = sf.query_all("SELECT Id, Name, Email FROM Contact")
    sf = pd.DataFrame(pd.DataFrame(records).records.tolist())
    sf = sf[["Name", "Email"]]
    sf.columns = ["name", "email"]
    sf = sf.dropna()
    sf["domain"] = [i.split("@")[-1] if i else "" for i in sf.email]
    sf["source"] = "salesforce"
    sf["db_type"] = "crm"
    sf["user"] = [Parse()._pointer("_User", user) for i in sf.index]
    sf["user_company"] = [Parse()._pointer("UserCompany", user_company)
                          for i in sf.index]
    Parse()._batch_df_create("UserContact", sf)
    Prospecter()._batch_df_create("UserContact", sf)
    print Prospecter().update("_User/" + user,
                              {"salesforce_integration": arrow.utcnow().timestamp,
                               "salesforce_token": session_id}).json()
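# The Salesforce client here is simple_salesforce; query_all returns a dict
# whose "records" key holds one ordered dict per row, which is why the result
# is unpacked through pd.DataFrame(records).records.tolist(). A more direct
# equivalent (a sketch):
#
#   rows = sf.query_all("SELECT Id, Name, Email FROM Contact")["records"]
#   df = pd.DataFrame(rows)[["Name", "Email"]]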
def _update_company_email_pattern(self, data):
    if not data:
        return 0
    qry = {'where': json.dumps({'domain': data['domain']})}
    companies = Parse().get('Company', qry).json()
    # retry until Parse stops returning an error payload
    while "error" in companies.keys():
        time.sleep(3)
        companies = Parse().get('Company', qry).json()
    companies = companies['results']
    pattern = {'email_pattern': data['company_email_pattern']}
    if data['company_email_pattern'] == []:
        pattern['email_guess'] = []
    #_pusher['customero'].trigger(data["domain"], pattern)
    # build the update payload once; rebinding data inside the loop would
    # raise a KeyError on the second iteration
    update = {'email_pattern': data['company_email_pattern'],
              'email_pattern_research': arrow.utcnow().timestamp}
    for company in companies:
        r = Parse().update('Company/' + company['objectId'], update)
        print r.json()
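# Updating each matching Company one request at a time costs one round-trip
# per row; Parse's REST batch endpoint can apply up to 50 operations in one
# call. A sketch (assumes direct REST access with the app's keys; the Parse()
# wrapper used here may already expose this via _batch_df_update):
#
#   batch = {"requests": [
#       {"method": "PUT",
#        "path": "/1/classes/Company/" + company["objectId"],
#        "body": update}
#       for company in companies]}
#   # POST batch as JSON to https://api.parse.com/1/batch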
def _old_start(self):
    print "started"
    cp = Parse()._bulk_get("CompanyProspect")
    p = Parse()._bulk_get("Prospect")
    uc = Parse()._bulk_get("UserContact")
    cl = Parse().get("ContactList", {"limit": 1000}).json()["results"]
    print cl
    cl = pd.DataFrame(cl)
    print cl.head()
    cl["user_id"] = [i["objectId"] for i in cl.user]
    # denormalize company fields onto each record
    for count, i in enumerate(cp):
        if "company" in i.keys() and "domain" in i["company"].keys():
            cp[count]["domain"] = i["company"]["domain"]
    for count, i in enumerate(p):
        if "company" in i.keys() and "domain" in i["company"].keys():
            p[count]["domain"] = i["company"]["domain"]
    for count, i in enumerate(uc):
        if "company" in i.keys() and "name" in i["company"].keys():
            uc[count]["company_name"] = i["company"]["name"]
        else:
            uc[count]["company_name"] = ""
    # Adding Lists To Contacts / Prospects
    for count, i in enumerate(cp):
        if "user" not in i.keys():
            continue
        user_id = i["user"]["objectId"]
        _cl = cl[(cl.user_id == user_id) & (cl.db_type == "all_company_prospect")]
        al = cl[(cl.user_id == user_id) & (cl.db_type == "all_feed_prospect")]
        _cl, al = _cl.to_dict('r'), al.to_dict('r')
        all_feed_id = al[0]["objectId"] if al else ""
        list_id = _cl[0]["objectId"] if _cl else ""
        new_lists = [{"objectId": list_id}, {"objectId": all_feed_id}]
        if "lists" in i.keys():
            cp[count]["lists"] = cp[count]["lists"] + new_lists
        else:
            cp[count]["lists"] = new_lists
    for count, i in enumerate(p):
        if "user" not in i.keys():
            continue
        user_id = i["user"]["objectId"]
        _cl = cl[(cl.user_id == user_id) & (cl.db_type == "all_prospect")]
        al = cl[(cl.user_id == user_id) & (cl.db_type == "all_feed_prospect")]
        _cl, al = _cl.to_dict('r'), al.to_dict('r')
        all_feed_id = al[0]["objectId"] if al else ""
        list_id = _cl[0]["objectId"] if _cl else ""
        new_lists = [{"objectId": list_id}, {"objectId": all_feed_id}]
        if "lists" in i.keys():
            p[count]["lists"] = p[count]["lists"] + new_lists
        else:
            p[count]["lists"] = new_lists
    for count, i in enumerate(uc):
        if "user" not in i.keys():
            continue
        db_type, user_id = i["db_type"], i["user"]["objectId"]
        _cl = cl[(cl.user_id == user_id) & (cl.db_type == db_type)]
        al = cl[(cl.user_id == user_id) & (cl.db_type == "all_feed_prospect")]
        _cl, al = _cl.to_dict('r'), al.to_dict('r')
        all_feed_id = al[0]["objectId"] if al else ""
        list_id = _cl[0]["objectId"] if _cl else ""
        new_lists = [{"objectId": list_id}, {"objectId": all_feed_id}]
        if "lists" in i.keys():
            uc[count]["lists"] = uc[count]["lists"] + new_lists
        else:
            uc[count]["lists"] = new_lists
    _p, _cp, _uc = pd.DataFrame(p), pd.DataFrame(cp), pd.DataFrame(uc)
    # for user pointer add user_contact_list pointer
    print _p.shape, _cp.shape, _uc.shape
    i, j, tmp = 0, 0, pd.concat([_cp, _p, _uc]).reset_index()
    print tmp.domain.drop_duplicates().shape
    # group every contact / prospect by domain and queue one research job
    # per domain with the union of its list ids
    for a, b in tmp[["domain", "lists", "company_name", "user"]].groupby("domain"):
        if a == ".":
            continue
        i = i + 1
        if b.lists.dropna().sum():
            j = j + 1
            lists = [ii["objectId"] for ii in b.lists.dropna().sum()
                     if "objectId" in ii.keys()]
            lists = pd.Series(lists).unique().tolist()
            company_name, domain = b.company_name.tolist()[0], a
            #r = requests.post("https://clear-spark.herokuapp.com/v1/clearspark/daily_news",
            #                  headers={'Content-type': 'application/json'},
            #                  data=json.dumps({"company_name": company_name,
            #                                   "domain": domain,
            #                                   "lists": lists, "source": "blog"}))
            #print r.text
            api_key = "9a31a1defcdc87a618e12970435fd44741d7b88794f7396cbec486b8"
            #job = q.enqueue(Companies()._news, domain, api_key, company_name,
            #                timeout=600000)
            company_name = self.remove_non_ascii(company_name)
            domain = self.remove_non_ascii(domain)
            print j, company_name, domain
            job = q.enqueue(Companies()._daily_secondary_research, company_name,
                            domain, api_key, lists, timeout=60000)
            #job = q.enqueue(Companies()._recent_webpages_published, domain,
            #                api_key, company_name, timeout=600000)
            #job.meta["lists"] = lists
            #job.save()
def _update_company_info(self, data, api_key="", name=""):
    print "DATA"
    print data
    company_name = self.remove_accents(data['company_name'])
    # match on either the raw or the accent-stripped company name
    qry = {"where": json.dumps({"$or": [{"company_name": data["company_name"]},
                                        {"company_name": company_name}]})}
    company = Parse().get('Company', qry).json()
    while "results" not in company.keys():
        time.sleep(0.5)
        company = Parse().get('Company', qry).json()
    companies = company['results']
    data = self._unparsify_data(data)
    if companies == []:
        # no match: create the company, retrying until Parse returns an id
        company = Parse().create('Company', data).json()
        while "objectId" not in company.keys():
            time.sleep(0.5)
            company = Parse().create('Company', data).json()
            print "retrying", company
        print company
        companies = [Parse()._pointer('Company', company['objectId'])]
    print data["company_name"]
    company_name = data["company_name"].replace(' ', '-')
    #_pusher['customero'].trigger(company_name, {'company': data})
    print "__STARTED", len(companies)
    for company in companies:
        print "UPDATING COMPANY"
        print Parse().update('Company/' + company['objectId'], data).json()
        _company = Parse()._pointer('Company', company['objectId'])
        classes = ['Prospect', 'CompanyProspect', 'PeopleSignal', 'CompanySignal']
        for _class in classes:
            df = pd.DataFrame()
            objects = Parse().get(_class, qry).json()['results']
            # keep the update payload separate from data so the company
            # payload survives later iterations
            update = {'company': _company,
                      'company_research': arrow.utcnow().timestamp}
            df["objectId"] = [i["objectId"] for i in objects]
            Parse()._batch_df_update(_class, df, update)
            # superseded by the batch update above:
            #for obj in objects:
            #    print "UPDATED", _class, obj
            #    print Parse().update(_class + "/" + obj['objectId'], update).json()
            #TODO - add name email guess
            if _class == 'Prospect':
                print company
                domain = company["domain"]
                #q.enqueue(EmailGuess().search_sources, domain, "", api_key)
    return "updated"
    # dead code: unreachable after the return above (legacy create path)
    #print "CREATING COMPANY"
    #company = Parse().create('Company', data).json()
    #_company = Parse()._pointer('Company', company['objectId'])
    #for _class in ['Prospect', 'CompanyProspect', 'PeopleSignal', 'CompanySignal']:
    #    for obj in Parse().get(_class, qry).json()['results']:
    #        print "UPDATED", _class, obj
    #        print Parse().update(_class + "/" + obj['objectId'],
    #                             {'company': _company}).json()
    #p['customero'].trigger(data["company_name"], {'company': data})
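# Example invocation (a sketch; the field set is illustrative and the full
# payload normally comes from the research pipeline, with self standing in
# for the owning class, which is not shown in this section):
#
#   self._update_company_info({"company_name": "Acme Inc",
#                              "domain": "acme.com"},
#                             api_key=api_key)
#
# The method updates every Company row matching the (raw or accent-stripped)
# name, creating one first only when no match exists, then repoints the
# Prospect / CompanyProspect / PeopleSignal / CompanySignal rows at it.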