def enrich_person__truecaller(self):
    # NOTE: the session cookie below is hard-coded and will expire; it has to be refreshed manually.
    cookie_str = '__cfduid=d2554926f77b4da885e17e2e453d5e4c71481629449; tcToken=eyJpdiI6ImdLOFwvcFUxWWc0NnZHVnA4WkNRYVZLXC9PNjRMUGtmeXUzTldMSzZNZ1RNYz0iLCJ2YWx1ZSI6Ilpvb21DWVVFN0ptdUNKTFl4QnhiRjdKMVB0WTFrTHliRk13dFwvZHFzTklrMXczdnZPeEgzU2RXdnliNVg0bGxISG1IZU15dlRmaDNcL2xIclJoSCtwZUE9PSIsIm1hYyI6IjljOGJmMWQ4NzJmMDIzMDNjMjEwM2U2MzUwY2IxZDcyZWJlZjJhNWUzNGM3NmNhNDEwYzc3MThkMGM5Nzc3NGIifQ%3D%3D; _gat=1; _ga=GA1.2.979500326.1481629451; tcSession=eyJpdiI6IlZ6U2hsaGd1VDRPRHF1V1dwNDFHNnpVbkN6OEpFMk5LeElleWJrdDZuSms9IiwidmFsdWUiOiJzXC9TWldPYXV5WXBlWTk5dEdEZmxlQWwwR29OUThhc2k0eFBqMmF0VFJjVTVtNVhKYmZLR3hHOFdjMFJzYVVuOVlBSERpbVNnK0RNcEk4ME9IV1dnVnc9PSIsIm1hYyI6IjE1OWIzODM0MTFkOWZmOWRlNWVkNDQ1ZjhjYWIwOTEwYmQwNzc2MDIyZmQwMTY0NjM1MTdhNjlhMzgxNjdhNWEifQ%3D%3D; XLBS3=XLBS1|WE/k1|WE/fD'
    # cookie_str = '__cfduid=d2554926f77b4da885e17e2e453d5e4c71481629449; tcToken=eyJpdiI6ImdLOFwvcFUxWWc0NnZHVnA4WkNRYVZLXC9PNjRMUGtmeXUzTldMSzZNZ1RNYz0iLCJ2YWx1ZSI6Ilpvb21DWVVFN0ptdUNKTFl4QnhiRjdKMVB0WTFrTHliRk13dFwvZHFzTklrMXczdnZPeEgzU2RXdnliNVg0bGxISG1IZU15dlRmaDNcL2xIclJoSCtwZUE9PSIsIm1hYyI6IjljOGJmMWQ4NzJmMDIzMDNjMjEwM2U2MzUwY2IxZDcyZWJlZjJhNWUzNGM3NmNhNDEwYzc3MThkMGM5Nzc3NGIifQ%3D%3D; __gads=ID=69a98525476c8ae1:T=1481638323:S=ALNI_MYwuM-deZPXbwoejhOrxXl3ejZJyw; XLBS3=XLBS3|WFAMh|WFABt; tcSession=eyJpdiI6IjBuRFdMb1lmejZhSUFzVWI2cnVCRWdSaHE0QzFjQVNrajZ3RDVidjRTYms9IiwidmFsdWUiOiJMTThFZmtTTkhjeVZSV1FtTWx5MG9xaW55SFp4ZzEyUWY5SFhjMDZZNkpORDllVVp5Y3RScUlpdFU2OGxnbzdEOHJHQzBqaUFtZVZLSk5icGRKc1wvT0E9PSIsIm1hYyI6IjYxOTczZjMyMTQ1NGU2MGM3NjMyOGE0YWYyMTE4Zjg5OWEzOTE5MDMwODQzYjNjOWUyOTYwMWU0MTU2MDZiYWIifQ%3D%3D; _ga=GA1.2.979500326.1481629451'
    headers = {'Content-Type': 'application/json; charset=utf-8',
               'Cookie': cookie_str}

    # Reset the search throttle, then query a specific (hard-coded) number
    url = 'https://www.truecaller.com/throttle/reset/throttleSearch'
    with requests_cache.disabled():
        response = requests.post(url, headers=headers)
    if response.status_code != 200:
        raise EngagementException("%s. %s." % (response.status_code, response.text))

    url = 'https://www.truecaller.com/il/0504333102'
    with requests_cache.disabled():
        response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise EngagementException("%s. %s." % (response.status_code, response.text))
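# Illustrative sketch (not part of the engagers): the `with requests_cache.disabled()` blocks above
# assume a global HTTP cache was installed elsewhere via requests_cache.install_cache(); cached
# responses then expose a `from_cache` attribute, which several engagers record via set_data().
# The cache name and expiry below are placeholders.
import requests
import requests_cache

requests_cache.install_cache('acurerate_cache', expire_after=86400)  # cache responses for a day

resp = requests.get('https://httpbin.org/get')    # first call hits the network
resp = requests.get('https://httpbin.org/get')    # second call is served from the cache
print(getattr(resp, 'from_cache', False))         # -> True when served from the cache

with requests_cache.disabled():                   # force a live request, bypassing the cache
    resp = requests.get('https://httpbin.org/get')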
def _get_company_info(self, domain):
    try:
        url = '%s/%s' % (CircleBackEngager.BASE_URL, CircleBackEngager.COMPANY_SERVICE)
        company_name = self.enriched_entity.deduced.get('name', '<No-Company-Name>')
        # NOTE: the incoming `domain` argument is ignored; the deduced domain is used instead
        domain = self.enriched_entity.deduced.get('domain', None)
        if domain is None:
            raise EngagementException("Domain property of company %s not found." % company_name, fatal=True)
        payload = {'domains': [domain]}
        headers = {'Content-Type': 'application/json; charset=utf-8',
                   'X-CB-ApiKey': CircleBackEngager.THE_KEY}
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code >= 500:
            raise EngagementException("Server Error (%d). Error: %s." % (response.status_code, response.reason), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response.json()
def set_enrich_key(self):
    t = self.enriched_entity.__class__.__name__
    if t == 'AcureRatePerson':
        email = self.get_pivot_email()
        phone = None  # TODO: implement phone pivot
        fname = self.enriched_entity.deduced.get(P.FIRST_NAME, None)
        lname = self.enriched_entity.deduced.get(P.LAST_NAME, None)
        if email and fname and lname:
            self.enrich_key = "%s %s %s" % (email, fname, lname)
        elif email:
            self.enrich_key = email
        elif phone and fname and lname:
            self.enrich_key = "%s %s %s" % (phone, fname, lname)
        else:
            raise EngagementException("CircleBack - cannot engage. No properties available to set enrich key")
    elif t == 'AcureRateCompany':
        if C.DOMAIN not in self.enriched_entity.deduced:
            raise EngagementException("CircleBack - cannot engage - no domain property available as enrich key")
        self.enrich_key = self.enriched_entity.deduced.get(C.DOMAIN)
    else:
        raise EngagementException("CircleBack - cannot engage - cannot generate enrich key. Unknown entity type")
def _handle_pipl_api_errors(self, response):
    if response.status_code == 200:
        # All is ok.
        return
    # Map the remaining status codes documented for the Pipl search API to exceptions
    if response.status_code == 403:
        raise EngagementException("403. Quota Exceeded!", True)
    elif response.status_code == 400:
        raise EngagementException("400. Bad request", True)
    elif response.status_code == 500:
        raise EngagementException("500. Server Error", True)
    else:
        raise EngagementException("%s. Pipl engage error: %s" % (response.status_code, response.text))
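# A possible table-driven variant of the handler above (illustrative only; it assumes the
# EngagementException(message, fatal) signature used throughout this module):
_PIPL_FATAL_ERRORS = {
    403: "Quota Exceeded!",
    400: "Bad request",
    500: "Server Error",
}

def handle_pipl_api_errors(response):
    if response.status_code == 200:
        return
    message = _PIPL_FATAL_ERRORS.get(response.status_code)
    if message:
        # Known, fatal error codes
        raise EngagementException("%s. %s" % (response.status_code, message), True)
    # Anything else: non-fatal, include the response body for debugging
    raise EngagementException("%s. Pipl engage error: %s" % (response.status_code, response.text))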
def _get_person_info(self):
    try:
        url = '%s/%s' % (WhitePagesEngager.BASE_URL, WhitePagesEngager.FIND_PERSON_SERVICE)
        email = self.get_pivot_email()
        phone = None  # TODO: implement
        fname = self.enriched_entity.deduced.get('first_name', None)
        lname = self.enriched_entity.deduced.get('last_name', None)
        req_id = self.enriched_entity.aid if hasattr(self.enriched_entity, 'aid') else 'no-attr'
        # Build the request URL. NOTE: city/state/country are currently hard-coded.
        if fname and lname:
            parametrized_url = '%s?api_key=%s&name=%s%%20%s&address.city=Melville&address.state_code=NY&address.country_code=US' % \
                               (url, WhitePagesEngager.THE_KEY, fname, lname)
        else:
            return None
        # if email and fname and lname:
        #     parametrized_url = '%s?api_key=%s&firstname=%s&lastname=%s&email_address=%s' % (url, WhitePagesEngager.THE_KEY, fname, lname, email)
        response = requests.get(parametrized_url)
        if response.status_code == 403:
            raise EngagementException("%s. Forbidden. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code >= 500:
            raise EngagementException("Server Error (%d). Error: %s." % (response.status_code, response.reason), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        # if response.json()['nbHits'] == 0:
        #     raise EngagementException("No hits returned when searching for %s." % self.enrich_key)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response.json()
def enrich_company(self):
    result_obj = self._get_company_info()
    if 'pending' in result_obj and result_obj['pending']:
        msg = 'Failed to get information on company %s. Pending (202)' % self.enrich_key
        raise EngagementException(msg)
    if 'company' not in result_obj or result_obj['company'] is None:
        msg = 'Failed to get information on company %s. Not Found (404)' % self.enrich_key
        raise EngagementException(msg)
    enriched = False
    company_data = result_obj['company']
    # TODO: map company_data fields onto the enriched entity (nothing is extracted yet)
    return [C.NAME]
def set_enrich_key(self):
    phone = self.get_pivot_phone()
    if phone:
        self.enrich_key = "%s" % phone
    else:
        raise EngagementException("OpenCnam Engager - cannot engage. Cannot create enrich key for %s" % self.enriched_entity)
def _get_entity_by_type(self, entity_type):
    if entity_type == 'people':
        entity = AcureRatePerson()
    elif entity_type == 'company':
        entity = AcureRateCompany()
    else:
        raise EngagementException('Unknown entity type - %s' % entity_type)
    return entity
def set_enrich_key(self):
    t = self.enriched_entity.__class__.__name__
    if t == 'AcureRatePerson':
        email = self.get_pivot_email()
        if email is None:
            raise EngagementException("FullContacts - cannot engage. No email available as enrich key")
        self.enrich_key = email
    elif t == 'AcureRateCompany':
        if C.DOMAIN not in self.enriched_entity.deduced:
            raise EngagementException("FullContacts - cannot engage - no domain property to use as key")
        self.enrich_key = self.enriched_entity.deduced.get(C.DOMAIN)
    else:
        raise EngagementException("FullContacts - cannot engage - cannot generate enrich key. Unknown entity type")
def _get_person_info(self):
    try:
        response = clearbit.Enrichment.find(email=self.enrich_key)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response
def _get_person_info(self):
    try:
        url = '%s/%s' % (CircleBackEngager.BASE_URL, CircleBackEngager.PEOPLE_SERVICE)
        email = self.get_pivot_email()
        phone = None  # TODO: implement
        fname = self.enriched_entity.deduced.get('first_name', None)
        lname = self.enriched_entity.deduced.get('last_name', None)
        req_id = self.enriched_entity.aid if hasattr(self.enriched_entity, 'aid') else 'no-attr'
        # Build the match request
        if email and fname and lname:
            match_request = {'request_id': req_id, 'email': email, 'first_name': fname, 'last_name': lname}
        elif email:
            match_request = {'request_id': req_id, 'email': email}
        elif phone and fname and lname:
            match_request = {'request_id': req_id, 'phone_number': phone, 'first_name': fname, 'last_name': lname}
        else:
            return None
        # Build payload with the match requests. TODO: create more than one request (one with email, one with phone)
        payload = {'match_requests': [match_request]}
        headers = {'Content-Type': 'application/json; charset=utf-8',
                   'X-CB-ApiKey': CircleBackEngager.THE_KEY}
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code >= 500:
            raise EngagementException("Server Error (%d). Error: %s." % (response.status_code, response.reason), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        # if response.json()['nbHits'] == 0:
        #     raise EngagementException("No hits returned when searching for %s." % self.enrich_key)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response.json()
def _get_company_info(self):
    try:
        response = clearbit.Company.find(domain=self.enrich_key, stream=True)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response
def set_enrich_key(self):
    t = self.enriched_entity.__class__.__name__
    if t == 'AcureRatePerson' and P.FULL_NAME in self.enriched_entity.deduced:
        name = self.enriched_entity.deduced[P.FULL_NAME]
    elif t == 'AcureRateCompany' and C.NAME in self.enriched_entity.deduced:
        name = self.enriched_entity.deduced[C.NAME]
    else:
        raise EngagementException("BloombergScraper - cannot engage - cannot generate enrich key. Entity type: %s" % t)
    self.enrich_key = name
def set_enrich_key(self):
    t = self.enriched_entity.__class__.__name__
    if t == 'AcureRatePerson':
        if P.CB_PERMALINK in self.enriched_entity.deduced:
            self.enrich_key = self.enriched_entity.deduced[P.CB_PERMALINK]
        elif P.FULL_NAME in self.enriched_entity.deduced:
            name = self.enriched_entity.deduced[P.FULL_NAME]
            self.enrich_key = CrunchBaseEngager.formalize_permalink(name)
        else:
            raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key for person. No permalink or name")
    elif t == 'AcureRateCompany':
        if C.CRUNCHBASE_PERMALINK in self.enriched_entity.deduced:
            self.enrich_key = self.enriched_entity.deduced[C.CRUNCHBASE_PERMALINK]
        elif C.NAME in self.enriched_entity.deduced:
            name = self.enriched_entity.deduced[C.NAME]
            self.enrich_key = CrunchBaseEngager.formalize_permalink(name)
        else:
            raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key for company. No permalink or name")
    else:
        raise EngagementException("CrunchBaseBot - cannot engage - cannot generate enrich key. Entity type: %s" % t)
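# Illustrative sketch only: formalize_permalink() is defined elsewhere in CrunchBaseEngager; a
# plausible implementation turns a display name such as "Ploni Almoni Ltd." into the
# CrunchBase-style slug "ploni-almoni-ltd". The real helper's rules may differ.
import re

def formalize_permalink_sketch(name):
    slug = name.strip().lower()
    slug = re.sub(r"[^a-z0-9]+", "-", slug)   # collapse punctuation/whitespace into single dashes
    return slug.strip("-")

# formalize_permalink_sketch("Ploni Almoni Ltd.")  -> "ploni-almoni-ltd"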
def _deserialize(self, entity_type, _entity):
    if type(_entity) is str:
        entity_json = json.loads(_entity, object_hook=json_util.object_hook)
        if entity_type == 'people':
            entity = AcureRatePerson.reconstruct(entity_json)
        elif entity_type == 'company':
            entity = AcureRateCompany.reconstruct(entity_json)
        else:
            raise EngagementException('Unknown entity type - %s' % entity_type)
    else:
        entity = _entity
    return entity
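# Side note (illustrative): bson.json_util is what lets _deserialize() above revive BSON-specific
# types (ObjectId, datetime, ...) that plain json.loads would not reconstruct. A minimal round trip:
import datetime
import json
from bson import json_util

doc = {'name': 'Ploni Almoni', 'created': datetime.datetime(2016, 12, 13, 12, 0, 0)}
serialized = json.dumps(doc, default=json_util.default)              # datetime -> {"$date": ...}
restored = json.loads(serialized, object_hook=json_util.object_hook)
# restored['created'] is a datetime again (timezone handling depends on the pymongo version)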
def _handle_fc_api_errors(self, response):
    if response.status_code == 200:
        # All is ok.
        return
    # Handle different errors. Documentation - https://www.fullcontact.com/developer/docs/
    if response.status_code == 403:
        # Quota exceeded - need special treatment
        raise EngagementException("403. Quota Exceeded.", True)
    elif response.status_code == 405 or response.status_code == 410 or response.status_code == 422:
        raise EngagementException("%s. Invalid request sent to FC %s" % (response.status_code, response.text), True)
    elif response.status_code == 404:
        raise EngagementException("404. Searched in the past 24 hours and nothing was found: %s" % response.text)
    elif response.status_code == 500 or response.status_code == 503:
        raise EngagementException("%s. Transient errors in FC server. Possible maintenance/downtime. %s" % (response.status_code, response.text), True)
    elif response.status_code == 202:
        # Being processed...
        raise EngagementException("202. Did not get info. Request is being processed. Return later.")
    else:
        raise EngagementException("%s. Unknown error: %s" % (response.status_code, response.text), True)
def _get_company_info(self, domain):
    try:
        response = self.fc.api_get('company', **{'domain': domain})
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
        self._handle_fc_api_errors(response)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response.json()
def enrich_person(self):
    if self.what_to_do == 'error':
        raise EngagementException('Test Engager throwing test exception.')
    elif self.what_to_do == 'change':
        self.set_data('shoe_size', 44)
        self.set_data('eyes_color', 'green')
        name_dbl = self.enriched_entity.deduced.get(P.FULL_NAME, '<Noname>') * 2
        self.add_data('emails', '%s@nowhere.com' % name_dbl)
    return ['shoe_size', 'eyes_color']
def set_enrich_key(self):
    email = self.get_pivot_email()
    fname = self.enriched_entity.deduced.get(P.FIRST_NAME, None)
    lname = self.enriched_entity.deduced.get(P.LAST_NAME, None)
    if email and fname and lname:
        self.enrich_key = "%s %s %s" % (email, fname, lname)
    elif email:
        self.enrich_key = email
    elif fname and lname:
        self.enrich_key = "%s %s" % (fname, lname)
    else:
        raise EngagementException("Pipl - cannot engage. Cannot create enrich key for %s" % self.enriched_entity)
def _get_person_info(self):
    try:
        response = self.fc.api_get('person', **{'email': self.enrich_key})
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
        self._handle_fc_api_errors(response)
        # TODO: check if we can inspect the header and see our limit remaining...
        # r.headers['x-rate-limit-remaining']
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return response.json()
def get_info(self):
    email = self.get_pivot_email()
    fname = self.enriched_entity.deduced.get('first_name', None)
    lname = self.enriched_entity.deduced.get('last_name', None)
    # Build the search request.
    # TODO: pass the matching criteria in the request: "(email and name)" or "email", etc.
    if email and fname and lname:
        payload = {'key': PiplEngager.THE_KEY, 'email': email, 'first_name': fname, 'last_name': lname}
    elif email:
        payload = {'key': PiplEngager.THE_KEY, 'email': email}
    elif fname and lname:
        payload = {'key': PiplEngager.THE_KEY, 'first_name': fname, 'last_name': lname}
    else:
        return None
    # Set the match requirements
    payload['minimum_probability'] = 0.7
    payload['minimum_match'] = 1
    # payload['match_requirements'] = '(name and image)'
    try:
        # TODO: Look into header: {'X-APIKey-Quota-Current': '10'}
        response = requests.get('https://api.pipl.com/search', params=payload)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
        self._handle_pipl_api_errors(response)
        if hasattr(response, 'from_cache') and not response.from_cache:
            pass  # placeholder for handling fresh (non-cached) responses
        json_response = json.loads(response.text)
    except EngagementException as e:
        raise e
    except Exception as e:
        raise EngagementException(e, True)
    return json_response
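# Illustrative sketch for the quota TODO above: the remaining-quota header name is taken from the
# comment in get_info() and has not been verified against current Pipl responses.
def remaining_pipl_quota(response):
    """Best-effort read of the remaining-quota header; returns None when absent or non-numeric."""
    value = response.headers.get('X-APIKey-Quota-Current')
    try:
        return int(value) if value is not None else None
    except ValueError:
        return None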
def launch(self, provider, entity_type, entity_string, force):
    # Instantiate the provider
    instance = self._instantiate_provider(provider)
    if instance is None:
        raise EngagementException('Aborting launch. Failed to instantiate provider %s' % provider)
    self.logger.info('Provider %s instantiated and ready.', provider)
    try:
        entity = self._deserialize(entity_type, entity_string)
        self.logger.info('About to launch an engagement via %s on %s', provider, entity)
        engagement_result = instance.engage(entity_type, entity, force)
    except EngagementException as e:
        self.logger.error('Exception raised: %s', e)
        engagement_result = None
    # Guard against a failed engagement - engagement_result may be None here
    return engagement_result.to_json_string() if engagement_result else None
def enrich_person(self):
    result_obj = self._get_person_info()
    if 'pending' in result_obj and result_obj['pending']:
        msg = 'Failed to get information on person %s. Pending (202)' % self.enrich_key
        raise EngagementException(msg)
    if 'person' not in result_obj or result_obj['person'] is None:
        msg = 'Failed to get information on person %s. Not Found (404)' % self.enrich_key
        raise EngagementException(msg)
    enriched = False
    person_data = result_obj['person']

    # Get the name properties
    if 'name' in person_data:
        self.set_data(P.FIRST_NAME, person_data['name']['givenName'])
        self.set_data(P.LAST_NAME, person_data['name']['familyName'])
        self.set_data(P.FULL_NAME, person_data['name']['fullName'])
    if 'email' in person_data and person_data['email'] != self.enrich_key:
        self.set_data(P.EMAIL, person_data['email'])
        self.add_data(P.EMAILS, person_data['email'])
    if 'gender' in person_data and person_data['gender']:
        # enriched = True
        self.add_data(P.GENDER, person_data['gender'])
    if 'bio' in person_data and person_data['bio']:
        enriched = True
        self.add_data(P.SHORT_DESCRIPTION, person_data['bio'])
    if 'location' in person_data and person_data['location']:
        enriched = True
        self.add_data(P.LOCATIONS, person_data['location'])

    # Social handles
    if 'facebook' in person_data and person_data['facebook']['handle']:
        enriched = True
        self.add_data(P.FACEBOOK_URL, person_data['facebook']['handle'])
    if 'linkedin' in person_data and person_data['linkedin']['handle']:
        enriched = True
        self.add_data(P.LINKEDIN_URL, person_data['linkedin']['handle'])
    if 'twitter' in person_data and person_data['twitter']['handle']:
        enriched = True
        self.add_data(P.TWITTER_URL, person_data['twitter']['handle'])
    if 'googleplus' in person_data and person_data['googleplus']['handle']:
        enriched = True
        self.add_data(P.GOOGLEPLUS_URL, person_data['googleplus']['handle'])

    # Current employment
    if 'employment' in person_data:
        job = {}
        if person_data['employment'].get('name', None) is not None:
            job[P.JOB_NAME] = person_data['employment'].get('name', [])
        if person_data['employment'].get('title', None) is not None:
            job[P.JOB_TITLE] = person_data['employment'].get('title', [])
        if person_data['employment'].get('role', None) is not None:
            job[P.JOB_ROLE] = person_data['employment'].get('role', [])
        if job != {}:
            enriched = True
            self.add_data(P.JOBS, job)

    # TODO: gravatar, aboutme, github
    if not enriched:
        msg = 'Failed: no information added to person %s' % self.enrich_key
        raise EngagementException(msg)
    return [P.JOBS]
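# The enrich_person() above indexes nested dicts directly (e.g. person_data['facebook']['handle']),
# which raises when the provider returns None or omits a block. A small helper like this one
# (illustrative, not part of the codebase) makes those lookups tolerant:
def nested_get(d, *keys):
    """Walk `keys` through nested dicts; return None as soon as a level is missing or not a dict."""
    current = d
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return None
        current = current[key]
    return current

# Usage sketch:
# handle = nested_get(person_data, 'facebook', 'handle')
# if handle:
#     self.add_data(P.FACEBOOK_URL, handle)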
def enrich_company(self):
    try:
        name = self.enriched_entity.deduced['name']
        if not self.enrich_key:
            # Search for all the companies with this name
            # NOTE: this reuses the people search endpoint constant; results are filtered below
            response = self._make_request(CrunchBaseEngager.PEOPLE_URL, {'name': name})
            data = response.json().get('data')
            if not data or data.get('error'):
                raise EngagementException("CrunchBaseEngager: error in retrieving company %s." % name)
            if len(data["items"]) > 1:
                raise EngagementException("CrunchBaseEngager: company %s is ambiguous. Found %d companies with this name." % (name, len(data["items"])))
            if len(data["items"]) == 0:
                raise EngagementException("CrunchBaseEngager: company %s not found." % name)
            # TODO: Future: iterate over the returned companies and check if there's another matching attribute (like social url) we can use to choose the right one
            permalink = data['items'][0]['properties']['permalink']
        else:
            permalink = self.enrich_key

        response = self.get_node('organizations', permalink)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
            if not response.from_cache:
                pass  # code for debugging purposes
        data = response.json().get('data')
        org = Organization(data)

        # Name
        if org.name:
            self.set_data(C.NAME, org.name)

        # Get company logo
        if org.primary_image and len(org.primary_image) > 0:
            logo_url = org.primary_image[0].asset_path
            self.add_data(C.LOGOS, {C.LOGO_URL: logo_url, C.LOGO_SOURCE: 'crunchbase'})

        # Get overview stats (acquisitions, total funds, etc.)
        if org.acquired_by and hasattr(org.acquired_by, 'acquirer'):
            acquiring_company = org.acquired_by.acquirer.name
            self.set_data(C.ACQUIRED_BY, acquiring_company)

        # Get headquarters
        if org.headquarters and len(org.headquarters) > 0:
            headquarters = '%s, %s' % (org.headquarters[0].city, org.headquarters[0].country)
            self.set_data(C.HEADQUARTERS, headquarters)

        # Get description
        if org.short_description:
            self.set_data(C.DESCRIPTION, org.short_description)

        # Get founders
        if org.founders and len(org.founders) > 0:
            founders = []
            for founder in org.founders:
                full_name = '%s %s' % (founder.first_name, founder.last_name)
                founders.append(full_name)
            self.set_data(C.FOUNDERS, founders)

        # Get categories
        if org.categories and len(org.categories) > 0:
            for category in org.categories:
                self.add_data(C.CATEGORIES, category.name)

        # Grab aliases
        if org.also_known_as:
            self.set_data(C.ALIASES, org.also_known_as)

        # Grab website --> homepage_url
        if org.homepage_url and len(org.homepage_url) > 0:
            self.set_data(C.WEBSITE, org.homepage_url)

        # Is it a VC company?
        if org.role_investor:
            self.set_data(C.INVESTMENT_COMPANY_TYPE, C.ORGANIZATION_TYPE_VENTURE_CAPITAL)

        # Is it an educational organization?
        if org.role_school:
            self.set_data(C.ORGANIZATION_TYPE, C.ORGANIZATION_TYPE_SCHOOL)

        # Get socials
        if org.websites and len(org.websites) > 0:
            for url in org.websites:
                url_type = url.website_type.lower()
                if url_type == 'twitter':
                    self.set_data(C.TWITTER_URL, url.url)
                elif url_type == 'facebook':
                    self.set_data(C.FACEBOOK_URL, url.url)
                elif url_type == 'linkedin':
                    self.set_data(C.LINKEDIN_URL, url.url)
                elif url_type == 'angellist':
                    self.set_data(C.ANGELLIST_URL, url.url)
                else:
                    pass

        # Get investments
        if org.investments and len(org.investments) > 0:
            all_investments = set()
            for investment in org.investments:
                all_investments.add(investment.invested_in.name)
            self.set_data(C.PORTFOLIO_COMPANIES, list(all_investments))

        # Get founding year
        if org.founded_on:
            self.set_data(C.FOUNDING_YEAR, org.founded_on.year)

        # Get contact email - for emails-domain info
        # Get number of employees
        if org.num_employees_min and org.num_employees_max:
            employees_range_str = '%s|%s' % (org.num_employees_min, org.num_employees_max)
            self.set_data(C.EMPLOYEES_RANGE, employees_range_str)

        # Go over all investors
        if org.investors and len(org.investors) > 0:
            investors = []
            for investor in org.investors:
                investor_dict = investor.data
                investor_type = investor_dict['type'].lower()
                if investor_type == 'person':
                    investor_name = '%s %s' % (investor_dict['properties']['first_name'],
                                               investor_dict['properties']['last_name'])
                elif investor_type == 'organization':
                    investor_name = investor_dict['properties']['name']
                else:
                    pass
                investor_role = 'partner/round'
                investors.append((investor_name, investor_type, investor_role))
            self.set_data(C.INVESTORS, investors)

        # Go over all board members
        if org.board_members_and_advisors and len(org.board_members_and_advisors) > 0:
            board_members = []
            for board_member in org.board_members_and_advisors:
                # Do we need this: board_member.person.role_investor?
                full_name = '%s %s' % (board_member.person.first_name, board_member.person.last_name)
                board_members.append(full_name)
            self.set_data(C.ADVISORS, board_members)

        if org.founders and len(org.founders) > 0:
            founders = []
            for founder in org.founders:
                # Do we need this: founder.role_investor?
                full_name = founder.first_name + " " + founder.last_name
                founders.append(full_name)
            self.set_data(C.FOUNDERS, founders)

        # Collect team members (past and current)
        team_members = []
        if org.past_team and len(org.past_team) > 0:
            for team_member in org.past_team:
                full_name = team_member.person.first_name + " " + team_member.person.last_name
                team_members.append(full_name)
        if org.current_team and len(org.current_team) > 0:
            for team_member in org.current_team:
                full_name = team_member.person.first_name + " " + team_member.person.last_name
                team_members.append(full_name)
        if len(team_members) > 0:
            self.set_data(C.TEAM, team_members)

        # Only if data was not found, get the companies by names
        # data = self.cb.organizations(company_name)
        # if 'items' in data:
        #     permalink = data.items[0].permalink
        #     self.set_data("permalink", permalink)
        #     response = self.get_node('organizations', permalink)
        #     node_data = response.json().get('data')
        #     # Add the company name and other information
    except Exception as e:
        print("CrunchBaseEngager::enrich_company - failed to enrich company %s (%s)" % (name, e))
        raise EngagementException(e)
    return [C.NAME]
def enrich_person(self):
    name = None
    try:
        if not self.enrich_key:
            # Search for all the people with this name
            name = self.enriched_entity.deduced['first_name'] + " " + self.enriched_entity.deduced['last_name']
            response = self._make_request(CrunchBaseEngager.PEOPLE_URL, {'name': name})
            data = response.json().get('data')
            if not data or data.get('error'):
                raise EngagementException("CrunchBaseEngager: error in retrieving person %s." % name)
            if len(data["items"]) > 1:
                raise EngagementException("CrunchBaseEngager: person %s is ambiguous. Found %d people with this name." % (name, len(data["items"])))
            if len(data["items"]) == 0:
                raise EngagementException("CrunchBaseEngager: person %s not found." % name)
            # TODO: Future: iterate over the returned people and check if there's another matching attribute (like social url) we can use to choose the right person
            permalink = data['items'][0]['properties']['permalink']
        else:
            permalink = self.enrich_key

        # Get information on person via permalink
        response = self._make_request('https://api.crunchbase.com/v/3/people/' + permalink)
        if hasattr(response, 'from_cache'):
            self.set_data("from_cache", response.from_cache)
            if not response.from_cache:
                pass  # code for debugging purposes
        data = response.json().get('data')
        person = Person(data)

        # TODO: deal with marking from cache... currently ignored
        # Keep the key email we used for the search
        # self.set_data("search_key", name)

        if person.data and 'relationships' in person.data and 'investments' in person.data['relationships']:
            for elem in person.data['relationships']['investments']['items']:
                pass  # TODO: extract investment details

        if person.data and 'relationships' in person.data and 'advisory_roles' in person.data['relationships']:
            for elem in person.data['relationships']['advisory_roles']['items']:
                try:
                    job_title = elem["properties"]["title"]
                    company_name = elem["relationships"]["organization"]["properties"]["name"]
                    self.add_data(P.ADVISORY_JOBS, {P.JOB_TITLE: job_title, P.JOB_NAME: company_name})
                except Exception as e:
                    print('Unable to get advisory roles for %s' % permalink)

        if person.data and 'properties' in person.data and 'gender' in person.data['properties']:
            self.set_data(P.GENDER, person.data['properties']['gender'])
        if person.data and 'properties' in person.data and 'bio' in person.data['properties']:
            self.set_data(P.SHORT_DESCRIPTION, person.data['properties']['bio'])
        if person.born_on:
            self.set_data(P.DOB, person.born_on)

        if person.degrees:
            for degree in person.degrees.items:
                if degree.school and degree.school.name:
                    education = {P.EDUCATION_INSTITUTE: degree.school.name}
                    degree_years = None
                    if degree.started_on:
                        degree_years = '%s' % degree.started_on.year
                    if degree.started_on and degree.completed_on:
                        degree_years = '%s-%s' % (degree.started_on.year, degree.completed_on.year)
                    if degree_years:
                        education[P.EDUCATION_YEARS] = degree_years
                    if degree.degree_type_name:
                        education[P.EDUCATION_DEGREE] = degree.degree_type_name
                    if degree.degree_subject:
                        education[P.EDUCATION_SUBJECT] = degree.degree_subject
                    self.add_data(P.EDUCATIONS, education)

        if person.jobs:
            for job in person.jobs:
                if job.data['type'] == 'Job':
                    j = {}
                    if job.title:
                        j["job_title"] = job.title
                    org_type = job.data['relationships']['organization']['type']
                    if org_type and org_type == 'Organization':
                        if job.data['relationships']['organization']['properties']['name']:
                            j["job_name"] = job.data['relationships']['organization']['properties']['name']
                        else:
                            pass
                    if job.data['properties']['started_on']:
                        j["started_on"] = job.data['properties']['started_on']
                    if job.data['properties']['ended_on']:
                        j["ended_on"] = job.data['properties']['ended_on']
                    if len(j) > 0:
                        self.add_data(P.JOBS, j)
                else:
                    pass

        if len(person.founded_companies.items) > 0:
            for c in person.founded_companies.items:
                self.add_data("founded_companies", c.name)
    except Exception as e:
        print("CrunchBaseEngager failed to enrich person (name: %s)" % (name or self.enrich_key))
        if "quota" in str(e).lower():
            pass  # TODO: handle quota exhaustion explicitly
        raise EngagementException(e)
    return [P.FULL_NAME]
def enrich_person(self):
    try:
        institutes_names = []
        if P.BLOOMBERG_URL not in self.enriched_entity.deduced:
            # Search google for the person - the search string: 'site:bloomberg.com ploni almoni "executive profile"'
            url_prefix_1 = 'http://www.bloomberg.com/research/stocks/private/person.asp?personId='.lower()
            url_prefix_2 = 'http://www.bloomberg.com/research/stocks/people/person.asp?personId='.lower()
            query = 'site:bloomberg.com "%s" "executive profile"' % self.enrich_key
            res = search(query, tld='com', lang='en', num=3, start=0, stop=2, pause=2.0)
            matches = 0
            for url in res:
                url_lower = url.lower().replace('https', 'http')
                if url_lower.find(url_prefix_1) == 0 or url_lower.find(url_prefix_2) == 0:
                    matches += 1
            if matches == 0:
                raise EngagementException('Unable to locate information in Bloomberg.com on %s' % self.enrich_key)
            elif matches > 1:
                # TODO: we can improve search that will also consult working places and determine which person is the one we need... (try: Ariel Cohen)
                raise EngagementException('Unable to locate information in Bloomberg.com - more than one match on %s' % self.enrich_key)
            # Grab person id from url
            p = re.compile(r'asp\?personId=(\d+)&')
            person_id = p.search(url).group(1)
            self.set_data(P.BLOOMBERG_ID, person_id)
            # TODO: look into the full url google returns - what is capId?
            self.set_data(P.BLOOMBERG_URL, url)
        else:
            url = self.enriched_entity.deduced[P.BLOOMBERG_URL]

        # Get the person's page for parsing
        response = requests.get(url)
        if response.status_code != 200:
            s = 'Unable to load page in Bloomberg.com on %s. Error: %s. (url=%s)' % (self.enrich_key, response.status_code, url)
            raise EngagementException(s)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get age
        try:
            td_elem = soup.find("td", string='Age')
            tr_elem = td_elem.parent
            tr_elem2 = tr_elem.next_sibling
            td_elem2 = tr_elem2.find("td")
            age = td_elem2.text
            if age != "--":
                self.set_data(P.DOB, "%s years old" % age)
        except:
            self.logger.warning('Unable to locate age attribute for %s', self.enrich_key)

        # Get current job
        try:
            job = {}
            elem = soup.find("span", {"itemprop": "jobTitle"})
            if elem:
                job_title = elem.text
                if len(job_title.strip()) > 0:
                    job[P.JOB_TITLE] = job_title
            elem = soup.find("a", {"itemprop": "worksFor"})
            if elem:
                job_name = elem.text
                if len(job_name.strip()) > 0:
                    job[P.JOB_NAME] = job_name
            if len(job) > 0:
                self.add_data(P.JOBS, job)
        except:
            self.logger.warning('Unable to locate job title/name attribute for %s', self.enrich_key)

        # Get person's description
        try:
            elem1 = soup.find("div", {"itemprop": "description"})
            elem2 = soup.find("p", {"itemprop": "description"})
            description = None
            if elem1:
                description = elem1.text
            elif elem2:
                description = elem2.text
            if description:
                description = description.replace('\n', '').replace('Read Full Background', '').strip()
                self.set_data(P.DESCRIPTION, description)
        except:
            self.logger.warning('Unable to locate description attribute for %s', self.enrich_key)

        # Get the board positions
        try:
            h2_elems = soup.findAll('h2', text=re.compile('Corporate Headquarters'))
            # h2_elems = [soup.find("h2", string='Corporate Headquarters')]
            # h2_elems += h2_elems[0].find_next_siblings("h2")
            for elem in h2_elems:
                if elem.text.startswith('Board Members Memberships'):
                    for e in elem.next_siblings:
                        if 'no Board Members' in e:
                            break
                        if e.name == "h2":
                            break
                        if e.name == "div" and e.find("a") is not None:
                            company_name = e.find("a").text
                            # TODO: pick up the word 'Director' - or? what else is there to be...?
                            self.add_data(P.ADVISORY_JOBS, {P.JOB_NAME: company_name, P.JOB_TITLE: 'Director'})
        except:
            self.logger.warning('Unable to locate board positions information for %s', self.enrich_key)

        # Get the education organizations
        try:
            education_elems = soup.find_all("div", {"itemprop": "alumniOf"})
            for e in education_elems:
                # TODO: extract the Degree & Years, if available
                institutes_names.append(e.text)
                self.add_data(P.EDUCATIONS, {P.EDUCATION_INSTITUTE: e.text})
        except:
            self.logger.warning('Unable to locate education information for %s', self.enrich_key)

        # Get the other companies the person worked for
        try:
            companies_elems = soup.find_all("a", {"itemprop": "affiliation"})
            for e in companies_elems:
                if e.text not in institutes_names and len(e.text.strip()) > 0:
                    self.add_data(P.JOBS, {P.JOB_NAME: e.text})
        except:
            self.logger.warning('Unable to locate other affiliations information for %s', self.enrich_key)
    except Exception as e:
        self.logger.error('Unable to enrich person %s. %s', self.enriched_entity, e)
        raise e
    return [P.FULL_NAME]
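# Self-contained illustration of the scraping pattern used above (itemprop-based lookups with
# BeautifulSoup); the HTML below is made up and only mimics the structure the scraper expects.
from bs4 import BeautifulSoup

sample_html = """
<div>
  <span itemprop="jobTitle">Chief Executive Officer</span>
  <a itemprop="worksFor">Example Corp</a>
  <p itemprop="description">Ploni Almoni has served as CEO since 2010.</p>
</div>
"""
soup = BeautifulSoup(sample_html, 'html.parser')
job_title = soup.find("span", {"itemprop": "jobTitle"}).text              # 'Chief Executive Officer'
works_for = soup.find("a", {"itemprop": "worksFor"}).text                  # 'Example Corp'
description = soup.find("p", {"itemprop": "description"}).text.strip()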
def _company_exists(company_name, cb_url=None, permalink=None):
    # Issue a request to the CB search server - if matches exist, compare using name or cb_url if provided.
    try:
        # Truncate possible parameters on url
        if cb_url and cb_url.find('?') > 0:
            cb_url = cb_url[:cb_url.index('?')]
        company_name_clean = AcureRateUtils.clean_company_name(company_name)
        url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
        query = 'query=%s&facetFilters=' % company_name_clean.replace('&', '%26')
        payload = {"params": query,
                   "apiKey": CrunchBaseScraperEngager.THE_KEY,
                   "appID": CrunchBaseScraperEngager.APP_ID}
        headers = {'Content-Type': 'application/json; charset=utf-8',
                   'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                   'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID}
        with requests_cache.disabled():
            response = requests.post(url, json=payload, headers=headers)
        # TODO: decide which of these failures should really be fatal
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        if response.json()['nbHits'] == 0:
            raise EngagementException("CrunchBaseScraper: No hits returned when searching for %s (%s)." % (company_name_clean, company_name))

        # Check how many matches we have (if any)
        matches = []
        for company in response.json().get('hits', []):
            if company.get('type', '') == 'Organization' and company.get('organization', False) and 'name' in company:
                # Compare permalinks
                if 'permalink' in company and permalink and company['permalink'].lower() == permalink:
                    matches.append(company)
                    break
                # Compare URLs
                if 'url' in company and cb_url and cb_url.endswith(company['url']):
                    matches.append(company)
                    break
                # Check by name
                result_company_name_clean = AcureRateUtils.clean_company_name(company.get('name'))
                if result_company_name_clean.lower() == company_name_clean.lower():
                    matches.append(company)
        if len(matches) == 0:
            raise EngagementException("CrunchBaseScraper: No match for %s (%s)" % (company_name_clean, company_name))
        if len(matches) > 1:
            raise EngagementException("CrunchBaseScraper: Ambiguous results - got %d hits for %s (%s)" % (len(matches), company_name_clean, company_name))
    except Exception as e:
        raise e
    return matches
def enrich_company(self):
    try:
        if C.BLOOMBERG_URL not in self.enriched_entity.deduced:
            # Search google for the company snapshot page
            # url_prefix_1 = 'http://www.bloomberg.com/research/stocks/private/person.asp?personId='.lower()
            # url_prefix_2 = 'http://www.bloomberg.com/research/stocks/people/person.asp?personId='.lower()
            url_prefix_1 = 'http://www.bloomberg.com/research/stocks/private/snapshot.asp?privcapId='.lower()
            url_prefix_2 = 'http://something something'.lower()  # TODO: second valid snapshot URL prefix
            query = 'site:bloomberg.com snapshot "%s"' % self.enrich_key
            res = search(query, tld='com', lang='en', num=3, start=0, stop=2, pause=2.0)
            matches = 0
            for url in res:
                url_lower = url.lower().replace('https', 'http')
                if url_lower.find(url_prefix_1) == 0 or url_lower.find(url_prefix_2) == 0:
                    matches += 1
            if matches == 0:
                raise EngagementException('Unable to locate information in Bloomberg.com on %s' % self.enrich_key)
            elif matches > 1:
                # TODO: we can improve search that will also consult working places and determine which company is the one we need...
                raise EngagementException('Unable to locate information in Bloomberg.com - more than one match on %s' % self.enrich_key)
            # Grab company id from url (the snapshot pages carry a privcapId parameter)
            p = re.compile(r'asp\?privcapId=(\d+)')
            company_id = p.search(url).group(1)
            # TODO: these are stored under person (P.*) keys; company-specific keys would be cleaner
            self.set_data(P.BLOOMBERG_ID, company_id)
            # TODO: look into the full url google returns - what is capId?
            self.set_data(P.BLOOMBERG_URL, url)
        else:
            url = self.enriched_entity.deduced[C.BLOOMBERG_URL]

        # Get the company's page for parsing
        response = requests.get(url)
        if response.status_code != 200:
            s = 'Unable to load page in Bloomberg.com on %s. Error: %s. (url=%s)' % (self.enrich_key, response.status_code, url)
            raise EngagementException(s)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get company's overview
        try:
            elem1 = soup.find("div", {"itemprop": "description"})
            elem2 = soup.find("p", {"itemprop": "description"})
            description = None
            if elem1:
                description = elem1.text
            elif elem2:
                description = elem2.text
            if description:
                description = description.replace('\n', '').replace('Read Full Background', '').strip()
                self.set_data(C.DESCRIPTION, description)
        except:
            self.logger.warning('Unable to locate company overview attribute for %s', self.enrich_key)

        # Get key executives
        try:
            elems = soup.findAll("a", {"itemprop": "member"})
            for elem in elems:
                name = elem.text.replace('Mr.', '').strip()
                name_tokens = name.split(' ')
                the_name = name
                if len(name_tokens) == 3:
                    the_name = name_tokens[0] + ' ' + name_tokens[2]
                elif len(name_tokens) != 2:
                    the_name = name
                    self.logger.warning('Not sure how many tokens are in this name - %s' % name)
                self.add_data(C.TEAM, the_name)
        except Exception as e:
            self.logger.warning('Unable to locate company executives for %s (%s)' % (self.enrich_key, e))

        # Get phones
        # TODO...

        # Get domain
        try:
            elem = soup.find("a", {"itemprop": "url"})
            domain = elem.text
            self.set_data(C.DOMAIN, domain)
        except:
            self.logger.warning('Unable to locate domain attribute for %s', self.enrich_key)

        # Get address
        # TODO...

        # Get founding year
        # try:
        #     elem = soup.find("div", {"itemprop": "address"})
        #     elem2 = elem.find("p")  # This is currently WRONG - need to find the next sibling of elem
        #     founding_year = elem2.text
        #     self.set_data(C.FOUNDING_YEAR, founding_year)
        # except:
        #     self.logger.warning('Unable to locate founding year attribute for %s', self.enrich_key)
    except Exception as e:
        self.logger.error('Unable to enrich company %s. %s', self.enriched_entity, e)
        raise e
    return [C.NAME]
def enrich_person(self):
    try:
        # TODO: improve - run 3 searches - by full name, first name and last name. Check all results against P.possible_names...
        url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
        query = 'query=%s&facetFilters=' % self.enrich_key
        payload = {"params": query,
                   "apiKey": CrunchBaseScraperEngager.THE_KEY,
                   "appID": CrunchBaseScraperEngager.APP_ID}
        headers = {'Content-Type': 'application/json; charset=utf-8',
                   'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                   'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID}
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        if response.json()['nbHits'] == 0:
            raise EngagementException("No hits returned when searching for %s." % self.enrich_key)

        # Check how many matches we have (if any)
        matches = []
        for person in response.json().get('hits', []):
            if person.get('type', '') == 'Person' and person.get('person', False) and person.get('name', '') == self.enrich_key:
                matches.append(person)
        if len(matches) == 0:
            raise EngagementException("None of the hits match the person name we're searching for (%s)." % self.enrich_key)
        if len(matches) > 1:
            raise EngagementException("Person name is ambiguous - got %d hits for %s. Not enriching." % (len(matches), self.enrich_key))

        # Iterate over matches (currently we get here only if there's one, but in future we may want to refine match)
        for person in matches:
            # Grab name
            f, m, l = AcureRateUtils.tokenize_full_name(person['name'])
            self.set_data(P.FIRST_NAME, f)
            self.set_data(P.LAST_NAME, l)
            if m:
                self.set_data(P.MIDDLE_NAME, m)
            # Grab person photo
            if 'logo_url' in person:
                logo_url = person['logo_url']
                self.add_data(P.PHOTOS, {P.PHOTO_URL: logo_url, P.PHOTO_SOURCE: 'crunchbase'})
            # Grab location
            if 'location_name' in person:
                self.add_data(P.LOCATIONS, person['location_name'])
            # Grab socials
            if 'permalink' in person:
                self.set_data(P.CB_PERMALINK, person['permalink'])
            if 'url' in person:
                self.set_data(P.CRUNCHBASE_URL, person['url'])
            if 'linkedin_url' in person:
                self.set_data(P.LINKEDIN_URL, person['linkedin_url'])
            if 'twitter_url' in person:
                self.set_data(P.TWITTER_URL, person['twitter_url'])
            # Grab current position
            title = None
            if 'title' in person:
                title = person['title']
            company = None
            if 'organization_name' in person:
                company = person['organization_name']
            if title and company:
                current_job = {P.JOB_CURRENT: True, P.JOB_TITLE: title, P.JOB_NAME: company}
                self.add_data(P.JOBS, current_job)
                if AcureRateUtils.is_business(title):
                    self.logger.info('---->> %s - %s @ %s', person['name'], title, company)
            # Grab primary role
            if title is not None and company is not None:
                role = '%s @ %s' % (title, company)
                self.set_data(P.PRIMARY_ROLE, role)
            # Set as business, as the person was found in CB...
            self.set_data(P.BUSINESS, True)
            self.set_data(P.BUSINESS_REASON, 'appears in CB')
            # Investor?
            if 'n_investments' in person and person['n_investments'] > 0:
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON, '%s investments' % person['n_investments'])
                self.logger.info('--==--==-->> Worth looking into %s', person['name'])
            # We found one person, we can break from the loop
            # TODO: in the future, add the other persons we found to the queue for further enrichment
            break
    except Exception as e:
        self.logger.error('Failed to set some properties on person %s. Returning partial. (exception: %s)', self.enriched_entity, e)
    return [P.FULL_NAME]
def enrich_person(self):
    # Extract Twitter screen name
    screen_name = self.enriched_entity.deduced.get(P.TWITTER_SCREEN_NAME, None)
    if not screen_name:
        url = self.enrich_key
        screen_name = self._extract_screenname_from_url(url)
    if not screen_name:
        # If no screen name, search Twitter using the full name and
        # cross the candidates against the person's known jobs
        query = self.enriched_entity.deduced[P.FULL_NAME]
        possible_users = [user for user in tweepy.Cursor(self._api.search_users, q=query).items(10)]  # TODO: why 10?
        for u in possible_users:
            job = AcureRateJob.attempt_parse(u.description)
            if self.enriched_entity.fuzzy_match_on_jobs(job):
                screen_name = u.screen_name
                self.logger.info('Located Twitter screen_name from %s matches. User = %s', len(possible_users), str(u))
                break
    if not screen_name:
        raise EngagementException('Unable to enrich via Twitter. No twitter url/screenname.')
    try:
        # Keep screen name
        self.set_data(P.TWITTER_SCREEN_NAME, screen_name)

        # Pull info from Twitter
        user = self._api.get_user(screen_name)

        # Get user information
        self.set_data(P.FULL_NAME, user.name)
        self.set_data(P.TWITTER_FOLLOWERS_COUNT, user.followers_count)
        self.set_data(P.TWITTER_FRIENDS_COUNT, user.friends_count)
        self.set_data(P.TWITTER_LISTED_COUNT, user.listed_count)
        self.set_data(P.TWITTER_FAVOURITES_COUNT, user.favourites_count)
        self.set_data(P.TWITTER_STATUSES_COUNT, user.statuses_count)
        self.set_data(P.TWITTER_ACCOUNT_CREATION_DATE, str(user.created_at))

        # Get description
        self.set_data(P.SHORT_DESCRIPTION, user.description)  # TODO: need to deal with URLs (grab them too)

        # Is Investor?
        # TODO: refine this. We cannot rely only on the word 'investment'. Use NLTK.
        desc = user.description.lower()
        if 'investment' in desc or 'investor' in desc or 'investing' in desc:
            self.set_data(P.INVESTOR, True)
            self.set_data(P.INVESTOR_REASON, 'Twitter: %s:' % user.description)

        # Get location
        self.add_data(P.LOCATIONS, user.location)

        # Get photo
        self.add_data(P.PHOTOS, {P.PHOTO_URL: user.profile_image_url, P.PHOTO_SOURCE: 'twitter'})

        # Assimilate the display urls into the description
        desc = self._assemble_description(user.description, user.entities['description']['urls'])
        if desc:
            self.set_data(P.SHORT_DESCRIPTION, desc)  # TODO: need to deal with URLs (grab them too)

        # Get all the urls a person may add to his twitter profile
        the_urls = set()
        if 'description' in user.entities:
            for url in user.entities['description'].get('urls', []):
                the_urls.add(url['expanded_url'])
        if 'url' in user.entities:
            for url in user.entities['url'].get('urls', []):
                the_urls.add(url['expanded_url'])
        for url in the_urls:
            self.add_data(P.RELATED_URLS, {P.RELATED_URL_SOURCE: 'Twitter', P.RELATED_URL_VALUE: url})

        # Get "followers" (those who follow the person) and "friends" (those the person chose to follow)
        if TwitterEngager.EXTRACT_FF:
            paged_users = self._get_followers(screen_name)
            self.set_data(P.TWITTER_FOLLOWERS, paged_users)
            paged_users = self._get_friends(screen_name)
            self.set_data(P.TWITTER_FRIENDS, paged_users)
    except Exception as e:
        self.logger.error('Error raised during enrichment via twitter. %s', e)
    return [P.TWITTER_SCREEN_NAME, P.DESCRIPTION]
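# Illustrative sketch of how the `self._api` handle used above is typically constructed with
# tweepy's OAuth flow (tweepy 3.x API; the credential names are placeholders):
import tweepy

def build_twitter_api(consumer_key, consumer_secret, access_token, access_token_secret):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # wait_on_rate_limit makes tweepy sleep instead of raising when the rate limit is hit
    return tweepy.API(auth, wait_on_rate_limit=True)

# api = build_twitter_api('KEY', 'SECRET', 'TOKEN', 'TOKEN_SECRET')
# user = api.get_user('some_screen_name')   # same call the engager makes via self._api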