def _digest_aliases(self):
    me = self.deduced
    # Go over all sources
    for ds in self.sources():
        # Add name in source to aliases (regular and aliasized)
        if C.NAME in ds:
            self._append_to_deduced(C.ALIASES, AcureRateUtils.aliasize(ds[C.NAME]))
            self._append_to_deduced(C.ALIASES, ds[C.NAME].lower())
        # Add aliases which may have come from source
        for alias in ds.get(C.ALIASES, []):
            self._append_to_deduced(C.ALIASES, alias)
        # If permalink exists, add it to aliases as well
        if P.CB_PERMALINK in ds:
            self._append_to_deduced(C.ALIASES, ds[P.CB_PERMALINK])
            self._append_to_deduced(C.ALIASES, ds[P.CB_PERMALINK].replace("-", " "))
    # Add suffix of linkedin/facebook/twitter/crunchbase urls
    # TODO: get also aliases from facebook, twitter and angellist...
    if C.LINKEDIN_URL in me:
        alias = AcureRateUtils.get_url_last_path_element(me[C.LINKEDIN_URL])
        if alias:
            self._append_to_deduced(C.ALIASES, alias)
    if C.CRUNCHBASE_URL in me:
        alias = AcureRateUtils.get_url_last_path_element(me[C.CRUNCHBASE_URL])
        if alias:
            self._append_to_deduced(C.ALIASES, alias)
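# The digest above leans on two AcureRateUtils helpers defined elsewhere in the
# repo. A minimal sketch of the contracts this code assumes they honor (the
# names prefixed with _sketch_ are illustrative, not the real implementations):

def _sketch_aliasize(name):
    # Assumed: lowercase and strip punctuation/legal suffixes, so that
    # "Acme, Inc." and "acme" collapse onto the same alias string.
    import re
    cleaned = re.sub(r'[.,]|\b(inc|ltd|llc|corp)\b', '', name.lower())
    return ' '.join(cleaned.split())

def _sketch_get_url_last_path_element(url):
    # Assumed: return the last non-empty path segment of a profile URL,
    # e.g. 'https://www.linkedin.com/in/jane-doe' -> 'jane-doe'.
    from urllib.parse import urlparse
    parts = [p for p in urlparse(url).path.split('/') if p]
    return parts[-1] if parts else None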
def import_entries(self):
    # Check that file exists
    if not os.path.isfile(self.path):
        self.logger.error('Could not locate file (%s)', self.path)
        return
    # TODO: Check what file it is (Google/Outlook) - check the file was not violated - header file exists
    self.num_rows_handled = 0
    self.num_rows_succesfully_handled = 0
    with codecs.open(self.path, 'r', self.encoding) as fh:
        csv_reader = csv.DictReader(fh)
        for csv_row in csv_reader:
            # TODO: remove this from here, it should not be in base class, but specific to contacts importing
            fixed_csv_row = {}
            for k, v in csv_row.items():
                if k is None:
                    continue
                # Some exports mangle the 'First Name' header (e.g., BOM prefix) - normalize it
                k = 'First Name' if 'First Name' in k else k
                fixed_csv_row[k] = v
            # If there's a mapping defined, use it
            if self.columns_mapping():
                row = {}
                AcureRateUtils.dict2dict(fixed_csv_row, row, self.columns_mapping())
            else:
                row = fixed_csv_row
            # Check if row should be ignored altogether (check all fields, not only those mapped)
            if self.handle_row(row, fixed_csv_row, self.num_rows_handled + 1):
                self.num_rows_succesfully_handled += 1
            self.num_rows_handled += 1
            if self.num_rows_handled % 1000 == 0:
                self.logger.info('Done importing %d rows...', self.num_rows_handled)
    self.logger.info('Done importing all rows. Total: %d / Successful: %d',
                     self.num_rows_handled, self.num_rows_succesfully_handled)
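# columns_mapping() is expected to translate raw CSV headers into the internal
# row schema via AcureRateUtils.dict2dict. A hedged sketch of the dict2dict
# contract assumed above (the real helper may do more; mapping keys/values and
# the example data below are illustrative):

def _sketch_dict2dict(src, dst, mapping):
    # Assumed: copy src[key] into dst[mapping[key]] for every mapped key
    # present in the source row; unmapped columns are simply dropped.
    for src_key, dst_key in mapping.items():
        if src_key in src:
            dst[dst_key] = src[src_key]

row = {}
_sketch_dict2dict({'First Name': 'Jane', 'E-mail Address': 'jane@example.com'},
                  row,
                  {'First Name': 'first_name', 'E-mail Address': 'email1'})
# row -> {'first_name': 'Jane', 'email1': 'jane@example.com'}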
def _digest_organization_type(self):
    me = self.deduced
    # ACADEMY, GOVERNMENT, MILITARY, COMPANY, VENTURE-CAPITAL/INVESTOR
    # Check if something indicates an investment company
    for ds in self.sources():
        if C.ORGANIZATION_TYPE in ds and ds[C.ORGANIZATION_TYPE] == 'investor':  # CrunchBaseScraper
            me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_VENTURE_CAPITAL
            return
        if C.INVESTMENT_COMPANY_TYPE in ds:
            me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_VENTURE_CAPITAL
            return
        if C.PRIMARY_ROLE in ds and ds[C.PRIMARY_ROLE] == 'investor':  # CrunchBaseBot
            me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_VENTURE_CAPITAL
            return
    # TODO: improve this - we need to make sure all providers point at the same education institute before determining it's Academy
    for ds in self.sources():
        if C.ORGANIZATION_TYPE in ds and ds[C.ORGANIZATION_TYPE] == 'school':
            me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_ACADEMY
            return
        elif C.DOMAIN in ds and AcureRateUtils.is_academic_domain(ds[C.DOMAIN]):
            me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_ACADEMY
            return
    # Default is company
    me[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_COMPANY
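# is_academic_domain() is assumed to flag university domains. A minimal sketch
# of the rule this digest relies on (thresholds and TLD list are illustrative):

def _sketch_is_academic_domain(domain):
    # Assumed: .edu domains, or an .ac.<cc> ccTLD such as .ac.il / .ac.uk.
    d = domain.lower()
    return d.endswith('.edu') or '.ac.' in d or d.endswith('.ac')

assert _sketch_is_academic_domain('mit.edu')
assert _sketch_is_academic_domain('huji.ac.il')
assert not _sketch_is_academic_domain('acme.com')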
def search_ngram(self, ngram, index):
    print("%s: Searching for [%s]" % (AcureRateUtils.get_now_as_str(), ngram))
    search_url = F6SEngager.F6S_SEARCH_URL % ngram
    # rc, response, ip = SatoriMain.perform_request(search_url, opener, with_ip=False, should_delay=False)
    rc, response = self.perform_request(search_url)
    if rc != 200:
        print(">>> ERROR: %s: %s." % (rc, response))
        return
    # Check results
    results = json.loads(response)
    if results[0]['text'].find(' match') < 1:
        print("F6S Scraper: No mention of match(es) - %s." % results[0]['text'])
        return
    num_matches = int(results[0]['text'].split(' ')[0])
    if num_matches == 0:
        print("F6S Scraper: No hits returned when searching for %s." % ngram)
        return
    # Count how many of them are 'Startup'
    startups_only = [res for res in results[1:]
                     if 'rightText' in res and res['rightText'] == 'Startup' and
                     'text' in res and res['text'] and res['text'].lower().find(ngram) == 0]
    if len(startups_only) == 0:
        return
    # Should we call recursively?
    if len(startups_only) >= 20:
        for l in self.all_valid_chars:
            self.search_ngram(ngram + l, index + 1)
    print("%s: Found %s results for [%s]. Writing:" %
          (AcureRateUtils.get_now_as_str(), len(startups_only), ngram))
    # Write to file
    self.extract_and_write(startups_only, ngram)
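# The recursion above implements prefix enumeration against a capped search
# API: when a query prefix saturates the result page (>= 20 startup hits), it
# is extended by one character and re-queried, while the truncated page itself
# is still recorded. A standalone sketch of that traversal, with search_fn
# standing in for the F6S request (names here are illustrative):

def _sketch_enumerate(prefix, search_fn, alphabet='abcdefghijklmnopqrstuvwxyz',
                      page_limit=20):
    hits = list(search_fn(prefix))
    if len(hits) >= page_limit:
        # Saturated page: results were likely truncated by the API, so
        # re-query every one-character extension of the prefix as well.
        for ch in alphabet:
            hits.extend(_sketch_enumerate(prefix + ch, search_fn, alphabet, page_limit))
    return hits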
def _digest_domain(self):
    me = self.deduced
    # If domain was already set, or there's no website, return
    if C.DOMAIN in me or C.WEBSITE not in me:
        return
    # Deduce domain from website
    domain = AcureRateUtils.get_domain(me[C.WEBSITE])
    if domain:
        me[C.DOMAIN] = domain
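# A hedged sketch of the get_domain() contract assumed here: reduce a website
# URL to its bare host (the real helper may strip more than the 'www.' prefix):

def _sketch_get_domain(website):
    from urllib.parse import urlparse
    host = urlparse(website if '://' in website else 'http://' + website).netloc
    return host[4:] if host.startswith('www.') else host or None

# _sketch_get_domain('https://www.acme.com/about') -> 'acme.com'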
def spawn_engagers_sequentially(self, providers, entity_type, entity, enrichment_behavior, enriched=False):
    org_entity = copy.deepcopy(entity)
    # Iterate over all required providers and run enrichment
    engagement_results = {}
    el = EngagerLauncher()
    for provider_name in providers:
        try:
            res = el.launch(provider_name, entity_type, entity, enrichment_behavior.force)
            engagement_results[provider_name] = EngagementResult.from_json_string(res)
        except EngagementException as ee:
            self.logger.error('Failed to engage via %s on entity %s (exception: %s)',
                              provider_name, entity.aid, ee)
    # Recreate entity
    new_entity = org_entity
    # Merge all results into entity
    # changed = False
    changed = enriched
    redigest_properties = {}
    for provider_name, engagement_result in engagement_results.items():
        if engagement_result.status != EngagementResult.SKIPPED and engagement_result.status != EngagementResult.NOCHANGE:
            enrich_key = engagement_result.properties_changed['enrich_key']
            for k, v in engagement_result.properties_changed.items():
                property_changed = new_entity.set_data(provider_name, enrich_key, k, v)
                if property_changed and k in LISTS.TRIGGERING_PROPERTIES:
                    redigest_properties[k] = v
                changed |= property_changed
            self.logger.info('Done merging properties of %s. Changed = %s', provider_name, changed)
    if changed or enrichment_behavior.force_save:
        new_entity.last_update = datetime.datetime.now()
        new_entity.digest()
        Store.set_entity(entity_type, new_entity)
        msg = 'Stored in Store! (changed=%s, force_save=%s)' % (changed, enrichment_behavior.force_save)
        # Redigest other entities
        self.redigest(redigest_properties)
    else:
        msg = 'Not stored. No change detected'
    self.logger.info(msg)
    # Prepare information to send to webhook
    if enrichment_behavior.webhook:
        payload = {'status_message': msg, 'status_code': 200, 'ts': time.time(), 'aid': new_entity.aid}
        r = AcureRateUtils.announce(enrichment_behavior.webhook, payload)
    self.logger.info('Done merging enrichment result into entity. Changed = %s', changed)
def _digest_employees_range(self):
    me = self.deduced
    # Go over providers and select the higher number
    for ds in self.sources():
        if C.EMPLOYEES_NUMBER in ds:
            if C.EMPLOYEES_NUMBER not in me or ds[C.EMPLOYEES_NUMBER] > me[C.EMPLOYEES_NUMBER]:
                me[C.EMPLOYEES_NUMBER] = ds[C.EMPLOYEES_NUMBER]
    if C.EMPLOYEES_NUMBER in me:
        me[C.EMPLOYEES_RANGE] = AcureRateUtils.get_employees_range(me[C.EMPLOYEES_NUMBER])
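# get_employees_range() presumably buckets a head count into a range label.
# Illustrative sketch only - the real cut-offs and labels live in AcureRateUtils:

def _sketch_get_employees_range(n):
    for upper, label in [(10, '1-10'), (50, '11-50'), (200, '51-200'), (1000, '201-1000')]:
        if n <= upper:
            return label
    return '1000+'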
def attempt_parse(description):
    # TODO: implement using NLTK
    roles = AcureRateUtils.normalized_titles(description)
    j = AcureRateJob()
    setattr(j, 'job_name', 'unknown')  # TODO: use __init__ instead...
    job_roles = []
    for normalized_title, seniority, area in roles:
        job_roles.append({'job_role': normalized_title,
                          'job_seniority': seniority,
                          'job_area': area})
    setattr(j, 'job_roles', job_roles)
    return j
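# Hedged usage sketch: normalized_titles() is assumed to yield
# (normalized_title, seniority, area) triples for a free-text description, so
# a call like the following would produce one job_role entry per triple
# (output values are illustrative, not the real normalization):
#
#   job = attempt_parse('VP R&D and co-founder')
#   # job.job_roles -> [{'job_role': 'vp', 'job_seniority': 'executive',
#   #                    'job_area': 'engineering'}, ...]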
def extract_and_write(self, startups, ngram):
    # Iterate over all startups
    for res in startups:
        text = res['text']
        the_type = res['type']
        value = res['value']
        if text.find(';') == 0:
            text = "'%s'" % text
        if text.lower().find(ngram) == 0:
            self.companies_file.write('%s; %s; %s\n' % (text, the_type, value))
            now_str = AcureRateUtils.get_now_as_str()
            print('%s: %s, %s, %s' % (now_str, text, the_type, value))
    self.companies_file.flush()
def _enrich_entity(self, entity_type, enrichment_key, enrichment_behavior,
                   enrichment_data=None, enrichment_source=None):
    """
    Enrich a person - either with provided data or external enrichment (or both)

    :param enrichment_key: the search key to be used to retrieve the object
    :param enrichment_behavior: object determining external enrichment, dates, force, new, etc.
    :param enrichment_data: an EnrichmentData object or array of objects including data rows to add
    :param enrichment_source: an EnrichmentSource object specifying the source of the added data
    :return: the person entity after the enrichment process
    """
    status_code = EnrichmentException.ALL_OK
    status_message = "Enrichment completed successfully (behavior: %s)" % str(enrichment_behavior)

    # Validate parameters
    if enrichment_data and not enrichment_source:
        raise EnrichmentException("Cannot enrich with additional data without enrichment source.",
                                  EnrichmentException.BAD_REQUEST)

    # Decide which external providers are to be used (all, selective list or empty list)
    providers = self._decide_providers(enrichment_behavior)

    try:
        # TODO: updated_entities is returned but never populated
        updated_entities = []
        changed = False

        # Get person from the Store
        # TODO: in case too many results are returned - they are in-memory - need to limit
        entities = Store.get_entities(entity_type, enrichment_key, single_result=False,
                                      mongo_query=enrichment_behavior.mongo_query)
        if len(entities) == 0:
            if enrichment_behavior.create_new:
                self.logger.info('Enriching on %s. Could not locate entities in %s collection, creating a new entity.',
                                 enrichment_key, entity_type)
                if entity_type == 'people':
                    entities = [AcureRatePerson()]
                elif entity_type == 'company':
                    entities = [AcureRateCompany()]
                # If no provider, add a Dummy engager, so the system digests and stores the data
                if not providers:
                    providers = ['System']
                elif 'System' not in providers:
                    providers.insert(0, 'System')
            else:
                msg = 'Attempting enrichment on key %s. Could not locate entities matching key (Behavior::create_new = False)' % enrichment_key
                raise EnrichmentException(msg, EnrichmentException.CONTACT_NOT_FOUND)
        elif len(entities) > 1 and not enrichment_behavior.enrich_multiple:
            msg = 'Enrichment data %s returns %d entities but enrich_multiple=False. Not enriching' % (enrichment_key, len(entities))
            raise EnrichmentException(msg, EnrichmentException.MULTIPLE_CONTACTS)

        # Go over all entities retrieved from store (per given key)
        # with ClusterRpcProxy(EnrichmentServiceConfig.AMQP_CONFIG, timeout=None) as rpc:
        rpc = None
        for entity in entities:
            # If new enriched data provided, merge it into received entity
            if enrichment_data and len(enrichment_data) > 0:
                enrichment_data.append(EnrichmentData('last_run_time', datetime.datetime.now(), 'override-no-change'))
                # enrichment_data.append(EnrichmentData('data_source', enrichment_source.source_type, 'override'))
                # enrichment_data.append(EnrichmentData('enrich_key', enrichment_source.source_key, 'override'))
                changed |= entity.merge_data(enrichment_source.source_type, enrichment_source.source_key, enrichment_data)
                # changed |= entity.merge_data('System', 'nokey', enrichment_data)
            if changed or enrichment_behavior.digest:
                changed = entity.digest()
            # Initiate engagement manager to enrich via providers
            if True:  # TODO: restore the async RPC path below once the cluster proxy is back in use
                EngagementManager().spawn_engagers_sequentially(providers, entity_type, entity,
                                                                enrichment_behavior, changed)
            else:
                rpc.engagement_manager.spawn_engagers.call_async(providers, entity_type, entity.to_json_string(),
                                                                 enrichment_behavior.force,
                                                                 enrichment_behavior.force_save)
    except EnrichmentException as e:
        self.logger.warning(e)
        if enrichment_behavior.webhook:
            r = AcureRateUtils.announce(enrichment_behavior.webhook,
                                        {'status_message': e.message, 'status_code': e.code, 'ts': time.time()})
            if r:
                self.logger.info('Sent post request to webhook at %s. Content: %s. Code: %s',
                                 enrichment_behavior.webhook, r.content, r.status_code)
    except Exception as e:
        msg = 'Failed to enrich %s entity. Key: %s. Reason: %s' % (entity_type, enrichment_key, e)
        self.logger.error(msg, exc_info=True)
        if enrichment_behavior.webhook:
            r = AcureRateUtils.announce(enrichment_behavior.webhook,
                                        {'status_message': msg, 'status_code': EnrichmentException.FATAL_ERROR, 'ts': time.time()})
            if r:
                self.logger.info('Sent post request to webhook at %s. Content: %s. Code: %s',
                                 enrichment_behavior.webhook, r.content, r.status_code)
    return updated_entities
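# Hedged usage sketch of the enrichment entry point, mirroring the constructor
# signatures used above (argument values below are illustrative only):
#
#   data = [EnrichmentData(P.FIRST_NAME, 'Jane', 'override')]
#   source = EnrichmentSource(source_type='GoogleContacts', source_key='some-key')
#   behavior = EnrichmentBehavior(create_new=True, providers=['System'])
#   es._enrich_entity('people', {P.EMAIL: 'jane@example.com'},
#                     enrichment_behavior=behavior,
#                     enrichment_data=data, enrichment_source=source)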
def __str__(self):
    return AcureRateUtils.obj2string(self)
def enrich_company(self):
    company_name = self.enriched_entity.deduced.get(C.NAME, None)
    if company_name is None:
        self.logger.warning('Unable to enrich company. No name detected in entity: %s', self.enriched_entity)
        return

    # If there's a permalink, use it, otherwise try creating one
    if C.CRUNCHBASE_URL in self.enriched_entity.deduced:
        url = self.enriched_entity.deduced[C.CRUNCHBASE_URL]
        if url.find('/organization') == 0:
            url = 'https://www.crunchbase.com' + url
        permalink = AcureRateUtils.get_url_last_path_element(url)
    else:
        permalink = self.enriched_entity.deduced.get(C.CRUNCHBASE_PERMALINK,
                                                     CrunchBaseEngager.formalize_permalink(company_name))
        url = 'https://www.crunchbase.com/organization/%s#/entity' % permalink

    # driver = webdriver.Firefox()
    driver = webdriver.Chrome(r'C:\Python353\browser_drivers\chromedriver')
    driver.implicitly_wait(20)  # seconds
    try:
        # Activate the driver
        driver.get(url)

        # If we got to here, keep the permalink
        self.set_data(C.CRUNCHBASE_PERMALINK, permalink)

        # Get company name
        try:
            name = driver.find_element_by_id('profile_header_heading').text
            self.set_data(C.NAME, name)
            driver.implicitly_wait(2)  # seconds
        except:
            # TODO: there should be a smarter way to understand we got 404...
            s = "Failed to enrich %s. Unable to locate name entity in page - %s - something went awry... dumping this crawl." % (company_name, url)
            raise EngagementException(s)

        # Get company logo
        try:
            content = driver.find_element_by_class_name('logo-links-container')
            logo_url = content.find_element_by_css_selector("div > img").get_attribute("src")
            self.add_data(C.LOGOS, {C.LOGO_URL: logo_url, C.LOGO_SOURCE: 'crunchbase'})
        except:
            pass

        # Get overview stats (acquisitions, total funds, etc.)
        try:
            stats = driver.find_element_by_class_name('overview-stats').text
            if stats.strip() != "":
                self.set_data(C.STATS, stats)
                stats_lower = stats.replace('\n', ' ').lower()
                if 'acquired by' in stats_lower and stats_lower.find(' on ') > 0:
                    acquiring_company = stats[stats_lower.find('acquired by') + 12:stats_lower.find(' on ')]
                    self.set_data(C.ACQUIRED_BY, acquiring_company)
                    # tokens = stats.split('\n')
                    # self.set_data(C.ACQUIRED_BY, tokens[2])
        except:
            pass

        # Get headquarters
        try:
            content = driver.find_element_by_xpath('//dt[text()="Headquarters:"]')
            headquarters = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(C.HEADQUARTERS, headquarters)
        except:
            pass

        # Get description
        try:
            content = driver.find_element_by_xpath('//dt[text()="Description:"]')
            description = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(C.DESCRIPTION, description)
        except:
            pass

        # Get founders
        try:
            founders = []
            content = driver.find_element_by_xpath('//dt[text()="Founders:"]').find_element_by_xpath("following-sibling::*[1]")
            founders_elements = content.find_elements_by_css_selector('a')
            for f in founders_elements:
                name = f.get_attribute("data-name")
                permalink = f.get_attribute("data-permalink")
                image = f.get_attribute("data-image")
                founders.append(name)
                # founders.append((name, permalink, image))
            self.set_data(C.FOUNDERS, founders)
        except Exception as e:
            print(e)

        # Get categories
        try:
            content = driver.find_element_by_xpath('//dt[text()="Categories:"]')
            categories = content.find_element_by_xpath("following-sibling::*[1]").text
            for c in categories.split(","):
                self.add_data(C.CATEGORIES, c)
        except:
            pass

        # Get web-site
        try:
            content = driver.find_element_by_xpath('//dt[text()="Website:"]').find_element_by_xpath("following-sibling::*[1]")
            website_url = content.find_element_by_css_selector('a').get_attribute("href")
            self.set_data(C.WEBSITE, website_url)
        except:
            pass

        # Get socials
        try:
            content = driver.find_element_by_xpath('//dt[text()="Social: "]').find_element_by_xpath("following-sibling::*[1]")
            social_links_elems = content.find_elements_by_tag_name('a')
            for e in social_links_elems:
                social_type = e.get_attribute('data-icons')  # "facebook", "twitter", etc.
                social_link = e.get_attribute('href')
                if social_type == 'facebook':
                    self.set_data(C.FACEBOOK_URL, social_link)
                elif social_type == 'twitter':
                    self.set_data(C.TWITTER_URL, social_link)
        except Exception as e:
            print(e)

        # Get founding year
        try:
            content = driver.find_element_by_xpath('//dt[text()="Founded:"]')
            founding_year = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(C.FOUNDING_YEAR, founding_year)
        except:
            pass

        # Get contact email - for emails-domain info
        try:
            content = driver.find_element_by_xpath('//dt[text()="Contact:"]')
            contact_info = content.find_element_by_xpath("following-sibling::*[1]").text
            tokens = contact_info.split(' ')  # contact info may be structured: name@acme.com | Telephone
            email_domain = EmailUtil.get_email_domain_part(tokens[0])
            if email_domain and len(email_domain) > 0:
                self.add_data(C.EMAIL_DOMAINS, email_domain)
        except:
            pass

        # Get aliases
        try:
            content = driver.find_element_by_xpath('//dt[text()="Aliases:"]')
            aliases = content.find_element_by_xpath("following-sibling::*[1]").text
            for a in aliases.split(", "):
                self.add_data(C.ALIASES, a)
        except:
            pass

        # Get company type
        try:
            content = driver.find_element_by_xpath('//dt[text()="Type:"]')
            type_str = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(C.INVESTMENT_COMPANY_TYPE, type_str)
        except:
            pass

        # Get sectors (in case it's an investor company)
        try:
            content = driver.find_element_by_xpath('//dt[text()="Sectors:"]')
            sectors_str = content.find_element_by_xpath("following-sibling::*[1]").text
            for c in sectors_str.split(", "):
                self.add_data(C.CATEGORIES, c)
        except:
            pass

        # Get investment size (in case it's an investor company)
        try:
            content = driver.find_element_by_xpath('//dt[text()="Investment Size:"]')
            investments_size_str = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(C.INVESTMENTS_RANGE, investments_size_str.replace(" ", ""))
        except:
            pass

        # Get investment regions (in case it's an investor company)
        try:
            content = driver.find_element_by_xpath('//dt[text()="Regions:"]')
            investments_regions_str = content.find_element_by_xpath("following-sibling::*[1]").text
            for r in investments_regions_str.split(", "):
                self.add_data(C.INVESTMENT_REGIONS, r)
        except:
            pass

        # Get employees range
        try:
            content = driver.find_element_by_xpath('//dt[text()="Employees:"]')
            employees_range_str = content.find_element_by_xpath("following-sibling::*[1]").text
            i = employees_range_str.find('None found')
            if i < 0:
                self.set_data(C.EMPLOYEES_RANGE, employees_range_str.replace(" ", ""))
            elif i > 0:
                self.set_data(C.EMPLOYEES_RANGE, employees_range_str.replace(" ", "")[:employees_range_str.find("|") - 1])
        except:
            pass

        # Get investors
        try:
            investors = []
            investors_tables = driver.find_elements_by_css_selector(".table.investors")
            if len(investors_tables) > 0:
                investors_rows_elements = investors_tables[0].find_elements_by_tag_name("tbody")
                for investor_element in investors_rows_elements:  # skip the header row of the table
                    rows = investor_element.find_elements_by_tag_name("tr")
                    for row in rows:
                        cols = row.find_elements_by_tag_name("td")
                        # NOTE: both names initialized so 2-column rows don't hit an unbound variable
                        investor_name = ''
                        investor_permalink = ''
                        if len(cols) == 3:
                            investor_name = cols[0].text
                            funding_round = cols[1].text
                            partner = cols[2].text
                            investor_permalink = cols[0].find_element_by_class_name("follow_card").get_attribute('data-permalink')
                        elif len(cols) == 2:
                            funding_round = cols[0].text
                            partner = cols[1].text
                        if "/organization" in investor_permalink:
                            investor_type = "organization"
                        else:
                            investor_type = "person"
                        # (the original branched on Seed/Angel rounds here, but both branches were identical)
                        investors.append((investor_name, investor_type, "%s / %s" % (partner, funding_round)))
            if len(investors) > 0:
                self.set_data(C.INVESTORS, investors)
        except Exception as e:
            print(e)

        # TODO: get Acquisitions

        # Get current team
        current_team = []
        try:
            people_table = driver.find_elements_by_class_name('people')
            if len(people_table) > 0:
                # TODO: get the person title - we don't want developers here...
                people_rows_element = people_table[1].find_elements_by_css_selector("li")
                for person in people_rows_element:
                    name_element = person.find_element_by_css_selector("h4 a")
                    name = name_element.get_attribute('data-name')
                    permalink = name_element.get_attribute('data-permalink')
                    title_element = person.find_element_by_css_selector("h5")
                    title = title_element.text
                    image = person.find_element_by_css_selector("span a img").get_attribute("src")
                    current_team.append(name)
                    # current_team.append((name, permalink, title, image))
        except Exception as e:
            print(e)

        # Get past team
        try:
            people_table = driver.find_elements_by_class_name('past_people')
            if len(people_table) > 0:
                # TODO: get the person title - we don't want developers here...
                people_rows_element = people_table[0].find_elements_by_css_selector("li")
                for person in people_rows_element:
                    name_element = person.find_element_by_css_selector("h4 a")
                    name = name_element.get_attribute('data-name')
                    # permalink = name_element.get_attribute('data-permalink')
                    # title_element = person.find_element_by_css_selector("h5")
                    # title = title_element.text
                    # image = person.find_element_by_css_selector("span a img").get_attribute("src")
                    current_team.append(name)
                    # current_team.append((name, permalink, title, image))
        except Exception as e:
            print(e)

        # Store past & current team
        if len(current_team) > 0:
            self.set_data(C.TEAM, current_team)

        # Get board members and advisors
        try:
            advisors = []
            advisors_table = driver.find_elements_by_css_selector('.base.no-data.advisors')
            if len(advisors_table) == 0:
                advisors_table = driver.find_elements_by_css_selector('.base.advisors')
            if len(advisors_table) > 0:
                advisors_rows_elements = advisors_table[0].find_elements_by_css_selector("h4 a")
                for advisor_element in advisors_rows_elements:
                    name = advisor_element.get_attribute('data-name')
                    permalink = advisor_element.get_attribute('data-permalink')
                    # TODO: check that advisor is person and not organization
                    advisors.append(name)
                    # advisors.append((name, permalink))
            if len(advisors) > 0:
                self.set_data(C.ADVISORS, advisors)
        except Exception as e:
            print(e)
    finally:
        # NOTE: close the driver even when an exception aborts the crawl
        # (the original closed it only on the success path, leaking browsers)
        driver.close()
    return [C.NAME]
def enrich_person(self):
    try:
        # TODO: improve - run 3 searches - by full name, first name and last name. Check all results against P.possible_names...
        url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
        query = 'query=%s&facetFilters=' % self.enrich_key
        payload = {"params": query,
                   "apiKey": CrunchBaseScraperEngager.THE_KEY,
                   "appID": CrunchBaseScraperEngager.APP_ID}
        headers = {'contentType': 'application/json; charset=utf-8',
                   'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                   'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID}
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code == 429:
            raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
        if response.status_code != 200:
            raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
        if response.json()['nbHits'] == 0:
            raise EngagementException("No hits returned when searching for %s." % self.enrich_key)

        # Check how many matches we have (if any)
        matches = []
        for person in response.json().get('hits', []):
            if person.get('type', '') == 'Person' and person.get('person', False) and person.get('name', '') == self.enrich_key:
                matches.append(person)
        if len(matches) == 0:
            raise EngagementException("None of the hits match the person name we're searching for (%s)." % self.enrich_key)
        if len(matches) > 1:
            raise EngagementException("Person name is ambiguous - got %d hits for %s. Not enriching." % (len(matches), self.enrich_key))

        # Iterate over matches (currently we get here only if there's one, but in future we may want to refine match)
        for person in matches:
            # Grab name
            f, m, l = AcureRateUtils.tokenize_full_name(person['name'])
            self.set_data(P.FIRST_NAME, f)
            self.set_data(P.LAST_NAME, l)
            if m:
                self.set_data(P.MIDDLE_NAME, m)

            # Grab person photo
            if 'logo_url' in person:
                logo_url = person['logo_url']
                self.add_data(P.PHOTOS, {P.PHOTO_URL: logo_url, P.PHOTO_SOURCE: 'crunchbase'})

            # Grab location
            if 'location_name' in person:
                self.add_data(P.LOCATIONS, person['location_name'])

            # Grab socials
            if 'permalink' in person:
                self.set_data(P.CB_PERMALINK, person['permalink'])
            if 'url' in person:
                self.set_data(P.CRUNCHBASE_URL, person['url'])
            if 'linkedin_url' in person:
                self.set_data(P.LINKEDIN_URL, person['linkedin_url'])
            if 'twitter_url' in person:
                self.set_data(P.TWITTER_URL, person['twitter_url'])

            # Grab current position
            title = None
            if 'title' in person:
                title = person['title']
            company = None
            if 'organization_name' in person:
                company = person['organization_name']
            if title and company:
                current_job = {P.JOB_CURRENT: True, P.JOB_TITLE: title, P.JOB_NAME: company}
                self.add_data(P.JOBS, current_job)
                if AcureRateUtils.is_business(title):
                    self.logger.info('---->> %s - %s @ %s', person['name'], title, company)

            # Grab primary role
            if title is not None and company is not None:
                role = '%s @ %s' % (title, company)
                self.set_data(P.PRIMARY_ROLE, role)

            # Set as business as person was found in CB...
            self.set_data(P.BUSINESS, True)
            self.set_data(P.BUSINESS_REASON, 'appears in CB')

            # Investor?
            if 'n_investments' in person and person['n_investments'] > 0:
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON, '%s investments' % person['n_investments'])
                self.logger.info('--==--==-->> Worth looking into %s', person['name'])

            # We found one person, we can break from loop
            # TODO: in the future, add the other persons we found to a queue for further enrichment
            break
    except Exception as e:
        self.logger.error('Failed to set some properties on person %s. Returning partial. (exception: %s)',
                          self.enriched_entity, e)
    return [P.FULL_NAME]
def enrich_company(self):
    try:
        # Construct URL to look for company
        company_name = self.enriched_entity.deduced[C.NAME]
        org_type = self.enriched_entity.deduced.get(C.ORGANIZATION_TYPE, None)
        end_point = 'i' if org_type == C.ORGANIZATION_TYPE_VENTURE_CAPITAL else 'c'
        url = '%s/%s' % (self.BASE_URL, end_point)

        # Search Google for the exact URL
        result_urls = AcureRateUtils.google_search(site=url, query='"%s"' % company_name)
        # TODO: it is possible that more than 1 result is returned, and the first is ok. Need to compare name.
        if len(result_urls) != 1:
            s = 'Unable to locate results page for company %s' % company_name
            raise EngagementException(s)

        # Get the company's page for parsing
        response = requests.get(result_urls[0])
        if response.status_code != 200:
            s = 'Unable to load page in StartupNationCentral.org on %s. Error: %s. (url=%s)' % (self.enrich_key, response.status_code, result_urls[0])
            raise EngagementException(s)
        # NOTE: was `url` (the generic search prefix); the company page itself is result_urls[0]
        self.set_data(C.STARTUPNATIONCENTRAL_URL, result_urls[0])
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get name
        try:
            name = soup.find("h1", {"class": "company__title"}).text
            self.set_data(C.NAME, name)
        except:
            self.logger.warning('Unable to locate name attribute for %s', self.enrich_key)

        # Get information if company was ACQUIRED or CLOSED
        # TODO...

        # Get short description
        try:
            short_description = soup.find("div", {"class": "company__short-description"}).text
            self.set_data(C.SHORT_DESCRIPTION, short_description.replace('\n', '').strip())
        except:
            self.logger.warning('Unable to locate short description attribute for %s', self.enrich_key)

        # Get description
        # TODO: this queries the same "company__short-description" element as above - probably should target the full-description element
        try:
            description = soup.find("div", {"class": "company__short-description"}).text
            self.set_data(C.DESCRIPTION, description.replace('\n', '').strip())
        except:
            self.logger.warning('Unable to locate description attribute for %s', self.enrich_key)

        # Get company logo
        try:
            logo_elem = soup.find("img", {"class": "company__logo"})
            logo_url = self.BASE_URL + logo_elem['src']
            self.set_data(C.LOGO_URL, logo_url)
        except:
            self.logger.warning('Unable to locate company logo attribute for %s', self.enrich_key)

        # Get homepage
        try:
            homepage = soup.find("strong", string='Homepage').parent.find('a').text
            self.set_data(C.DOMAIN, homepage)
        except:
            self.logger.warning('Unable to locate homepage attribute for %s', self.enrich_key)

        # Get sector
        try:
            sector = soup.find("strong", string='Sector').parent.find('a').text
            self.set_data(C.SECTOR, sector)
        except:
            self.logger.warning('Unable to locate sector attribute for %s', self.enrich_key)

        # Get founding year
        try:
            founding_year = soup.find("strong", string='Founded').parent.find('div').text
            self.set_data(C.FOUNDING_YEAR, founding_year)
        except:
            self.logger.warning('Unable to locate founding year attribute for %s', self.enrich_key)

        # Get business model
        try:
            business_model = soup.find("strong", string='Business Model').parent.find('a').text
            self.set_data(C.BUSINESS_MODEL, business_model)
        except:
            self.logger.warning('Unable to locate business model attribute for %s', self.enrich_key)

        # Get funding stage
        try:
            funding_stage = soup.find("strong", string='Funding Stage').parent.find('div').text
            self.set_data(C.FUNDING_STAGE, funding_stage)
        except:
            self.logger.warning('Unable to locate funding stage attribute for %s', self.enrich_key)

        # Get employees range
        try:
            employee_range = soup.find("strong", string='Employees').parent.find('div').text
            self.set_data(C.EMPLOYEES_RANGE, employee_range)
        except:
            self.logger.warning('Unable to locate employee range attribute for %s', self.enrich_key)

        # Get product stage
        try:
            product_stage = soup.find("strong", string='Product Stage').parent.find('div').text
            self.set_data(C.PRODUCT_STAGE, product_stage)
        except:
            self.logger.warning('Unable to locate product stage attribute for %s', self.enrich_key)

        # Get categories
        try:
            elems = soup.findAll("a", {"class": "tags__tag"})
            for elem in elems:
                self.add_data(C.CATEGORIES, elem.text)
        except:
            self.logger.warning('Unable to locate categories attribute for %s', self.enrich_key)

        # Get address
        try:
            pass
        except:
            self.logger.warning('Unable to locate address attribute for %s', self.enrich_key)

        # Get the team
        try:
            elems = soup.findAll("div", {"class": "company-team__info"})
            for elem in elems:
                name_elem = elem.find("div", {"class": "company-team__name"})
                self.add_data(C.TEAM, name_elem.text)
                # TODO: enrich the person with this position
                position_elem = elem.find("div", {"class": "company-team__position"})
                the_position = position_elem.text.lower()
                if any(x in the_position for x in ['cofounder', 'co-founder', 'co founder', 'founder', 'owner']):
                    self.add_data(C.FOUNDERS, name_elem.text)
        except:
            self.logger.warning('Unable to locate team members attribute for %s', self.enrich_key)

        # If this is an investment company, get their portfolio companies
        if org_type == C.ORGANIZATION_TYPE_VENTURE_CAPITAL:
            # TODO: Grab these fields:
            # TODO: 'In Israel Since', 'Investment Stages', 'Min Amount', 'Max Amount', 'Capital Managed', 'Industry Preferences'
            try:
                portfolio_cards = soup.findAll("div", {"class": "investor-portfolio__company"})
                for elem in portfolio_cards:
                    company_name_elem = elem.find("h2", {"class": "company-card__title"})
                    self.add_data(C.PORTFOLIO_COMPANIES, company_name_elem.text)
                    # TODO: grab more info from portfolio cards: logo, website url, short description - enrich the company data
            except:
                self.logger.warning('Unable to locate portfolio companies for %s', self.enrich_key)
    except Exception as e:
        self.logger.error('Unable to enrich company %s. %s', self.enriched_entity, e)
        raise e
    return []
def generate_companies_map(self):
    self._clean_graph()

    # Add founder to graph
    founder_node_id = self._add_entity_to_d3_json_graph(self.founder.deduced[P.FULL_NAME], self.founder,
                                                        FindReferrals.GROUP_FOUNDER)

    # Get all the contacts that have these places of work in their jobs
    targeted_companies_1 = ["SAP", "VMware", "Hewlett-Packard", "Facebook", "Google", "NICE Systems", "LinkedIn",
                            "Microsoft", "Waze", "Salesforce", "Kenshoo", "Cisco", "EMC-ZZZ", "Intel", "Twitter",
                            "Apple", "NASA", "General Electric", "United Nations"]
    targeted_companies_2 = ["SAP", "Facebook", "Google", "NICE Systems", "LinkedIn", "Microsoft", "Salesforce",
                            "Twitter", "Apple", "NASA", "General Electric", "United Nations"]
    targeted_companies_3 = ["Carmel Ventures", "Intel Capital", "Evergreen Venture Partners",
                            "Gemini Israel Ventures", "Pitango Venture Capital", "Apax Partners",
                            "Qumra Capital", "JVP"]
    targeted_companies = targeted_companies_3
    # targeted_companies = ["Google"]
    targeted_companies_map = {}
    for company_name in targeted_companies:
        # Get company details from db:
        # r = DBWrapper.get_companies({"deduced.name": company_name}, True)
        company_r = DBWrapper.get_companies({"deduced.aliases": company_name.lower()}, True)
        if company_r is None:
            continue
        company = AcureRateCompany.reconstruct(company_r)
        targeted_companies_map[company_name] = company

        # Get all people who are (a) in founder's contacts; (b) worked in this company
        regx = re.compile(company_name, re.IGNORECASE)
        query = {"$and": [{"deduced.jobs.job_name": regx},
                          {"$or": [{"data_sources.GoogleContacts.attribution_id": self.founder_aid},
                                   {"data_sources.LinkedInContacts.attribution_id": self.founder_aid}]}]}
        cursor = DBWrapper.get_persons(query)
        for r in cursor:
            person = AcureRatePerson.reconstruct(r)
            person.deduced['company_referred'] = company.deduced
            deduced_link_type = FindReferrals.LINK_TYPE_DEFAULT
            title = person.title_at(company.deduced[C.NAME])
            if title:
                # TODO: temp code:
                if not AcureRateUtils.is_senior(company_r, title) and 'Director' not in title:
                    continue
                person.deduced['title_at_company_referred'] = title + " @ " + company_name
                # TODO: complete this...
                if 'president' in title.lower():
                    # TODO: remove... done to catch Miki Migdal... need to use isSenior
                    deduced_link_type = FindReferrals.LINK_TYPE_MOVER_AND_SHAKER
            # Create in graph the referral node and link to it
            person_node_id = self._add_entity_to_d3_json_graph(person.deduced[P.FULL_NAME], person,
                                                               FindReferrals.GROUP_REFERRALS)
            self._add_link_to_d3_json_graph(founder_node_id, person_node_id,
                                            value=FindReferrals.LINK_STRENGTH_MEDIUM,
                                            link_type=deduced_link_type)

    # Get all people who are (a) in founder's contacts; (b) have related investors
    query = {"$and": [{"$or": [{"deduced.investor": {"$exists": True}},
                               {"deduced.business": {"$exists": True}}]},
                      {"$or": [{"data_sources.GoogleContacts.attribution_id": self.founder_aid},
                               {"data_sources.LinkedInContacts.attribution_id": self.founder_aid}]}]}
    cursor = DBWrapper.get_persons(query)
    contacts = [AcureRatePerson.reconstruct(r) for r in cursor]
    for contact in contacts:
        contact_contacts = contact.business_related_contacts(high_profile=True)
        for contact_contact_name, contact_contact_relation, contact_contact_company in contact_contacts:
            r = DBWrapper.get_persons({"deduced.full_name": contact_contact_name}, True)
            if not r:
                continue
            contact_contact = AcureRatePerson.reconstruct(r)
            for company_name, company in targeted_companies_map.items():
                if contact_contact.is_related_to_companies(company.deduced[C.ALIASES]):
                    # Create in graph the referral node and link to it
                    contact_node_id = self._add_entity_to_d3_json_graph(contact.deduced[P.FULL_NAME], contact,
                                                                        FindReferrals.GROUP_REFERRALS)
                    self._add_link_to_d3_json_graph(founder_node_id, contact_node_id,
                                                    value=FindReferrals.LINK_STRENGTH_MEDIUM,
                                                    link_type=FindReferrals.LINK_TYPE_MOVER_AND_SHAKER)
                    # Create the contact's contact that will lead to the company
                    contact_contact.deduced['company_referred'] = company.deduced
                    title = contact_contact.title_at(company.deduced[C.ALIASES])
                    if title:
                        contact_contact.deduced['title_at_company_referred'] = title + " @ " + company_name
                    else:
                        # no title, we can't know if it's a "serious" connection
                        continue
                    # contact_contact.deduced['title_at_company_referred'] = "Related to " + company_name
                    relation_phrase = FindReferrals._generate_referral_2_investor_phrase(contact,
                                                                                         contact_contact_name,
                                                                                         contact_contact_relation,
                                                                                         contact_contact_company)
                    contact_contact.deduced['referral'] = contact.deduced[P.FULL_NAME]
                    contact_contact.deduced['relation_phrase'] = relation_phrase
                    link_strength = self._calculate_link_strength(contact, contact_contact_name,
                                                                  contact_contact_relation, contact_contact_company)
                    contact_contact_node_id = self._add_entity_to_d3_json_graph(contact_contact.deduced[P.FULL_NAME],
                                                                                contact_contact,
                                                                                FindReferrals.GROUP_REFERRALS)
                    # self._add_link_to_d3_json_graph(contact_node_id, contact_contact_node_id, relation=relation_phrase, value=FindReferrals.LINK_STRENGTH_MEDIUM, link_type=FindReferrals.LINK_TYPE_MOVER_AND_SHAKER)
                    self._add_link_to_d3_json_graph(contact_node_id, contact_contact_node_id,
                                                    relation=relation_phrase, value=link_strength,
                                                    link_type=FindReferrals.LINK_TYPE_MOVER_AND_SHAKER)

    self._write_d3_json_to_file('companies_map.json')
def handle_row(self, mapped_row, raw_row, row_number):
    super().handle_row(mapped_row, raw_row, row_number)

    # Is this row an artifact of some kind of App or other synthetic data?
    if self.should_ignore_row(raw_row):
        self.logger.warning("Synthetic row. Ignoring. (row: %s)", raw_row)
        return False

    # TODO: special optimizations for the wise-crackers...
    # (1) fname = "ploni almoni", lname = <empty>
    # (2) fname = <empty>, lname = "ploni almoni"
    # (3) fname = "ploni almoni", lname = "*****@*****.**"

    # Sanity check to make sure row has values
    if mapped_row is None or mapped_row['email1'] is None:
        self.logger.error("Something went awry... email attribute is None. Aborting")
        raise Exception("Something went awry during parsing... email attribute is None. Aborting")

    ed = []
    emails = set()
    for i in [1, 2, 3]:
        attr = 'email%d' % i
        if attr in mapped_row and mapped_row[attr].strip() != '':
            emails.add(mapped_row[attr].strip())
    verified_emails = [e for e in list(emails) if EmailUtil.is_valid(e)]
    best_email = EmailUtil.get_preferred_email_from_list(verified_emails)
    if best_email is None:
        if emails is None or len(emails) == 0:
            reason = "No emails detected"
        elif len(verified_emails) == 0:
            reason = "No verified emails - %s" % emails
        else:
            reason = "%s" % emails
        self.logger.warning("Row %d: %s %s: Cannot set key email. %s. Ignoring.",
                            row_number, mapped_row['first_name'], mapped_row['last_name'], reason)
        return False
    key_email = EmailUtil.email_as_key(best_email)
    ed.append(EnrichmentData(P.EMAIL, key_email, 'override'))
    key = {P.EMAIL: key_email}

    error = None
    fname = AcureRateUtils.remove_parenthesized_content(mapped_row['first_name'].strip())
    lname = AcureRateUtils.remove_parenthesized_content(mapped_row['last_name'].strip())
    if '?' in fname or '?' in lname:
        self.logger.warning("Row %d - found suspicious name - fname: %s, lname: %s ('?' in name). Replacing and continuing.",
                            row_number, mapped_row['first_name'], mapped_row['last_name'])
        fname = fname.replace('?', '').strip()
        lname = lname.replace('?', '').strip()
    if not AcureRateUtils.valid_name(fname) or not AcureRateUtils.valid_name(lname):
        self.logger.warning("Row %d: non-alphanumeric characters in name - fname: %s, lname: %s. Continuing.",
                            row_number, mapped_row['first_name'], mapped_row['last_name'])

    # Handle cases where contact full English name is in FIRST_NAME field and full name in Hebrew in LAST_NAME
    if len(fname.split()) == 2 and lname == '':
        lname = fname.split()[1]
        fname = fname.split()[0]
    elif len(lname.split()) == 2 and fname == '':
        fname = lname.split()[0]
        lname = lname.split()[1]

    if fname == "" or lname == "":
        error = "Row %d: empty name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
    elif '&' in fname or '&' in lname:
        error = "Row %d: '&' in name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
    elif '@' in fname or '@' in lname:
        error = "Row %d: '@' in name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
    elif len(fname) == 1 and len(lname) == 1:
        error = "Row %d: suspicious name - fname: %s, lname: %s (too short...) Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
    if error:
        self.logger.warning(error)
        return False

    if len(fname) <= 2 or len(lname) <= 2:
        self.logger.warning("Row %d: suspicious name - fname: %s, lname: %s (too short...) Continuing.",
                            row_number, mapped_row['first_name'], mapped_row['last_name'])
    if len(fname.split()) > 1 or len(lname.split()) > 1:
        self.logger.warning("Row %d: suspicious name - fname: %s, lname: %s (space in fname/lname) Continuing.",
                            row_number, mapped_row['first_name'], mapped_row['last_name'])

    ed.append(EnrichmentData(P.FIRST_NAME, fname.title(), 'override'))
    ed.append(EnrichmentData(P.LAST_NAME, lname.title(), 'override'))
    if mapped_row['middle_name'].strip() != "":
        ed.append(EnrichmentData(P.MIDDLE_NAME, mapped_row['middle_name'].strip(), 'override'))
    if mapped_row['prefix'].strip() != "":
        ed.append(EnrichmentData(P.PREFIX, mapped_row['prefix'].strip(), 'override'))
    if mapped_row['suffix'].strip() != "":
        ed.append(EnrichmentData(P.SUFFIX, mapped_row['suffix'].strip(), 'override'))

    # Handle email attributes (even if some are invalid emails - may lead to needed info in later stages)
    for i in [1, 2, 3]:
        if ("email%d" % i) in mapped_row and mapped_row["email%d" % i].strip() != "":
            ed.append(EnrichmentData(P.EMAILS, mapped_row["email%d" % i].strip(), 'add'))

    # Handle phone attributes
    for i in range(1, 12):
        if ("phone%d" % i) in mapped_row and mapped_row["phone%d" % i].strip() != "":
            ed.append(EnrichmentData(P.PHONES, mapped_row["phone%d" % i].strip(), 'add'))

    # TODO: handle cases where more than one org/title is mentioned
    # TODO: write code better... (dict2dict... ?)
    org = mapped_row.get('job_name1', None)
    title = mapped_row.get('job_title1', None)
    if org is not None and org.strip() != "" and title is not None and title.strip() != "":
        ed.append(EnrichmentData(P.JOBS, {'job_name': org, 'job_title': title}, 'add'))
    elif org is not None and org.strip() != "" and title is None:
        ed.append(EnrichmentData(P.JOBS, {'job_name': org}, 'add'))
    elif org is None and title is not None and title.strip() != "":
        ed.append(EnrichmentData(P.JOBS, {'job_title': title}, 'add'))

    # Is there a DOB field? Convert it to python timestamp
    if P.DOB in mapped_row and mapped_row[P.DOB] != '':
        ed.append(EnrichmentData(P.DOB, AcureRateUtils.normalized_dob(mapped_row[P.DOB]), 'add'))

    # Is there a direct manager marked?
    if P.DIRECT_MANAGER in mapped_row and mapped_row[P.DIRECT_MANAGER] != '':
        ed.append(EnrichmentData(P.DIRECT_MANAGER, mapped_row[P.DIRECT_MANAGER], 'add'))

    if P.WEBSITE in mapped_row and mapped_row[P.WEBSITE] != '':
        ed.append(EnrichmentData(P.WEBSITE, mapped_row[P.WEBSITE], 'add'))

    # Anything worthwhile in the notes?
    if 'notes' in mapped_row and mapped_row['notes'].strip() != '':
        # self.logger.info('Row %d - contact %s %s got notes - anything interesting here? Notes: %s', row_number, fname, lname, mapped_row['notes'])
        self.logger.info('Row %d: contact %s %s got notes - anything interesting here?', row_number, fname, lname)

    # Any categories we should filter on?
    if 'categories' in mapped_row and mapped_row['categories'].strip() != '':
        categories = mapped_row['categories'].lower().split(";")
        # TODO: temp code below. Remove.
        if 'friends' in categories or 'friend' in categories or 'sayarut' in categories or 'hofen' in categories:
            self.logger.warning('Contact %s %s filtered out because of irrelevant category (categories: %s)',
                                fname, lname, mapped_row['categories'])
            return False

    # Mark the source these contacts came from:
    ed.append(EnrichmentData(P.ATTRIBUTION_ID, self.attribution_id, 'override'))
    ed.append(EnrichmentData(P.ATTRIBUTION_NAME, self.attribution_name, 'override'))

    if not self.test_import:
        self.logger.info('Row %d: key_email: %s. Sending to enrichment...', row_number, key_email)
        source = EnrichmentSource(source_type=self.source,
                                  source_key='%s %s' % (self.attribution_id, best_email.lower()))
        behavior = EnrichmentBehavior(create_new=True, providers=self.providers)
        self.es.enrich_person(enrichment_key=key, enrichment_data=ed,
                              enrichment_source=source, enrichment_behavior=behavior)
    return True
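# EmailUtil.email_as_key() is assumed to canonicalize an address so the same
# mailbox always maps to one Store key. A hedged sketch only - the gmail dot
# and '+tag' rules below are assumptions, not the verified implementation:

def _sketch_email_as_key(email):
    local, _, domain = email.strip().lower().partition('@')
    if domain in ('gmail.com', 'googlemail.com'):
        # Assumed: gmail dots are not significant; '+tag' suffixes are dropped.
        local = local.split('+')[0].replace('.', '')
    return '%s@%s' % (local, domain)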
def enrich_person(self):
    permalink = self.enrich_key
    url = 'https://www.crunchbase.com/person/%s#/entity' % permalink
    # driver = webdriver.Firefox()
    driver = webdriver.Chrome(r'C:\Python353\browser_drivers\chromedriver')
    # driver.set_window_size(1120, 550)
    driver.implicitly_wait(11)  # seconds
    try:
        # Activate the driver
        driver.get(url)

        # If we got to here, keep the permalink and URL
        self.set_data(P.CB_PERMALINK, permalink)
        self.set_data(P.CRUNCHBASE_URL, url)

        # Get person name
        try:
            full_name = driver.find_element_by_id('profile_header_heading').text
            f, m, l = AcureRateUtils.tokenize_full_name(full_name)
            self.set_data(P.FIRST_NAME, f)
            self.set_data(P.LAST_NAME, l)
            if m:
                self.set_data(P.MIDDLE_NAME, m)
            driver.implicitly_wait(2)  # seconds
        except:
            s = "Failed to enrich %s. Unable to locate name entity in page - %s - something went awry... dumping this crawl." % (permalink, url)
            raise EngagementException(s)

        # Get primary role
        try:
            content = driver.find_element_by_xpath('//dt[text()="Primary Role"]')
            role_str = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(P.PRIMARY_ROLE, role_str.replace('\n', ' '))
        except:
            pass

        # Get photo
        try:
            content = driver.find_element_by_class_name('logo-links-container')
            photo_url = content.find_element_by_css_selector("div > img").get_attribute("src")
            if "cb-default" not in photo_url:
                self.add_data(P.PHOTOS, {P.PHOTO_URL: photo_url, P.PHOTO_SOURCE: 'crunchbase'})
        except:
            pass

        # Get dob
        try:
            content = driver.find_element_by_xpath('//dt[text()="Born:"]')
            dob = content.find_element_by_xpath("following-sibling::*[1]").text
            self.set_data(P.DOB, dob)
        except:
            pass

        # Get gender
        try:
            content = driver.find_element_by_xpath('//dt[text()="Gender:"]')
            gender = content.find_element_by_xpath("following-sibling::*[1]").text
            self.add_data(P.GENDER, gender)
        except:
            pass

        # Get location
        try:
            content = driver.find_element_by_xpath('//dt[text()="Location:"]')
            location = content.find_element_by_xpath("following-sibling::*[1]").text
            if location != "Unknown":
                self.add_data(P.LOCATIONS, location)
        except:
            pass

        # Get web-site
        try:
            content = driver.find_element_by_xpath('//dt[text()="Website:"]').find_element_by_xpath("following-sibling::*[1]")
            website_url = content.find_element_by_css_selector('a').get_attribute("href")
            self.set_data(P.WEBSITE, website_url)
        except:
            pass

        # Get socials
        try:
            content = driver.find_element_by_xpath('//dt[text()="Social: "]').find_element_by_xpath("following-sibling::*[1]")
            social_links_elems = content.find_elements_by_tag_name('a')
            for e in social_links_elems:
                social_type = e.get_attribute('data-icons')  # "facebook", "twitter", "linkedin", etc.
                social_link = e.get_attribute('href')
                if social_type == 'facebook':
                    self.set_data(P.FACEBOOK_URL, social_link)
                elif social_type == 'twitter':
                    self.set_data(P.TWITTER_URL, social_link)
                elif social_type == 'linkedin':
                    self.set_data(P.LINKEDIN_URL, social_link)
        except Exception as e:
            print(e)

        # Get person details (description)
        try:
            person_details_elem = driver.find_element_by_id('description')
            person_details_str = person_details_elem.text
            self.set_data(P.DESCRIPTION, person_details_str)
        except Exception as e:
            print(e)

        # Get current jobs
        try:
            for row in driver.find_elements_by_css_selector(".experiences .current_job"):
                title = row.find_element_by_tag_name('h4').text
                company = row.find_element_by_tag_name('h5').text
                current_job = {P.JOB_CURRENT: True, P.JOB_TITLE: title, P.JOB_NAME: company}
                self.add_data(P.JOBS, current_job)
        except Exception as e:
            print(e)

        # Get past jobs
        try:
            past_job_section = driver.find_element_by_css_selector(".experiences .past_job")
            for row in past_job_section.find_elements_by_css_selector(".info-row")[1:-1]:
                cols = row.find_elements_by_css_selector(".cell")
                started = cols[0].text
                ended = cols[1].text
                title = cols[2].text
                company = cols[3].text
                past_job = {P.JOB_STARTED: started, P.JOB_ENDED: ended, P.JOB_TITLE: title, P.JOB_NAME: company}
                self.add_data(P.JOBS, past_job)
        except Exception as e:
            print(e)

        # Get advisory roles
        try:
            advisory_roles_section = driver.find_element_by_css_selector(".advisory_roles")
            for row in advisory_roles_section.find_elements_by_css_selector("li .info-block"):
                company = row.find_element_by_tag_name('h4').text
                role_started = row.find_elements_by_css_selector('h5')
                role = role_started[0].text
                started = role_started[1].text
                advisory_job = {P.JOB_TITLE: role, P.JOB_NAME: company}
                if started.strip() != '':
                    advisory_job[P.JOB_STARTED] = started
                self.add_data(P.ADVISORY_JOBS, advisory_job)
        except Exception as e:
            print(e)

        # Get investments
        try:
            investments = []
            investors_tables = driver.find_elements_by_css_selector(".table.investors")
            if len(investors_tables) > 0:
                investors_rows_elements = investors_tables[0].find_elements_by_tag_name("tr")
                for investor_element in investors_rows_elements[1:]:  # we're skipping the header line
                    txt = investor_element.text
                    # We care only about personal investments, so we go in only if there's a personal/seed investment anywhere
                    if 'personal investment' in txt.lower():
                        cols = investor_element.find_elements_by_tag_name('td')
                        if cols[3].text == 'Personal Investment':
                            investments.append((cols[0].text, cols[1].text, cols[2].text))
            self.set_data(P.INVESTMENTS, investments)
        except Exception as e:
            print(e)

        # Get education
        try:
            content = driver.find_element_by_class_name('education')
            education_elements = content.find_elements_by_css_selector("li > div")
            # TODO: `ed` is reused across elements, so fields from earlier entries leak into later ones
            ed = {}
            for elem in education_elements:
                institute_name = elem.find_element_by_css_selector('h4 > a').text
                if institute_name != '':
                    ed[P.EDUCATION_INSTITUTE] = institute_name
                degree = elem.find_element_by_css_selector('h5').text
                if degree != '':
                    ed[P.EDUCATION_DEGREE] = degree
                years = elem.text.replace(institute_name, '').replace(degree, '').strip()
                if years != '':
                    ed[P.EDUCATION_YEARS] = years
                self.add_data(P.EDUCATIONS, ed)
        except:
            pass
    finally:
        # NOTE: close the driver even if the crawl raised (the original closed
        # it only on the success path)
        driver.close()
    return [P.FULL_NAME]
def get_relations(self, filter=None):
    """
    Looks at raw data of company entity and returns all relations.

    :return: list of tuples, each tuple: (source_aid, relation label, target_aid, relation properties)
    """
    from store.store import Store

    relations = set()

    # C:C - Create ACQUIRED_BY relation
    if C.ACQUIRED_BY in self.deduced:
        # acquiring_company = Store.get_company({C.NAME: self.deduced[C.ACQUIRED_BY]})
        acquiring_company = Store.get_company({C.ALIASES: self.deduced[C.ACQUIRED_BY].lower()})
        if acquiring_company:
            relations.add((self.aid, G.RELATION_LABEL_ACQUIRED_BY, acquiring_company.aid, ''))

    # C:C - Create the INVESTED_IN relation
    if C.ORGANIZATION_TYPE in self.deduced and self.deduced[C.ORGANIZATION_TYPE] == C.ORGANIZATION_TYPE_VENTURE_CAPITAL:
        for portfolio_company in self.deduced.get(C.PORTFOLIO_COMPANIES, []):
            ccc_company = Store.get_company({C.ALIASES: portfolio_company.lower()})
            if ccc_company:
                relations.add((self.aid, G.RELATION_LABEL_INVESTS_IN, ccc_company.aid, ''))

    # P:C - Create EMPLOYEE_OF relation (team, past team)
    for team_mate in self.deduced.get(C.TEAM, []):
        person = Store.get_person({P.FULL_NAME: team_mate})
        if person:
            relations.add((person.aid, G.RELATION_LABEL_EMPLOYEE_OF, self.aid, ''))

    # P:C - Create BOARD_AT relation (advisors)
    for advisor in self.deduced.get(C.ADVISORS, []):
        person = Store.get_person({P.FULL_NAME: advisor})
        if person:
            relations.add((person.aid, G.RELATION_LABEL_ADVISOR_AT, self.aid, ''))

    # P:C - Create FOUNDER_OF relation (company)
    for founder in self.deduced.get(C.FOUNDERS, []):
        person = Store.get_person({P.FULL_NAME: founder})
        if person:
            relations.add((person.aid, G.RELATION_LABEL_FOUNDER_OF, self.aid, ''))

    # P:C - Create INVESTS_AT relation (investors)
    for investor_name, investor_type, investment_info in self.deduced.get(C.INVESTORS, []):
        # Find info on investment type -> relation_properties
        relation_properties = []
        investment_round = AcureRateUtils.get_investment_round(investment_info)
        if investment_round:
            relation_properties.append("investment_type: '%s'" % investment_round)
        investment_lead = AcureRateUtils.is_investment_lead(investment_info)
        if investment_lead:
            # TODO: should be label and not property
            relation_properties.append("investment_lead: True")
        if investor_type == 'person':
            person = Store.get_person({'deduced.' + P.FULL_NAME: investor_name})
            if person:
                relations.add((person.aid, G.RELATION_LABEL_INVESTS_IN, self.aid, ', '.join(relation_properties)))
        elif investor_type == 'organization':
            investing_company = Store.get_company({C.NAME: investor_name})
            if investing_company:
                relations.add((investing_company.aid, G.RELATION_LABEL_INVESTS_IN, self.aid, ', '.join(relation_properties)))

    # If filter provided, leave only relations that are relevant
    if filter:
        relations = [tup for tup in relations if tup[1].lower() == filter.lower()]
    return relations
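# Usage sketch: the filter argument keeps only relations whose label matches,
# case-insensitively. Label strings come from the G.RELATION_LABEL_* constants
# above; the literal 'INVESTS_IN' below is illustrative:
#
#   for src_aid, label, dst_aid, props in company.get_relations('INVESTS_IN'):
#       print(src_aid, '->', dst_aid, props)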
def digest(self):
    # Keep data before we reconstruct it - to check at the end if there were changes
    if self.deduced:
        before_reconstruct = copy.deepcopy(self.deduced)
    else:
        before_reconstruct = None

    # Reset 'deduced' - we're starting *clean* when digesting
    me = self.deduced = {}

    self._digest_name()

    # Go over data of all providers
    for ds in self.sources():
        # Collect related investors from providers
        if C.RELATED_INVESTORS in ds:
            for investor in ds[C.RELATED_INVESTORS]:
                self._append_to_deduced(C.RELATED_INVESTORS, investor)
        # if 'name' not in me and 'name' in provider:
        #     me['name'] = provider['name']
        # TODO: revisit this code. We currently have only one provider, so we just copy the attributes values
        attrs = ["company_type", "crunchbase_url", "domain", "homepage_url", "stock_symbol",
                 "short_description", "image_url", "facebook_url", "twitter_url", "linkedin_url",
                 C.ADVISORS, C.FOUNDERS, C.CATEGORIES, C.TEAM, C.FOUNDING_YEAR, C.WEBSITE,
                 C.CRUNCHBASE_PERMALINK, C.BLOOMBERG_URL]
        for a in attrs:
            if a in ds:
                me[a] = ds[a]

    # Select the company logo
    self._digest_logos()
    self._digest_domain()
    self._digest_email_domains()
    self._digest_phones()
    self._digest_investors()
    # Go over related people - check if they are investors:
    self._digest_related_investors()
    self._digest_portfolio_companies()
    self._digest_related_vcs()
    self._digest_aliases()
    self._digest_employees_range()
    self._digest_exits()
    self._digest_organization_type()
    self._digest_email_convention()

    # Check if anything changed during digest:
    if before_reconstruct is None:
        return True
    added, removed, modified, same = AcureRateUtils.dict_compare(self.deduced, before_reconstruct)
    return len(added) > 0 or len(removed) > 0 or len(modified) > 0
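# dict_compare() is assumed to diff two flat dicts. A standalone sketch of the
# (added, removed, modified, same) contract the change-check above relies on:

def _sketch_dict_compare(new, old):
    new_keys, old_keys = set(new), set(old)
    shared = new_keys & old_keys
    added = new_keys - old_keys          # keys only in the new dict
    removed = old_keys - new_keys        # keys only in the old dict
    modified = {k for k in shared if new[k] != old[k]}
    same = shared - modified
    return added, removed, modified, same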
def enrich_person(self):
    result_obj = self._get_person_info()
    self.set_data("score", result_obj['likelihood'])

    contact_info = result_obj.get('contactInfo', None)
    if contact_info:
        if 'givenName' in contact_info:
            self.set_data(P.FIRST_NAME, contact_info['givenName'])
        if 'familyName' in contact_info:
            self.set_data(P.LAST_NAME, contact_info['familyName'])

    demographics = result_obj.get('demographics', None)
    if demographics:
        gender = demographics.get('gender', None)
        if gender:
            self.add_data(P.GENDER, gender.lower())
        loc = demographics.get('locationGeneral', None)
        if loc:
            self.add_data(P.LOCATIONS, loc)

    photos = result_obj.get('photos', None)
    if photos:
        for photo in photos:
            new_photo = {}
            m = {"url": P.PHOTO_URL, "typeName": P.PHOTO_SOURCE}
            AcureRateUtils.dict2dict(photo, new_photo, m)
            self.add_data(P.PHOTOS, new_photo)

    organizations = result_obj.get('organizations', None)
    if organizations:
        for org in organizations:
            new_job = {}
            m = {"name": P.JOB_NAME, "title": P.JOB_TITLE,
                 "current": P.JOB_CURRENT, "isPrimary": P.JOB_PRIMARY}
            AcureRateUtils.dict2dict(org, new_job, m)
            # If there are start/end dates, grab them (year only - drop the month)
            if 'startDate' in org:
                new_job[P.JOB_STARTED] = org['startDate'][0:4]
            if 'endDate' in org:
                new_job[P.JOB_ENDED] = org['endDate'][0:4]
            self.add_data(P.JOBS, new_job)

    social_profiles = result_obj.get('socialProfiles', None)
    if social_profiles:
        # Map FullContact profile type names to our social-URL properties
        social_url_properties = {
            'Twitter': P.TWITTER_URL,
            'LinkedIn': P.LINKEDIN_URL,
            'GooglePlus': P.GOOGLEPLUS_URL,
            'Facebook': P.FACEBOOK_URL,
            'Gravatar': P.GRAVATAR_URL,
            'Foursquare': P.FOURSQUARE_URL,
            'Pinterest': P.PINTEREST_URL,
            'Klout': P.KLOUT_URL,
            'AngelList': P.ANGELLIST_URL,
        }
        for social_profile in social_profiles:
            type_name = social_profile.get('typeName', '')
            if type_name in social_url_properties:
                self.set_data(social_url_properties[type_name], social_profile['url'])
            else:
                print('Something else...')

    # TODO: add all other attributes received from FullContact
    return [P.JOBS]
def _company_exists(company_name, cb_url=None, permalink=None):
    # Issue a request to the CB search server - if matches exist, compare
    # using the name, or cb_url/permalink if provided.
    # Truncate possible query parameters on the url
    if cb_url and cb_url.find('?') > 0:
        cb_url = cb_url[:cb_url.index('?')]
    company_name_clean = AcureRateUtils.clean_company_name(company_name)
    url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
    query = 'query=%s&facetFilters=' % company_name_clean.replace('&', '%26')
    payload = {"params": query,
               "apiKey": CrunchBaseScraperEngager.THE_KEY,
               "appID": CrunchBaseScraperEngager.APP_ID}
    headers = {'Content-Type': 'application/json; charset=utf-8',
               'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
               'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID}
    with requests_cache.disabled():
        response = requests.post(url, json=payload, headers=headers)
    # Quota exhaustion and any other HTTP error are fatal
    if response.status_code == 429:
        raise EngagementException("%s. Exceeded requests quota. Error: %s." % (response.status_code, response.text), fatal=True)
    if response.status_code != 200:
        raise EngagementException("%s. %s." % (response.status_code, response.text), fatal=True)
    if response.json()['nbHits'] == 0:
        raise EngagementException("CrunchBaseScraper: No hits returned when searching for %s (%s)." % (company_name_clean, company_name))
    # Check how many matches we have (if any)
    matches = []
    for company in response.json().get('hits', []):
        if company.get('type', '') == 'Organization' and company.get('organization', False) and 'name' in company:
            # Compare permalinks (case-insensitively on both sides)
            if 'permalink' in company and permalink and company['permalink'].lower() == permalink.lower():
                matches.append(company)
                break
            # Compare URLs
            if 'url' in company and cb_url and cb_url.endswith(company['url']):
                matches.append(company)
                break
            # Compare by name
            result_company_name_clean = AcureRateUtils.clean_company_name(company.get('name'))
            if result_company_name_clean.lower() == company_name_clean.lower():
                matches.append(company)
    if len(matches) == 0:
        raise EngagementException("CrunchBaseScraper: No match for %s (%s)" % (company_name_clean, company_name))
    if len(matches) > 1:
        raise EngagementException("CrunchBaseScraper: Ambiguous results - got %d hits for %s (%s)" % (len(matches), company_name_clean, company_name))
    return matches
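# A hypothetical usage sketch of _company_exists - the company name, url, and the
# surrounding error handling here are illustrative assumptions, not taken from the
# original source:
try:
    matches = CrunchBaseScraperEngager._company_exists(
        'Acme Analytics',
        cb_url='https://www.crunchbase.com/organization/acme-analytics')
    print('Found CrunchBase match: %s' % matches[0]['name'])
except EngagementException as e:
    # fatal errors (quota exhaustion / HTTP failures) should abort the engagement loop
    print('Lookup failed: %s' % e)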
def _get_person(self, full_name):
    f, m, l = AcureRateUtils.tokenize_full_name(full_name)
    q = {"deduced.first_name": f, "deduced.last_name": l}
    r = DBWrapper.get_persons(q, True)
    return AcureRatePerson().reconstruct(r) if r else None
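# A minimal sketch of the assumed AcureRateUtils.tokenize_full_name behavior (first /
# middle / last from a whitespace-separated name; the middle token is discarded by the
# query above); the real helper likely also handles prefixes, suffixes and multi-part
# surnames.
def tokenize_full_name(full_name):
    tokens = full_name.strip().split()
    if len(tokens) == 1:
        return tokens[0], None, None
    first, last = tokens[0], tokens[-1]
    middle = ' '.join(tokens[1:-1]) or None
    return first, middle, last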
def enrich_person(self):
    try:
        if P.ANGELLIST_URL not in self.enriched_entity.deduced:
            # Search google for the person - the search string: 'site:angel.co "ploni almoni"'
            url_prefix_1 = 'http://angel.co/'.lower()
            query = 'site:angel.co "%s"' % self.enrich_key
            res = search(query, tld='com', lang='en', num=3, start=0, stop=2, pause=2.0)
            matches = 0
            matched_url = None
            for url in res:
                url_lower = url.lower().replace('https', 'http')
                if url_lower.find(url_prefix_1) == 0:
                    matches += 1
                    matched_url = url  # keep the matching url - the loop may continue past it
            if matches == 0:
                raise EngagementException('Unable to locate information in angel.co on %s' % self.enrich_key)
            elif matches > 1:
                # TODO: we can improve search that will also consult working places and determine which person is the one we need... (try: Ariel Cohen)
                raise EngagementException('Unable to locate information in angel.co - more than one match on %s' % self.enrich_key)
            url = matched_url
            # Grab person id (the profile slug after the last slash) from the url
            p = url.rfind('/')
            person_id = url[p + 1:]
            self.set_data(P.ANGELLIST_ID, person_id)
            # TODO: look into the full url google returns - what is capId?
            self.set_data(P.ANGELLIST_URL, url)
        else:
            url = self.enriched_entity.deduced[P.ANGELLIST_URL]

        # -----------------
        # CHECK: https://angel.co/alberto-roman
        # -----------------
        headers = requests.utils.default_headers()
        headers.update({'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36'})

        # Get the person's page for parsing
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            s = 'Unable to load page in Angel.co on %s. Error: %s. (url=%s)' % (self.enrich_key, response.status_code, url)
            raise EngagementException(s)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get name
        try:
            elem = soup.find("h1", {"itemprop": "name"})
            if elem:
                name = elem.text.strip()
                self.set_data(P.FULL_NAME, name)
        except:
            self.logger.warning('Unable to locate name attribute for %s', self.enrich_key)

        # Get photo
        # Studied at...
        # Get socials - the four profile links share the same markup
        social_fields = [("linkedin_url", P.LINKEDIN_URL), ("twitter_url", P.TWITTER_URL),
                         ("facebook_url", P.FACEBOOK_URL), ("blog_url", P.BLOG_URL)]
        for data_field, prop in social_fields:
            try:
                elem = soup.find("a", {"data-field": data_field})
                if elem:
                    self.set_data(prop, elem['href'])
            except:
                self.logger.warning('Unable to locate social attribute for %s', self.enrich_key)

        # Get experience
        try:
            experience_elem = soup.find("div", {"class": "experience_container"})
            startup_roles = experience_elem.findAll("div", {"class": "startup_roles"})
            for review in startup_roles:
                current_job = {}
                # Get logo of job (TODO: captured but not stored yet)
                startup_photo_elem = review.find("div", {"class": "photo"})
                startup_photo_url = startup_photo_elem.find("img")['src']
                # Get details of job
                startup_text_elem = review.find("div", {"class": "text"})
                startup_elem = startup_text_elem.find("a", {"data-type": "Startup"})
                current_job[P.JOB_NAME] = startup_elem.text.strip()
                startup_angellist_url = startup_elem['href']
                # Get other details - title, date range, description
                more_details_elems = startup_text_elem.findAll("span")
                if len(more_details_elems) > 0:
                    current_job[P.JOB_TITLE] = more_details_elems[0].text.strip()
                if len(more_details_elems) > 1:
                    role_years = more_details_elems[1].text.strip()
                    s, e, c = AcureRateUtils.parse_date_range(role_years)
                    if s:
                        current_job[P.JOB_STARTED] = s
                    if e:
                        current_job[P.JOB_ENDED] = e
                    if c:
                        current_job[P.JOB_CURRENT] = c
                if len(more_details_elems) > 2:
                    role_description = more_details_elems[2].text.strip()  # TODO: currently unused
                self.add_data(P.JOBS, current_job)
        except:
            self.logger.warning('Unable to locate job title/name attribute for %s', self.enrich_key)

        # Get education records
        try:
            education_elem = soup.find("div", {"class": "education"})
            education_orgs = education_elem.findAll("div", {"class": "college-row-view"})
            for review in education_orgs:
                school = review.find("div", {"class": "school"}).text.strip()
                degree = review.find("div", {"class": "degree"}).text.strip()
                # TODO: the parsed school/degree are not stored yet
        except:
            self.logger.warning('Unable to locate education attribute for %s', self.enrich_key)

        # Get investments
        try:
            investments_list_elem = soup.find("div", {"class": "investment_list"})
            investments = investments_list_elem.findAll("div", {"class": "investment"})
            for investment in investments:
                company_name = investment.find("div", {"class": "company-link"}).text.strip()
                self.add_data(P.INVESTMENTS, company_name)
        except:
            self.logger.warning('Unable to locate investments attribute for %s', self.enrich_key)

        # Get references/reviews
        try:
            reviews_section_elem = soup.find("div", {"class": "reviews"})
            reviews_elem = reviews_section_elem.findAll("li", {"class": "review"})
            for review in reviews_elem:
                reference = {}
                reference[P.REFERER_REVIEW] = review.find("div", {"class": "review-content"}).text.strip()
                referencing_person_elem = review.find("div", {"class": "annotation"}).find("a", {"class": "profile-link"})
                reference[P.REFERER_NAME] = referencing_person_elem.text.strip()
                reference[P.REFERER_ANGELLIST_URL] = referencing_person_elem['href']
                self.add_data(P.REFERENCES, reference)
        except:
            self.logger.warning('Unable to locate references attribute for %s', self.enrich_key)

        # Get business locations
        # TODO..

        # Get business markets
        # TODO..

    except Exception as e:
        self.logger.error('Unable to enrich person %s. %s', self.enriched_entity, e)
        raise e

    return [P.FULL_NAME]
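# A minimal sketch of the assumed AcureRateUtils.parse_date_range contract, inferred
# from its call site in the experience loop above - given a string like
# "Jan 2012 - Present" it returns (start_year, end_year, is_current); the real helper
# likely covers more formats.
def parse_date_range(text):
    parts = [t.strip() for t in text.split('-')]
    start = parts[0][-4:] if parts and parts[0][-4:].isdigit() else None
    end, current = None, False
    if len(parts) > 1:
        if parts[1].lower() in ('present', 'current', 'now'):
            current = True
        elif parts[1][-4:].isdigit():
            end = parts[1][-4:]
    return start, end, current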