Пример #1
0
    def _digest_aliases(self):
        """Collect every alias known for this entity into the deduced ALIASES property.

        Aliases are gathered, in order, from each source (its name in
        aliasized and lower-cased form, the aliases the source itself carries,
        and its CrunchBase permalink with and without dashes) and finally from
        the last path element of the deduced LinkedIn and CrunchBase URLs.
        """
        deduced = self.deduced

        for source in self.sources():
            # The source's own name yields two aliases: aliasized and lower-cased
            if C.NAME in source:
                self._append_to_deduced(
                    C.ALIASES, AcureRateUtils.aliasize(source[C.NAME]))
                self._append_to_deduced(C.ALIASES, source[C.NAME].lower())

            # Aliases the source already carries
            for known_alias in source.get(C.ALIASES, []):
                self._append_to_deduced(C.ALIASES, known_alias)

            # The permalink is added verbatim and with dashes turned into spaces
            if P.CB_PERMALINK in source:
                self._append_to_deduced(C.ALIASES, source[P.CB_PERMALINK])
                self._append_to_deduced(
                    C.ALIASES, source[P.CB_PERMALINK].replace("-", " "))

        # Last path element of the profile urls
        # TODO: get also aliases from facebook and twitter and angellist...
        for url_property in (C.LINKEDIN_URL, C.CRUNCHBASE_URL):
            if url_property not in deduced:
                continue
            url_suffix = AcureRateUtils.get_url_last_path_element(
                deduced[url_property])
            if url_suffix:
                self._append_to_deduced(C.ALIASES, url_suffix)
Пример #2
0
    def import_entries(self):
        """Read the CSV file at ``self.path`` and hand every row to ``self.handle_row``.

        Each row is first cleaned (cells without a header are dropped and any
        header containing 'First Name' is renamed to exactly 'First Name'),
        then translated through ``self.columns_mapping()`` when a mapping is
        defined. ``handle_row`` receives the mapped row, the cleaned raw row
        and the 1-based row number; a truthy return counts as a success.

        Counters are kept in ``self.num_rows_handled`` and
        ``self.num_rows_succesfully_handled``. If the file does not exist an
        error is logged and the method returns without touching the counters.

        :return: None
        """
        # Check that file exists
        if not os.path.isfile(self.path):
            self.logger.error('Could not locate file (%s)', self.path)
            return

        # TODO: Check what file it is (Google/Outlook) - check the file was not violated - header file exists

        self.num_rows_handled = 0
        self.num_rows_succesfully_handled = 0

        # The context manager releases the file handle even when a row handler raises
        with codecs.open(self.path, 'r', self.encoding) as csv_file:
            for csv_row in csv.DictReader(csv_file):
                # TODO: remove this from here, it should not be in base class, but specific to contacts importing
                fixed_csv_row = {}
                for k, v in csv_row.items():
                    # DictReader files surplus cells (more cells than headers) under the key None
                    if k is None:
                        continue
                    # presumably guards against a decorated header (e.g. a leading BOM) - TODO confirm
                    k = 'First Name' if 'First Name' in k else k
                    fixed_csv_row[k] = v

                # If there's a mapping defined, use it (looked up once per row)
                mapping = self.columns_mapping()
                if mapping:
                    row = {}
                    AcureRateUtils.dict2dict(fixed_csv_row, row, mapping)
                else:
                    row = fixed_csv_row

                # The handler sees the raw row too, so it can check all fields, not only those mapped
                if self.handle_row(row, fixed_csv_row, self.num_rows_handled + 1):
                    self.num_rows_succesfully_handled += 1
                self.num_rows_handled += 1

                # Progress is logged for every row (no every-N-rows throttle is applied)
                self.logger.info('Done importing %d rows...', self.num_rows_handled)

        self.logger.info('Done importing all rows. Total: %d / Successful: %d', self.num_rows_handled, self.num_rows_succesfully_handled)
Пример #3
0
    def _digest_organization_type(self):
        """Deduce the organization type: venture capital, academy or (by default) company.

        Investor indications are looked for across all sources first; only if
        none is found are the sources scanned again for academy indications.
        """
        deduced = self.deduced

        # Known kinds: ACADEMY, GOVERNMENT, MILITARY, COMPANY, VENTURE-CAPITAL/INVESTOR

        # Any single source hinting at an investment company settles it
        for source in self.sources():
            hints_investor = (
                source.get(C.ORGANIZATION_TYPE) == 'investor'  # CrunchBaseScraper
                or C.INVESTMENT_COMPANY_TYPE in source
                or source.get(C.PRIMARY_ROLE) == 'investor'  # CrunchBaseBot
            )
            if hints_investor:
                deduced[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_VENTURE_CAPITAL
                return

        # TODO: improve this - we need to make sure all providers point at the same education - determine its Academy
        for source in self.sources():
            hints_academy = (
                source.get(C.ORGANIZATION_TYPE) == 'school'
                or (C.DOMAIN in source
                    and AcureRateUtils.is_academic_domain(source[C.DOMAIN]))
            )
            if hints_academy:
                deduced[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_ACADEMY
                return

        # Nothing said otherwise - it is a company
        deduced[C.ORGANIZATION_TYPE] = C.ORGANIZATION_TYPE_COMPANY
Пример #4
0
    def search_ngram(self, ngram, index):
        """Search F6S for startups whose name starts with ``ngram`` and write them out.

        The first element of the JSON response is treated as a header whose
        text starts with the number of matches; the remaining elements are the
        hits. Only hits marked 'Startup' whose lower-cased text starts with
        ``ngram`` are kept. When 20 or more such hits come back, the search is
        first repeated recursively for ``ngram`` extended by every character
        of ``self.all_valid_chars``; afterwards the current hits are written
        via ``self.extract_and_write``.

        Request failures, unparsable or empty responses and headers without a
        usable match count are reported on stdout and end the search for this
        ngram without raising.

        :param ngram: prefix to search for; compared against lower-cased hit names
        :param index: recursion depth; only forwarded (incremented) to recursive calls
        :return: None
        """
        print("%s: Searching for [%s]" %
              (AcureRateUtils.get_now_as_str(), ngram))

        search_url = F6SEngager.F6S_SEARCH_URL % ngram
        rc, response = self.perform_request(search_url)
        if rc != 200:
            print(">>> ERROR: %s: %s." % (rc, response))
            return

        # A 200 response is not guaranteed to carry valid JSON
        try:
            results = json.loads(response)
        except ValueError as err:
            print(">>> ERROR: unparsable response for [%s]: %s." % (ngram, err))
            return

        # Without a header element there is nothing to inspect
        if not isinstance(results, list) or not results or not isinstance(results[0], dict):
            print("F6S Scraper: Empty or unexpected response when searching for %s." % ngram)
            return

        header_text = results[0].get('text') or ''
        if header_text.find(' match') < 1:
            print("F6S Scraper: No mention of match(es) - %s." % header_text)
            return

        # The count is the first word of the header; a header such as
        # "No matches" passes the check above but carries no number
        try:
            num_matches = int(header_text.split(' ')[0].replace(',', ''))
        except ValueError:
            print("F6S Scraper: No mention of match(es) - %s." % header_text)
            return
        if num_matches == 0:
            print("F6S Scraper: No hits returned when searching for %s." %
                  ngram)
            return

        # Keep only 'Startup' hits whose name starts with the ngram
        startups_only = [
            res for res in results[1:]
            if res.get('rightText') == 'Startup'
            and res.get('text')
            and res['text'].lower().startswith(ngram)
        ]
        if not startups_only:
            return

        # 20 or more hits: refine by extending the ngram with each valid
        # character (presumably the response is capped at 20 - TODO confirm)
        if len(startups_only) >= 20:
            for next_char in self.all_valid_chars:
                self.search_ngram(ngram + next_char, index + 1)

        print("%s: Found %s results for [%s]. Writing:" %
              (AcureRateUtils.get_now_as_str(), len(startups_only), ngram))

        # Write to file
        self.extract_and_write(startups_only, ngram)
Пример #5
0
    def _digest_domain(self):
        """Deduce the DOMAIN property from the deduced WEBSITE, unless a domain is already set."""
        deduced = self.deduced

        # Only worth trying when no domain is known yet and a website is available
        if C.DOMAIN not in deduced and C.WEBSITE in deduced:
            website_domain = AcureRateUtils.get_domain(deduced[C.WEBSITE])
            if website_domain:
                deduced[C.DOMAIN] = website_domain
Пример #6
0
    def spawn_engagers_sequentially(self, providers, entity_type, entity, enrichment_behavior, enriched=False):
        """Run the given providers one after the other and merge their results into the entity.

        Results are merged into a deep copy of ``entity`` taken before any
        provider runs. The copy is digested and stored when a property changed
        (or ``enriched`` was already True) or when
        ``enrichment_behavior.force_save`` is set. The outcome is logged and,
        when ``enrichment_behavior.webhook`` is set, announced to it.

        :param providers: names of the providers to launch, in order
        :param entity_type: forwarded to the launcher and to the Store
        :param entity: the entity handed to every engager
        :param enrichment_behavior: supplies ``force``, ``force_save`` and ``webhook``
        :param enriched: initial value of the "changed" flag
        """
        # Snapshot taken up-front; all merging below is done on this copy
        merged_entity = copy.deepcopy(entity)

        # Launch every provider; a failing provider is logged and skipped
        launcher = EngagerLauncher()
        results_by_provider = {}
        for provider_name in providers:
            try:
                raw_result = launcher.launch(provider_name, entity_type, entity, enrichment_behavior.force)
                results_by_provider[provider_name] = EngagementResult.from_json_string(raw_result)
            except EngagementException as ee:
                self.logger.error('Failed to engage via %s on entity %s (exception: %s)', provider_name, entity.aid, ee)

        # Merge all results into the snapshot
        changed = enriched
        redigest_properties = {}
        ignored_statuses = (EngagementResult.SKIPPED, EngagementResult.NOCHANGE)
        for provider_name, engagement_result in results_by_provider.items():
            if engagement_result.status not in ignored_statuses:
                enrich_key = engagement_result.properties_changed['enrich_key']
                for prop_name, prop_value in engagement_result.properties_changed.items():
                    property_changed = merged_entity.set_data(provider_name, enrich_key, prop_name, prop_value)
                    # Changed triggering properties are collected for the redigest below
                    if property_changed and prop_name in LISTS.TRIGGERING_PROPERTIES:
                        redigest_properties[prop_name] = prop_value
                    changed |= property_changed
            self.logger.info('Done merging properties of %s. Changed = %s', provider_name, changed)

        if not (changed or enrichment_behavior.force_save):
            msg = 'Not stored. No change detected'
        else:
            merged_entity.last_update = datetime.datetime.now()
            merged_entity.digest()
            Store.set_entity(entity_type, merged_entity)
            msg = 'Stored in Store! (changed=%s, force_save=%s)' % (changed, enrichment_behavior.force_save)

            # Redigest other entities
            self.redigest(redigest_properties)

        self.logger.info(msg)

        # Report the outcome to the webhook, if one was requested
        if enrichment_behavior.webhook:
            payload = {'status_message': msg, 'status_code': 200, 'ts': time.time(), 'aid': merged_entity.aid}
            AcureRateUtils.announce(enrichment_behavior.webhook, payload)

        self.logger.info('Done merging enrichment result into entity. Changed = %s', changed)
Пример #7
0
    def _digest_employees_range(self):
        """Deduce EMPLOYEES_NUMBER as the highest value any source reports, then derive EMPLOYEES_RANGE from it."""
        deduced = self.deduced

        # Keep the largest number seen; the first one seen wins ties
        for source in self.sources():
            if C.EMPLOYEES_NUMBER not in source:
                continue
            if (C.EMPLOYEES_NUMBER not in deduced
                    or source[C.EMPLOYEES_NUMBER] > deduced[C.EMPLOYEES_NUMBER]):
                deduced[C.EMPLOYEES_NUMBER] = source[C.EMPLOYEES_NUMBER]

        # The range only makes sense once a number is known
        if C.EMPLOYEES_NUMBER in deduced:
            deduced[C.EMPLOYEES_RANGE] = AcureRateUtils.get_employees_range(
                deduced[C.EMPLOYEES_NUMBER])
Пример #8
0
 def attempt_parse(description):
     """Build an AcureRateJob named 'unknown' whose roles are the normalized titles found in ``description``.

     :param description: text passed to ``AcureRateUtils.normalized_titles``
     :return: the new job, with ``job_name`` and ``job_roles`` set
     """
     # TODO: implement using NLTK
     normalized = AcureRateUtils.normalized_titles(description)
     job = AcureRateJob()
     job.job_name = 'unknown'  # Use __init__ instead...
     job.job_roles = [
         {'job_role': title, 'job_seniority': seniority, 'job_area': area}
         for title, seniority, area in normalized
     ]
     return job
Пример #9
0
    def extract_and_write(self, startups, ngram):
        """Write every startup whose name starts with ``ngram`` to ``self.companies_file``.

        Each written line has the form ``text; type; value`` and is echoed to
        stdout with a timestamp. The file is flushed once at the end.

        :param startups: iterable of dicts with 'text', 'type' and 'value' keys
        :param ngram: prefix the lower-cased name must start with
        """
        for hit in startups:
            name, kind, value = hit['text'], hit['type'], hit['value']

            # A name starting with the separator character is wrapped in quotes
            if name.startswith(';'):
                name = "'%s'" % name

            # The prefix is checked after the optional quoting
            if not name.lower().startswith(ngram):
                continue

            self.companies_file.write('%s; %s; %s\n' % (name, kind, value))
            print('%s: %s, %s, %s' %
                  (AcureRateUtils.get_now_as_str(), name, kind, value))

        self.companies_file.flush()
Пример #10
0
    def _enrich_entity(self,
                       entity_type,
                       enrichment_key,
                       enrichment_behavior,
                       enrichment_data=None,
                       enrichment_source=None):
        """
        Enrich the entities matching a key - with provided data, external providers, or both.

        Entities are loaded from the Store by ``enrichment_key``. When none is found and
        ``enrichment_behavior.create_new`` is set, a new person/company entity is created.
        Errors raised inside the enrichment itself are logged and, when a webhook is
        configured, announced to it; they are not propagated to the caller.

        :param entity_type: collection to look in; 'people' and 'company' are the values handled when creating a new entity
        :param enrichment_key: the search key to be used to retrieve the object
        :param enrichment_behavior: object determining external enrichment, dates, force, new, etc.
        :param enrichment_data: list of EnrichmentData rows to add (len() and append() are called on it)
        :param enrichment_source: an EnrichmentSource object specifying the source of the added data
        :return: the ``updated_entities`` list
            (NOTE(review): nothing in this method appends to it, so it is always empty - confirm intent)
        :raises EnrichmentException: BAD_REQUEST, when enrichment_data is given without enrichment_source
        """

        # NOTE(review): status_code and status_message are assigned but not used in this method
        status_code = EnrichmentException.ALL_OK
        status_message = "Enrichment completed succesfully (behavior: %s)" % str(
            enrichment_behavior)

        # Validate parameters (raised before the try below, so this one reaches the caller)
        if enrichment_data and not enrichment_source:
            raise EnrichmentException(
                "Cannot enrich with additional data without enrichment source.",
                EnrichmentException.BAD_REQUEST)

        # Decide which external providers are to be used (all, selective list or empty list)
        providers = self._decide_providers(enrichment_behavior)

        try:
            updated_entities = []
            changed = False
            # Get the matching entities from the Store
            # TODO: in case too many results are returned - they are in-memory - need to limit
            entities = Store.get_entities(
                entity_type,
                enrichment_key,
                single_result=False,
                mongo_query=enrichment_behavior.mongo_query)
            if len(entities) == 0:
                if enrichment_behavior.create_new:
                    self.logger.info(
                        'Enriching on %s. Could not locate entities in %s collection, creating a new entity.',
                        enrichment_key, entity_type)
                    # NOTE(review): for any other entity_type the list stays empty and nothing is enriched
                    if entity_type == 'people':
                        entities = [AcureRatePerson()]
                    elif entity_type == 'company':
                        entities = [AcureRateCompany()]
                    # If no provider, add a Dummy engager, so the system digests and stores the data
                    if not providers:
                        providers = ['System']
                    elif 'System' not in providers:
                        providers.insert(0, 'System')
                else:
                    msg = 'Attempting enrichment on key %s. Could not locate entities matching key (Behavior::create_new = False)' % enrichment_key
                    raise EnrichmentException(
                        msg, EnrichmentException.CONTACT_NOT_FOUND)
            elif len(entities) > 1 and not enrichment_behavior.enrich_multiple:
                msg = 'Enrichment data %s returns %d entities but enrich_multiple=False. Not enriching' % (
                    enrichment_key, len(entities))
                raise EnrichmentException(
                    msg, EnrichmentException.MULTIPLE_CONTACTS)

            # Go over all entities retrieved from store (per given key)
            #with ClusterRpcProxy(EnrichmentServiceConfig.AMQP_CONFIG, timeout=None) as rpc:
            # The RPC proxy above is disabled; rpc is only referenced by the unreachable else-branch below
            rpc = None
            if True:
                for entity in entities:
                    # If new enriched data provided, merge it into received entity
                    if enrichment_data and len(enrichment_data) > 0:
                        # NOTE(review): this appends to the caller's list once per entity, so with
                        # several entities the list accumulates multiple 'last_run_time' rows
                        enrichment_data.append(
                            EnrichmentData('last_run_time',
                                           datetime.datetime.now(),
                                           'override-no-change'))
                        # enrichment_data.append(EnrichmentData('data_source', enrichment_source.source_type, 'override'))
                        # enrichment_data.append(EnrichmentData('enrich_key', enrichment_source.source_key, 'override'))
                        changed |= entity.merge_data(
                            enrichment_source.source_type,
                            enrichment_source.source_key, enrichment_data)
                        #changed |= entity.merge_data('System', 'nokey', enrichment_data)
                    # NOTE(review): 'changed' is shared across entities and is replaced (not OR-ed)
                    # by the result of digest()
                    if changed or enrichment_behavior.digest:
                        changed = entity.digest()

                    # Initiate engagement manager to enrich via providers
                    # (only the sequential, in-process path is active; the else-branch is unreachable)
                    if True:
                        EngagementManager().spawn_engagers_sequentially(
                            providers, entity_type, entity,
                            enrichment_behavior, changed)
                    else:
                        rpc.engagement_manager.spawn_engagers.call_async(
                            providers, entity_type, entity.to_json_string(),
                            enrichment_behavior.force,
                            enrichment_behavior.force_save)
        except EnrichmentException as e:
            # Expected enrichment failures: warn and report the exception's own code to the webhook
            self.logger.warning(e)
            if enrichment_behavior.webhook:
                # NOTE(review): relies on EnrichmentException exposing .message and .code - confirm
                r = AcureRateUtils.announce(
                    enrichment_behavior.webhook, {
                        'status_message': e.message,
                        'status_code': e.code,
                        'ts': time.time()
                    })
                if r:
                    self.logger.info(
                        'Sent post request to webhook at %s. Content: %s. Code: %s',
                        enrichment_behavior.webhook, r.content, r.status_code)
        except Exception as e:
            # Anything else: log with traceback and report FATAL_ERROR to the webhook
            msg = 'Failed to enrich %s entity. Key: %s. Reason: %s' % (
                entity_type, enrichment_key, e)
            self.logger.error(msg, exc_info=True)
            if enrichment_behavior.webhook:
                r = AcureRateUtils.announce(
                    enrichment_behavior.webhook, {
                        'status_message': msg,
                        'status_code': EnrichmentException.FATAL_ERROR,
                        'ts': time.time()
                    })
                if r:
                    self.logger.info(
                        'Sent post request to webhook at %s. Content: %s. Code: %s',
                        enrichment_behavior.webhook, r.content, r.status_code)

        return updated_entities
Пример #11
0
 def __str__(self):
     """Return the string form produced by ``AcureRateUtils.obj2string`` for this object."""
     return AcureRateUtils.obj2string(self)
Пример #12
0
    def enrich_company(self):
        """Scrape the company's CrunchBase organization page and record what is found.

        The page URL comes from the deduced CrunchBase URL when present,
        otherwise it is built from the deduced (or name-derived) permalink.
        Once the company name heading is found, every further page section is
        scraped best-effort: a section that is missing or fails to parse is
        skipped (some report the error on stdout) and the remaining sections
        are still attempted. The browser is always closed, also when an
        exception propagates.

        :return: ``[C.NAME]``, or None when the entity has no deduced name
        :raises EngagementException: when the name heading cannot be located
            on the loaded page
        """
        company_name = self.enriched_entity.deduced.get(C.NAME, None)
        if company_name is None:
            self.logger.warning('Unable to enrich company. No name detected in entity: %s', self.enriched_entity)
            return

        # If there's a permalink, use it, otherwise try creating one
        if C.CRUNCHBASE_URL in self.enriched_entity.deduced:
            url = self.enriched_entity.deduced[C.CRUNCHBASE_URL]
            if url.find('/organization') == 0:
                url = 'https://www.crunchbase.com' + url
            permalink = AcureRateUtils.get_url_last_path_element(url)
        else:
            permalink = self.enriched_entity.deduced.get(C.CRUNCHBASE_PERMALINK,
                                                         CrunchBaseEngager.formalize_permalink(company_name))
            url = 'https://www.crunchbase.com/organization/%s#/entity' % permalink

        # NOTE(review): the chromedriver location is hard-coded to a Windows path
        driver = webdriver.Chrome(r'C:\Python353\browser_drivers\chromedriver')

        def sibling_of(label):
            """Return the element directly following the <dt> whose text is ``label``."""
            dt_element = driver.find_element_by_xpath('//dt[text()="%s"]' % label)
            return dt_element.find_element_by_xpath("following-sibling::*[1]")

        try:
            driver.implicitly_wait(20)  # seconds

            # Activate the driver
            driver.get(url)

            # If we got to here, keep the permalink
            self.set_data(C.CRUNCHBASE_PERMALINK, permalink)

            # Get company name - the only mandatory element of the page
            try:
                name = driver.find_element_by_id('profile_header_heading').text
                self.set_data(C.NAME, name)
                # The page is loaded; a short wait is enough for the optional sections
                driver.implicitly_wait(2)  # seconds
            except Exception:
                # TODO: there should be a smarter way to understand we got 404...
                s = "Failed to enrich %s. Unable to locate name entity in page - %s - something went awry... dumping this crawl." % (company_name, url)
                raise EngagementException(s)

            # Get company logo
            try:
                content = driver.find_element_by_class_name('logo-links-container')
                logo_url = content.find_element_by_css_selector("div > img").get_attribute("src")
                self.add_data(C.LOGOS, {C.LOGO_URL: logo_url, C.LOGO_SOURCE: 'crunchbase'})
            except Exception:
                pass

            # Get overview stats (acquisitions, total funds, etc.)
            try:
                stats = driver.find_element_by_class_name('overview-stats').text
                if stats.strip() != "":
                    self.set_data(C.STATS, stats)
                    stats_lower = stats.replace('\n', ' ').lower()
                    if 'acquired by' in stats_lower and stats_lower.find(' on ') > 0:
                        # 12 = len('acquired by') plus the separator that follows it
                        acquiring_company = stats[stats_lower.find('acquired by')+12:stats_lower.find(' on ')]
                        self.set_data(C.ACQUIRED_BY, acquiring_company)
            except Exception:
                pass

            # Get headquarters
            try:
                self.set_data(C.HEADQUARTERS, sibling_of("Headquarters:").text)
            except Exception:
                pass

            # Get description
            try:
                self.set_data(C.DESCRIPTION, sibling_of("Description:").text)
            except Exception:
                pass

            # Get founders (names only)
            try:
                founder_links = sibling_of("Founders:").find_elements_by_css_selector('a')
                founders = [link.get_attribute("data-name") for link in founder_links]
                self.set_data(C.FOUNDERS, founders)
            except Exception as e:
                print(e)

            # Get categories (stripped, so "a, b" and "a,b" yield the same values)
            try:
                categories = sibling_of("Categories:").text
                for category in categories.split(","):
                    self.add_data(C.CATEGORIES, category.strip())
            except Exception:
                pass

            # Get web-site
            try:
                website_url = sibling_of("Website:").find_element_by_css_selector('a').get_attribute("href")
                self.set_data(C.WEBSITE, website_url)
            except Exception:
                pass

            # Get socials (the label on the page carries a trailing space)
            try:
                social_links_elems = sibling_of("Social: ").find_elements_by_tag_name('a')
                for social_elem in social_links_elems:
                    social_type = social_elem.get_attribute('data-icons')  # "facebook", "twitter", etc.
                    social_link = social_elem.get_attribute('href')
                    if social_type == 'facebook':
                        self.set_data(C.FACEBOOK_URL, social_link)
                    elif social_type == 'twitter':
                        self.set_data(C.TWITTER_URL, social_link)
            except Exception as e:
                print(e)

            # Get founding year
            try:
                self.set_data(C.FOUNDING_YEAR, sibling_of("Founded:").text)
            except Exception:
                pass

            # Get contact email - for emails-domain info
            try:
                contact_info = sibling_of("Contact:").text
                tokens = contact_info.split(' ')  # contact info may be structured: email | Telephone
                email_domain = EmailUtil.get_email_domain_part(tokens[0])
                if email_domain:
                    self.add_data(C.EMAIL_DOMAINS, email_domain)
            except Exception:
                pass

            # Get aliases
            try:
                for alias in sibling_of("Aliases:").text.split(", "):
                    self.add_data(C.ALIASES, alias)
            except Exception:
                pass

            # Get company type
            try:
                self.set_data(C.INVESTMENT_COMPANY_TYPE, sibling_of("Type:").text)
            except Exception:
                pass

            # Get sectors (in case it's investor company)
            try:
                for sector in sibling_of("Sectors:").text.split(", "):
                    self.add_data(C.CATEGORIES, sector)
            except Exception:
                pass

            # Get Investment Size (in case it's investor company)
            try:
                investments_size_str = sibling_of("Investment Size:").text
                self.set_data(C.INVESTMENTS_RANGE, investments_size_str.replace(" ", ""))
            except Exception:
                pass

            # Get investments regions (in case it's investor company)
            try:
                for region in sibling_of("Regions:").text.split(", "):
                    self.add_data(C.INVESTMENT_REGIONS, region)
            except Exception:
                pass

            # Get employees range
            try:
                employees_range_str = sibling_of("Employees:").text
                none_found_at = employees_range_str.find('None found')
                if none_found_at < 0:
                    self.set_data(C.EMPLOYEES_RANGE, employees_range_str.replace(" ", ""))
                elif none_found_at > 0:
                    # Keep only the part before the "|" separator, cut BEFORE the
                    # spaces are removed so the cut lands on the separator
                    range_part = employees_range_str.partition("|")[0]
                    self.set_data(C.EMPLOYEES_RANGE, range_part.replace(" ", ""))
            except Exception:
                pass

            # Get investors
            try:
                investors = []
                investors_tables = driver.find_elements_by_css_selector(".table.investors")
                if len(investors_tables) > 0:
                    # A 2-column row continues the investor of the preceding
                    # 3-column row, so name and type are carried across rows
                    investor_name = None
                    investor_type = "person"
                    # Only <tbody> rows are visited, which skips the header row of the table
                    for table_body in investors_tables[0].find_elements_by_tag_name("tbody"):
                        for row in table_body.find_elements_by_tag_name("tr"):
                            cols = row.find_elements_by_tag_name("td")
                            if len(cols) == 3:
                                investor_name = cols[0].text
                                funding_round = cols[1].text
                                partner = cols[2].text
                                investor_permalink = cols[0].find_element_by_class_name("follow_card").get_attribute('data-permalink') or ''
                                if "/organization" in investor_permalink:
                                    investor_type = "organization"
                                else:
                                    investor_type = "person"
                            elif len(cols) == 2 and investor_name is not None:
                                funding_round = cols[0].text
                                partner = cols[1].text
                            else:
                                # Unexpected layout, or a continuation row with no investor before it
                                continue
                            investment = "%s / %s" % (partner, funding_round)
                            investors.append((investor_name, investor_type, investment))
                    if len(investors) > 0:
                        self.set_data(C.INVESTORS, investors)
            except Exception as e:
                print(e)

            # TODO: get Acquisitions

            # Get current team (names only)
            current_team = []
            try:
                people_table = driver.find_elements_by_class_name('people')
                # NOTE(review): the second 'people' element is the one read - presumably
                # the first is not the team list; confirm against the page markup
                if len(people_table) > 1:
                    # TODO: get the person title - we don't want developers here...
                    for person in people_table[1].find_elements_by_css_selector("li"):
                        name_element = person.find_element_by_css_selector("h4 a")
                        current_team.append(name_element.get_attribute('data-name'))
            except Exception as e:
                print(e)

            # Get past team (appended to the same list)
            try:
                people_table = driver.find_elements_by_class_name('past_people')
                if len(people_table) > 0:
                    # TODO: get the person title - we don't want developers here...
                    for person in people_table[0].find_elements_by_css_selector("li"):
                        name_element = person.find_element_by_css_selector("h4 a")
                        current_team.append(name_element.get_attribute('data-name'))
            except Exception as e:
                print(e)

            # Store past & current team
            if len(current_team) > 0:
                self.set_data(C.TEAM, current_team)

            # Get board members and advisors
            try:
                advisors = []
                # Advisors are only read when no 'no-data' advisors element is present
                advisors_table = driver.find_elements_by_css_selector('.base.no-data.advisors')
                if len(advisors_table) == 0:
                    advisors_table = driver.find_elements_by_css_selector('.base.advisors')
                    if len(advisors_table) > 0:
                        # TODO: check that investors is person and not organization
                        advisors = [
                            advisor_element.get_attribute('data-name')
                            for advisor_element in advisors_table[0].find_elements_by_css_selector("h4 a")
                        ]
                    if len(advisors) > 0:
                        self.set_data(C.ADVISORS, advisors)
            except Exception as e:
                print(e)

        finally:
            # Release the browser on success and on failure alike
            driver.close()

        return [C.NAME]
    def enrich_person(self):
        """Enrich the current person from the CrunchBase Algolia search index.

        Posts a search for ``self.enrich_key`` and keeps only hits whose
        ``type`` is ``'Person'``, whose ``person`` field is truthy and whose
        ``name`` equals ``self.enrich_key`` exactly. Exactly one such hit is
        required; its name, photo, location, social links, current job,
        primary role and investor information are then stored via
        ``set_data`` / ``add_data``.

        Every exception raised inside (including the EngagementException
        instances raised here) is caught and logged; nothing propagates to
        the caller.
        NOTE(review): that also swallows the ``fatal=True`` quota errors -
        confirm this is intended.

        Returns:
            A one-element list holding ``P.FULL_NAME``.
        """
        # Local import: the file's import block is not visible from here.
        from urllib.parse import quote

        try:
            # TODO: improve - run 3 searches - by full name, first name and last name. Check all results agains P.possible_names...
            url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
            # 'params' is itself a URL query string, so the search term must be
            # percent-encoded; otherwise '&', '=', '#' or '%' in a name corrupt it.
            query = 'query=%s&facetFilters=' % quote(str(self.enrich_key), safe='')
            payload = {
                "params": query,
                "apiKey": CrunchBaseScraperEngager.THE_KEY,
                "appID": CrunchBaseScraperEngager.APP_ID
            }
            headers = {
                'contentType': 'application/json; charset=utf-8',
                'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
                'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID
            }
            response = requests.post(url, json=payload, headers=headers)
            if response.status_code == 429:
                raise EngagementException(
                    "%s. Exceeded requests quota. Error: %s." %
                    (response.status_code, response.text),
                    fatal=True)
            if response.status_code != 200:
                raise EngagementException(
                    "%s. %s." % (response.status_code, response.text),
                    fatal=True)

            # Parse the body once and reuse it
            results = response.json()
            if results.get('nbHits') == 0:
                raise EngagementException(
                    "No hits returned when searching for %s." %
                    self.enrich_key)

            # Check how many matches we have (if any)
            matches = [
                hit for hit in results.get('hits', [])
                if hit.get('type', '') == 'Person'
                and hit.get('person', False)
                and hit.get('name', '') == self.enrich_key
            ]
            if len(matches) == 0:
                raise EngagementException(
                    "None of the hits match the person name we're searching for (%s)."
                    % self.enrich_key)
            if len(matches) > 1:
                raise EngagementException(
                    "Person name is ambiguous - got %d hits for %s. Not enriching."
                    % (len(matches), self.enrich_key))

            # Exactly one match is left at this point
            # TODO: in the future, add the other persons we found to Queue for further enrichment
            person = matches[0]

            # Grab name
            f, m, l = AcureRateUtils.tokenize_full_name(person['name'])
            self.set_data(P.FIRST_NAME, f)
            self.set_data(P.LAST_NAME, l)
            if m:
                self.set_data(P.MIDDLE_NAME, m)

            # Grab person photo
            if 'logo_url' in person:
                self.add_data(P.PHOTOS, {
                    P.PHOTO_URL: person['logo_url'],
                    P.PHOTO_SOURCE: 'crunchbase'
                })

            # Grab location
            if 'location_name' in person:
                self.add_data(P.LOCATIONS, person['location_name'])

            # Grab socials
            if 'permalink' in person:
                self.set_data(P.CB_PERMALINK, person['permalink'])
            if 'url' in person:
                self.set_data(P.CRUNCHBASE_URL, person['url'])
            if 'linkedin_url' in person:
                self.set_data(P.LINKEDIN_URL, person['linkedin_url'])
            if 'twitter_url' in person:
                self.set_data(P.TWITTER_URL, person['twitter_url'])

            # Grab current position
            title = person.get('title')
            company = person.get('organization_name')
            if title and company:
                current_job = {
                    P.JOB_CURRENT: True,
                    P.JOB_TITLE: title,
                    P.JOB_NAME: company
                }
                self.add_data(P.JOBS, current_job)
                if AcureRateUtils.is_business(title):
                    self.logger.info('---->> %s - %s @ %s', person['name'],
                                     title, company)

            # Grab primary role (set whenever both fields exist, even if empty)
            if title is not None and company is not None:
                self.set_data(P.PRIMARY_ROLE, '%s @ %s' % (title, company))

            # Set as business as person was found in CB...
            self.set_data(P.BUSINESS, True)
            self.set_data(P.BUSINESS_REASON, 'appears in CB')

            # Investor?
            if 'n_investments' in person and person['n_investments'] > 0:
                self.set_data(P.INVESTOR, True)
                self.set_data(P.INVESTOR_REASON,
                              '%s investments' % person['n_investments'])
                self.logger.info('--==--==-->> Worth looking into %s',
                                 person['name'])
        except Exception as e:
            self.logger.error(
                'Failed to set some properties on person %s. Returning partial. (exception: %s)',
                self.enriched_entity, e)
        return [P.FULL_NAME]
    def enrich_company(self):
        """Enrich the current company from its StartupNationCentral page.

        Finds the company page through a site-restricted Google search for
        the exact company name (investor section for venture-capital
        organizations, company section otherwise), downloads it and scrapes
        name, descriptions, logo, labeled facts, categories, team, founders
        and - for venture capital - portfolio companies.

        Each attribute is scraped on a best-effort basis: a failure is logged
        as a warning and scraping continues with the next attribute.

        Returns:
            An empty list.

        Raises:
            EngagementException: when the search does not return exactly one
                result, or the page does not load with HTTP 200.
            Exception: any other failure; it is logged and re-raised.
        """
        try:
            # Construct URL to look for company
            company_name = self.enriched_entity.deduced[C.NAME]
            org_type = self.enriched_entity.deduced.get(
                C.ORGANIZATION_TYPE, None)
            is_venture_capital = org_type == C.ORGANIZATION_TYPE_VENTURE_CAPITAL
            end_point = 'i' if is_venture_capital else 'c'
            url = '%s/%s' % (self.BASE_URL, end_point)

            # Search Google for the exact URL
            result_urls = AcureRateUtils.google_search(
                site=url, query='"%s"' % company_name)
            # TODO: it is possible that more than 1 result is returned, and the first is ok. Need to compare name.
            if len(result_urls) != 1:
                raise EngagementException(
                    'Unable to locate results page for company %s' %
                    company_name)

            # Get the company's page for parsing
            page_url = result_urls[0]
            response = requests.get(page_url)
            if response.status_code != 200:
                s = 'Unable to load page in StartupNationCentral.org on %s. Error: %s. (url=%s)' % (
                    self.enrich_key, response.status_code, page_url)
                raise EngagementException(s)

            # Store the company's own page, not the shared search prefix
            self.set_data(C.STARTUPNATIONCENTRAL_URL, page_url)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get name
            try:
                name = soup.find("h1", {"class": "company__title"}).text
                self.set_data(C.NAME, name)
            except Exception:
                self.logger.warning('Unable to locate name attribute for %s',
                                    self.enrich_key)

            # Get information if company was ACQUIRED or CLOSED
            # TODO...

            # Get short description
            try:
                short_description = soup.find(
                    "div", {"class": "company__short-description"}).text
                self.set_data(C.SHORT_DESCRIPTION,
                              short_description.replace('\n', '').strip())
            except Exception:
                self.logger.warning(
                    'Unable to locate short description attribute for %s',
                    self.enrich_key)

            # Get description
            # NOTE(review): this reads the same element as the short
            # description above - looks like a copy/paste slip; confirm the
            # selector of the full description on the page.
            try:
                description = soup.find(
                    "div", {"class": "company__short-description"}).text
                self.set_data(C.DESCRIPTION,
                              description.replace('\n', '').strip())
            except Exception:
                self.logger.warning(
                    'Unable to locate description attribute for %s',
                    self.enrich_key)

            # Get company logo
            try:
                logo_elem = soup.find("img", {"class": "company__logo"})
                self.set_data(C.LOGO_URL, self.BASE_URL + logo_elem['src'])
            except Exception:
                self.logger.warning(
                    'Unable to locate company logo attribute for %s',
                    self.enrich_key)

            # Facts rendered as "<strong>Label</strong>" next to their value:
            # (page label, tag holding the value, target key, name for the log)
            labeled_fields = (
                ('Homepage', 'a', C.DOMAIN, 'homepage'),
                ('Sector', 'a', C.SECTOR, 'sector'),
                ('Founded', 'div', C.FOUNDING_YEAR, 'founding year'),
                ('Business Model', 'a', C.BUSINESS_MODEL, 'business model'),
                ('Funding Stage', 'div', C.FUNDING_STAGE, 'funding stage'),
                ('Employees', 'div', C.EMPLOYEES_RANGE, 'employee range'),
                ('Product Stage', 'div', C.PRODUCT_STAGE, 'product stage'),
            )
            for label, value_tag, key, what in labeled_fields:
                try:
                    value = soup.find(
                        "strong", string=label).parent.find(value_tag).text
                    self.set_data(key, value)
                except Exception:
                    self.logger.warning(
                        'Unable to locate %s attribute for %s', what,
                        self.enrich_key)

            # Get categories
            try:
                for elem in soup.findAll("a", {"class": "tags__tag"}):
                    self.add_data(C.CATEGORIES, elem.text)
            except Exception:
                self.logger.warning(
                    'Unable to locate categories attribute for %s',
                    self.enrich_key)

            # TODO: get Address (not implemented yet)

            # Get the team
            try:
                for elem in soup.findAll("div",
                                         {"class": "company-team__info"}):
                    name_elem = elem.find("div",
                                          {"class": "company-team__name"})
                    self.add_data(C.TEAM, name_elem.text)

                    # TODO: enrich the person with this position
                    position_elem = elem.find(
                        "div", {"class": "company-team__position"})
                    if position_elem is None:
                        # No position listed: keep the member, but founder
                        # status cannot be judged; go on with the next one.
                        continue
                    the_position = position_elem.text.lower()
                    # 'founder' also matches cofounder / co-founder / co founder
                    if 'founder' in the_position or 'owner' in the_position:
                        self.add_data(C.FOUNDERS, name_elem.text)
            except Exception:
                self.logger.warning(
                    'Unable to locate team members attribute for %s',
                    self.enrich_key)

            # If this is an investment company, get their portfolio companies
            if is_venture_capital:
                # TODO: Garb these fields:
                # TODO: 'In Israel Since', 'Investment Stages', 'Min Amount', 'Max Amount', 'Capital Managed', 'Industry Preferences'
                try:
                    portfolio_cards = soup.findAll(
                        "div", {"class": "investor-portfolio__company"})
                    for elem in portfolio_cards:
                        company_name_elem = elem.find(
                            "h2", {"class": "company-card__title"})
                        self.add_data(C.PORTFOLIO_COMPANIES,
                                      company_name_elem.text)
                        # TODO: grab more info from portfolio cards: logo, website url, short description - enrich the company data
                except Exception:
                    self.logger.warning(
                        'Unable to locate portfolio companies for %s',
                        self.enrich_key)

        except Exception as e:
            self.logger.error('Unable to enrich company %s. %s',
                              self.enriched_entity, e)
            raise

        return []
Пример #15
0
    def generate_companies_map(self):
        """Build the founder-to-targeted-companies referral graph and write it out.

        The graph is cleared and the founder added as its root. Two kinds of
        referrals are then linked in:

        1. Founder contacts with a job at one of the targeted companies whose
           title there is senior (per ``AcureRateUtils.is_senior``) or
           contains 'Director'.
        2. Founder contacts marked as investor/business whose own high-profile
           business contacts are related to a targeted company and hold a
           title there; both the contact and the contact's contact are added.

        The result is written to 'companies_map.json'.
        """
        self._clean_graph()

        # Add founder to graph
        founder_node_id = self._add_entity_to_d3_json_graph(
            self.founder.deduced[P.FULL_NAME], self.founder, FindReferrals.GROUP_FOUNDER)

        targetted_companies = ["Carmel Ventures", "Intel Capital", "Evergreen Venture Partners",
                               "Gemini Israel Ventures", "Pitango Venture Capital", "Apax Partners",
                               "Qumra Capital", "JVP"]

        def founder_contacts_clause():
            # Fresh dict on every call so the queries never share state.
            return {"$or": [{"data_sources.GoogleContacts.attribution_id": self.founder_aid},
                            {"data_sources.LinkedInContacts.attribution_id": self.founder_aid}]}

        targetted_companies_map = {}
        for company_name in targetted_companies:
            # Get company details from db (looked up by lower-cased alias)
            company_r = DBWrapper.get_companies({"deduced.aliases": company_name.lower()}, True)
            if company_r is None:
                continue
            company = AcureRateCompany.reconstruct(company_r)
            targetted_companies_map[company_name] = company

            # Get all people who are (a) in founder's contacts; (b) worked in this company.
            # The name is escaped so regex metacharacters in it are matched literally.
            regx = re.compile(re.escape(company_name), re.IGNORECASE)
            query = {"$and": [{"deduced.jobs.job_name": regx}, founder_contacts_clause()]}
            for r in DBWrapper.get_persons(query):
                person = AcureRatePerson.reconstruct(r)
                person.deduced['company_referred'] = company.deduced
                title = person.title_at(company.deduced[C.NAME])
                if not title:
                    continue
                # TODO: temp code:
                if not AcureRateUtils.is_senior(company_r, title) and 'Director' not in title:
                    continue
                person.deduced['title_at_company_referred'] = title + " @ " + company_name
                # TODO: complete this...
                if 'president' in title.lower():  # TODO: remove... done to catch Miki Migdal... need to use isSenior
                    deduced_link_type = FindReferrals.LINK_TYPE_MOVER_AND_SHAKER
                else:
                    deduced_link_type = FindReferrals.LINK_TYPE_DEFAULT
                # Create in graph the referral node and link to it
                person_node_id = self._add_entity_to_d3_json_graph(
                    person.deduced[P.FULL_NAME], person, FindReferrals.GROUP_REFERRALS)
                self._add_link_to_d3_json_graph(
                    founder_node_id, person_node_id,
                    value=FindReferrals.LINK_STRENGTH_MEDIUM, link_type=deduced_link_type)

        # Get all people who are (a) in founder's contacts; (b) have related investors
        query = {"$and": [{"$or": [{"deduced.investor": {"$exists": True}},
                                   {"deduced.business": {"$exists": True}}]},
                          founder_contacts_clause()]}
        # Materialize the cursor before running nested queries below
        contacts = [AcureRatePerson.reconstruct(r) for r in DBWrapper.get_persons(query)]
        for contact in contacts:
            contact_contacts = contact.business_related_contacts(high_profile=True)
            for contact_contact_name, contact_contact_relation, contact_contact_company in contact_contacts:
                r = DBWrapper.get_persons({"deduced.full_name": contact_contact_name}, True)
                if not r:
                    continue
                contact_contact = AcureRatePerson.reconstruct(r)
                for company_name, company in targetted_companies_map.items():
                    if not contact_contact.is_related_to_companies(company.deduced[C.ALIASES]):
                        continue

                    # Create in graph the referral node and link to it.
                    # NOTE(review): this happens before the title check below, so the
                    # contact stays in the graph even when the contact's contact is
                    # dropped - confirm that is intended.
                    contact_node_id = self._add_entity_to_d3_json_graph(
                        contact.deduced[P.FULL_NAME], contact, FindReferrals.GROUP_REFERRALS)
                    self._add_link_to_d3_json_graph(
                        founder_node_id, contact_node_id,
                        value=FindReferrals.LINK_STRENGTH_MEDIUM,
                        link_type=FindReferrals.LINK_TYPE_MOVER_AND_SHAKER)

                    # Create the contact's contact that will lead to the company
                    contact_contact.deduced['company_referred'] = company.deduced
                    # NOTE(review): title_at gets the alias list here but a single
                    # name in the first loop - verify which one it expects.
                    title = contact_contact.title_at(company.deduced[C.ALIASES])
                    if not title:
                        # no title, we can't know if it's a "serious" connection
                        continue
                    contact_contact.deduced['title_at_company_referred'] = title + " @ " + company_name

                    relation_phrase = FindReferrals._generate_referral_2_investor_phrase(
                        contact, contact_contact_name, contact_contact_relation, contact_contact_company)
                    contact_contact.deduced['referral'] = contact.deduced[P.FULL_NAME]
                    contact_contact.deduced['relation_phrase'] = relation_phrase

                    link_strength = self._calculate_link_strength(
                        contact, contact_contact_name, contact_contact_relation, contact_contact_company)

                    contact_contact_node_id = self._add_entity_to_d3_json_graph(
                        contact_contact.deduced[P.FULL_NAME], contact_contact, FindReferrals.GROUP_REFERRALS)
                    self._add_link_to_d3_json_graph(
                        contact_node_id, contact_contact_node_id, relation=relation_phrase,
                        value=link_strength, link_type=FindReferrals.LINK_TYPE_MOVER_AND_SHAKER)

        self._write_d3_json_to_file('companies_map.json')
Пример #16
0
    def handle_row(self, mapped_row, raw_row, row_number):
        """Validate one imported contact row and submit it for enrichment.

        The row is dropped (returning False) when it is synthetic, has no
        usable key email, has an unusable name after cleaning, or belongs to
        one of the filtered categories. Otherwise a list of EnrichmentData
        items is built from the row and, unless ``self.test_import`` is set,
        sent to ``self.es.enrich_person`` keyed by the preferred email.

        Args:
            mapped_row: dict of normalized column values for the row.
            raw_row: the row as read from the file; used for the synthetic-row
                check and for logging.
            row_number: row index, used in log messages.

        Returns:
            True if the row was accepted, False if it was ignored.

        Raises:
            Exception: when ``mapped_row`` is None or its 'email1' is None.
        """
        super().handle_row(mapped_row, raw_row, row_number)

        # Is this row an artifact of some kind of App or other synthetic data?
        if self.should_ignore_row(raw_row):
            self.logger.warning("Synthetic row. Ignoring. (row: %s)", raw_row)
            return False

        # TODO: special optimizations for the wise-crackers...
        # (1) fname = "ploni almoni", lname = <empty>
        # (2) fname = <empty>, lname = "ploni almoni"
        # (3) fname = "ploni almoni", lname = "*****@*****.**"

        # Sanity check to make sure row is with values
        if mapped_row is None or mapped_row['email1'] is None:
            self.logger.error("Something went awry... email attribute is None. Aborting")
            raise Exception("Something went awry during parsing... email attribute is None. Aborting")

        ed = []

        # Collect the distinct non-blank emails and pick the key email among the valid ones
        emails = set()
        for i in (1, 2, 3):
            attr = 'email%d' % i
            if attr in mapped_row and mapped_row[attr].strip() != '':
                emails.add(mapped_row[attr].strip())
        verified_emails = [e for e in emails if EmailUtil.is_valid(e)]
        best_email = EmailUtil.get_preferred_email_from_list(verified_emails)
        if best_email is None:
            if len(emails) == 0:
                reason = "No emails detected"
            elif len(verified_emails) == 0:
                reason = "No verified emails - %s" % emails
            else:
                reason = "%s" % emails
            self.logger.warning("Row %d: %s %s: Cannot set key email. %s. Ignoring.", row_number, mapped_row['first_name'], mapped_row['last_name'], reason)
            return False
        key_email = EmailUtil.email_as_key(best_email)
        ed.append(EnrichmentData(P.EMAIL, key_email, 'override'))
        key = {P.EMAIL: key_email}

        # Clean the names
        fname = AcureRateUtils.remove_parenthesized_content(mapped_row['first_name'].strip())
        lname = AcureRateUtils.remove_parenthesized_content(mapped_row['last_name'].strip())
        if '?' in fname or '?' in lname:
            self.logger.warning("Row %d - found suspicious name - fname: %s, lname: %s ('?' in name). Replacing and continuing.", row_number, mapped_row['first_name'], mapped_row['last_name'])
            fname = fname.replace('?', '').strip()
            lname = lname.replace('?', '').strip()

        if not AcureRateUtils.valid_name(fname) or not AcureRateUtils.valid_name(lname):
            self.logger.warning("Row %d: non-alphanumeric characters in name - fname: %s, lname: %s. Continuing.", row_number, mapped_row['first_name'], mapped_row['last_name'])

        # A two-word name sitting in one field while the other is empty is
        # split into first and last name
        if len(fname.split()) == 2 and lname == '':
            fname, lname = fname.split()
        elif len(lname.split()) == 2 and fname == '':
            fname, lname = lname.split()

        # Names that make the row unusable
        error = None
        if fname == "" or lname == "":
            error = "Row %d: empty name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
        elif '&' in fname or '&' in lname:
            error = "Row %d: '&' in name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
        elif '@' in fname or '@' in lname:
            error = "Row %d: '@' in name after cleaning - fname: %s, lname: %s. Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
        elif len(fname) == 1 and len(lname) == 1:
            error = "Row %d: suspicious name - fname: %s, lname: %s (too short...) Ignoring." % (row_number, mapped_row['first_name'], mapped_row['last_name'])
        if error:
            self.logger.warning(error)
            return False

        # Names that are only suspicious: warn and carry on
        if len(fname) <= 2 or len(lname) <= 2:
            self.logger.warning("Row %d: suspicious name - fname: %s, lname: %s (too short...) Continuing.", row_number, mapped_row['first_name'], mapped_row['last_name'])
        if len(fname.split()) > 1 or len(lname.split()) > 1:
            self.logger.warning("Row %d: suspicious name - fname: %s, lname: %s (space in fname/lname) Continuing.", row_number, mapped_row['first_name'], mapped_row['last_name'])

        ed.append(EnrichmentData(P.FIRST_NAME, fname.title(), 'override'))
        ed.append(EnrichmentData(P.LAST_NAME, lname.title(), 'override'))

        if mapped_row['middle_name'].strip() != "":
            ed.append(EnrichmentData(P.MIDDLE_NAME, mapped_row['middle_name'].strip(), 'override'))
        if mapped_row['prefix'].strip() != "":
            ed.append(EnrichmentData(P.PREFIX, mapped_row['prefix'].strip(), 'override'))
        if mapped_row['suffix'].strip() != "":
            ed.append(EnrichmentData(P.SUFFIX, mapped_row['suffix'].strip(), 'override'))

        # Handle email attributes (even if some are invalid emails - may lead to needed info in later stages)
        for i in (1, 2, 3):
            attr = "email%d" % i
            if attr in mapped_row and mapped_row[attr].strip() != "":
                ed.append(EnrichmentData(P.EMAILS, mapped_row[attr].strip(), 'add'))

        # Handle phone attributes (phone1 .. phone11)
        for i in range(1, 12):
            attr = "phone%d" % i
            if attr in mapped_row and mapped_row[attr].strip() != "":
                ed.append(EnrichmentData(P.PHONES, mapped_row[attr].strip(), 'add'))

        # TODO: handle cases where more than one org/title is mentioned
        # Keep whichever of organization/title is non-blank. A blank string is
        # treated like a missing value, so a row with only one of the two
        # still yields a job entry.
        org = mapped_row.get('job_name1', None)
        title = mapped_row.get('job_title1', None)
        job = {}
        if org is not None and org.strip() != "":
            job['job_name'] = org
        if title is not None and title.strip() != "":
            job['job_title'] = title
        if job:
            ed.append(EnrichmentData(P.JOBS, job, 'add'))

        # Is there a DOB field? Normalize it through AcureRateUtils.normalized_dob
        if P.DOB in mapped_row and mapped_row[P.DOB] != '':
            ed.append(EnrichmentData(P.DOB, AcureRateUtils.normalized_dob(mapped_row[P.DOB]), 'add'))

        # Is there a direct manager marked?
        if P.DIRECT_MANAGER in mapped_row and mapped_row[P.DIRECT_MANAGER] != '':
            ed.append(EnrichmentData(P.DIRECT_MANAGER, mapped_row[P.DIRECT_MANAGER], 'add'))

        if P.WEBSITE in mapped_row and mapped_row[P.WEBSITE] != '':
            ed.append(EnrichmentData(P.WEBSITE, mapped_row[P.WEBSITE], 'add'))

        # Anything worthwhile in the notes? (only flagged in the log for now)
        if 'notes' in mapped_row and mapped_row['notes'].strip() != '':
            self.logger.info('Row %d: contact %s %s got notes - anything interesting here?', row_number, fname, lname)

        # Filter out contacts filed under irrelevant categories
        if 'categories' in mapped_row and mapped_row['categories'].strip() != '':
            categories = mapped_row['categories'].lower().split(";")
            # TODO: temp code below. Remove.
            if 'friends' in categories or 'friend' in categories or 'sayarut' in categories or 'hofen' in categories:
                self.logger.warning('Contact %s %s filtered out because of irrelevant category (categories: %s)', fname,
                                    lname, mapped_row['categories'])
                return False

        # Mark the source these contacts came from:
        ed.append(EnrichmentData(P.ATTRIBUTION_ID, self.attribution_id, 'override'))
        ed.append(EnrichmentData(P.ATTRIBUTION_NAME, self.attribution_name, 'override'))

        if not self.test_import:
            self.logger.info('Row %d: key_email: %s. Sending to enrichment...', row_number, key_email)
            source = EnrichmentSource(source_type=self.source, source_key='%s %s' % (self.attribution_id, best_email.lower()))
            behavior = EnrichmentBehavior(create_new=True, providers=self.providers)
            self.es.enrich_person(enrichment_key=key, enrichment_data=ed, enrichment_source=source, enrichment_behavior=behavior)
        return True
Пример #17
0
    def enrich_person(self):
        """
        Scrape a person's CrunchBase profile page with Selenium and store the findings.

        ``self.enrich_key`` is used as the CrunchBase permalink. The name heading is
        mandatory; every other section (primary role, photo, date of birth, gender,
        location, website, social links, description, jobs, advisory roles, personal
        investments, education) is best-effort and is skipped when it cannot be read.
        The browser is always shut down, whether the crawl succeeds or fails.

        :return: ``[P.FULL_NAME]``
        :raises EngagementException: when the name heading cannot be located or parsed.
        """
        permalink = self.enrich_key
        url = 'https://www.crunchbase.com/person/%s#/entity' % permalink

        driver = webdriver.Chrome(r'C:\Python353\browser_drivers\chromedriver')

        def detail_value(label):
            # Locate the <dt> whose text equals `label` and return the element right after it.
            heading = driver.find_element_by_xpath('//dt[text()="%s"]' % label)
            return heading.find_element_by_xpath("following-sibling::*[1]")

        try:
            driver.implicitly_wait(11)  # seconds

            # Activate the driver
            driver.get(url)

            # If we got to here, keep the permalink and URL
            self.set_data(P.CB_PERMALINK, permalink)
            self.set_data(P.CRUNCHBASE_URL, url)

            # Get person name - the only mandatory piece; without it the crawl is dumped
            try:
                full_name = driver.find_element_by_id('profile_header_heading').text
                first, middle, last = AcureRateUtils.tokenize_full_name(full_name)
                self.set_data(P.FIRST_NAME, first)
                self.set_data(P.LAST_NAME, last)
                if middle:
                    self.set_data(P.MIDDLE_NAME, middle)
                # The page is loaded by now, so optional lookups below can give up faster
                driver.implicitly_wait(2)  # seconds
            except Exception as err:
                s = "Failed to enrich %s. Unable to locate name entity in page - %s - something went awry... dumping this crawl." % (permalink, url)
                raise EngagementException(s) from err

            # Get primary role
            try:
                role_str = detail_value("Primary Role").text
                self.set_data(P.PRIMARY_ROLE, role_str.replace('\n', ' '))
            except Exception:
                pass

            # Get photo (the site's default placeholder image is ignored)
            try:
                content = driver.find_element_by_class_name('logo-links-container')
                photo_url = content.find_element_by_css_selector("div > img").get_attribute("src")
                if "cb-default" not in photo_url:
                    self.add_data(P.PHOTOS, {P.PHOTO_URL: photo_url, P.PHOTO_SOURCE: 'crunchbase'})
            except Exception:
                pass

            # Get dob
            try:
                self.set_data(P.DOB, detail_value("Born:").text)
            except Exception:
                pass

            # Get gender
            try:
                self.add_data(P.GENDER, detail_value("Gender:").text)
            except Exception:
                pass

            # Get location
            try:
                location = detail_value("Location:").text
                if location != "Unknown":
                    self.add_data(P.LOCATIONS, location)
            except Exception:
                pass

            # Get web-site
            try:
                website_url = detail_value("Website:").find_element_by_css_selector('a').get_attribute("href")
                self.set_data(P.WEBSITE, website_url)
            except Exception:
                pass

            # Get socials
            try:
                social_links_elems = detail_value("Social: ").find_elements_by_tag_name('a')
                for link_elem in social_links_elems:
                    social_type = link_elem.get_attribute('data-icons')  # "facebook", "twitter", "linkedin", etc.
                    social_link = link_elem.get_attribute('href')
                    if social_type == 'facebook':
                        self.set_data(P.FACEBOOK_URL, social_link)
                    elif social_type == 'twitter':
                        self.set_data(P.TWITTER_URL, social_link)
                    elif social_type == 'linkedin':
                        self.set_data(P.LINKEDIN_URL, social_link)
            except Exception as e:
                print(e)

            # Get person details (description)
            try:
                person_details_elem = driver.find_element_by_id('description')
                self.set_data(P.DESCRIPTION, person_details_elem.text)
            except Exception as e:
                print(e)

            # Get current jobs
            try:
                for row in driver.find_elements_by_css_selector(".experiences .current_job"):
                    title = row.find_element_by_tag_name('h4').text
                    company = row.find_element_by_tag_name('h5').text
                    current_job = {P.JOB_CURRENT: True, P.JOB_TITLE: title, P.JOB_NAME: company}
                    self.add_data(P.JOBS, current_job)
            except Exception as e:
                print(e)

            # Get past jobs
            try:
                past_job_section = driver.find_element_by_css_selector(".experiences .past_job")
                # First and last rows are skipped - presumably header/footer rows (TODO confirm)
                for row in past_job_section.find_elements_by_css_selector(".info-row")[1:-1]:
                    cols = row.find_elements_by_css_selector(".cell")
                    past_job = {
                        P.JOB_STARTED: cols[0].text,
                        P.JOB_ENDED: cols[1].text,
                        P.JOB_TITLE: cols[2].text,
                        P.JOB_NAME: cols[3].text,
                    }
                    self.add_data(P.JOBS, past_job)
            except Exception as e:
                print(e)

            # Get advisory roles
            try:
                advisory_roles_section = driver.find_element_by_css_selector(".advisory_roles")
                for row in advisory_roles_section.find_elements_by_css_selector("li .info-block"):
                    company = row.find_element_by_tag_name('h4').text
                    role_started = row.find_elements_by_css_selector('h5')
                    role = role_started[0].text
                    started = role_started[1].text
                    advisory_job = {P.JOB_TITLE: role, P.JOB_NAME: company}
                    if started.strip() != '':
                        advisory_job[P.JOB_STARTED] = started
                    self.add_data(P.ADVISORY_JOBS, advisory_job)
            except Exception as e:
                print(e)

            # Get investments
            try:
                investments = []
                investors_tables = driver.find_elements_by_css_selector(".table.investors")
                if len(investors_tables) > 0:
                    investors_rows_elements = investors_tables[0].find_elements_by_tag_name("tr")
                    for investor_element in investors_rows_elements[1:]:  # we're skipping the header line
                        # Cheap text check first; cells are fetched only for candidate rows,
                        # and only rows whose 4th cell says 'Personal Investment' are kept
                        if 'personal investment' in investor_element.text.lower():
                            cols = investor_element.find_elements_by_tag_name('td')
                            if cols[3].text == 'Personal Investment':
                                investments.append((cols[0].text, cols[1].text, cols[2].text))
                    self.set_data(P.INVESTMENTS, investments)
            except Exception as e:
                print(e)

            # Get education
            try:
                content = driver.find_element_by_class_name('education')
                education_elements = content.find_elements_by_css_selector("li > div")
                for elem in education_elements:
                    # A fresh dict per row, so records neither alias each other nor
                    # inherit fields from the previous row
                    ed = {}
                    institute_name = elem.find_element_by_css_selector('h4 > a').text
                    if institute_name != '':
                        ed[P.EDUCATION_INSTITUTE] = institute_name
                    degree = elem.find_element_by_css_selector('h5').text
                    if degree != '':
                        ed[P.EDUCATION_DEGREE] = degree
                    # Whatever text remains after removing institute and degree is taken as the years
                    years = elem.text.replace(institute_name, '').replace(degree, '').strip()
                    if years != '':
                        ed[P.EDUCATION_YEARS] = years
                    self.add_data(P.EDUCATIONS, ed)
            except Exception:
                pass
        finally:
            # Always release the browser, including when the crawl is aborted by an exception
            driver.quit()

        return [P.FULL_NAME]
Пример #18
0
    def get_relations(self, filter=None):
        """
        Collect the graph relations implied by this entity's deduced data.

        Every relation is a 4-tuple:
        (source aid, relation label, target aid, relation properties string).

        :param filter: optional relation label; when given, only relations whose label
                       equals it (case-insensitively) are kept.
        :return: a set of relation tuples, or a list of them when a filter was given.
        """
        from store.store import Store

        deduced = self.deduced
        found = set()

        # C:C - ACQUIRED_BY: this company -> the company that acquired it (looked up by alias)
        if C.ACQUIRED_BY in deduced:
            buyer_alias = deduced[C.ACQUIRED_BY].lower()
            buyer = Store.get_company({C.ALIASES: buyer_alias})
            if buyer:
                found.add((self.aid, G.RELATION_LABEL_ACQUIRED_BY, buyer.aid, ''))

        # C:C - INVESTS_IN: only venture-capital organizations point at their portfolio companies
        is_vc = (C.ORGANIZATION_TYPE in deduced
                 and deduced[C.ORGANIZATION_TYPE] == C.ORGANIZATION_TYPE_VENTURE_CAPITAL)
        if is_vc:
            for portfolio_name in deduced.get(C.PORTFOLIO_COMPANIES, []):
                portfolio = Store.get_company({C.ALIASES: portfolio_name.lower()})
                if portfolio:
                    found.add((self.aid, G.RELATION_LABEL_INVESTS_IN, portfolio.aid, ''))

        # P:C - people listed on the company (team, advisors, founders) -> this company
        people_relations = (
            (C.TEAM, G.RELATION_LABEL_EMPLOYEE_OF),
            (C.ADVISORS, G.RELATION_LABEL_ADVISOR_AT),
            (C.FOUNDERS, G.RELATION_LABEL_FOUNDER_OF),
        )
        for list_key, label in people_relations:
            for full_name in deduced.get(list_key, []):
                person = Store.get_person({P.FULL_NAME: full_name})
                if person:
                    found.add((person.aid, label, self.aid, ''))

        # P:C / C:C - investors (people or organizations) -> this company
        for investor_name, investor_type, investment_info in deduced.get(C.INVESTORS, []):

            # Describe the investment (round, lead) as relation properties
            props = []
            investment_round = AcureRateUtils.get_investment_round(investment_info)
            if investment_round:
                props.append("investment_type: '%s'" % investment_round)
            if AcureRateUtils.is_investment_lead(investment_info):  # TODO: should be label and not property
                props.append("investment_lead: True")
            props_str = ', '.join(props)

            investor = None
            if investor_type == 'person':
                investor = Store.get_person({'deduced.' + P.FULL_NAME: investor_name})
            elif investor_type == 'organization':
                investor = Store.get_company({C.NAME: investor_name})
            if investor:
                found.add((investor.aid, G.RELATION_LABEL_INVESTS_IN, self.aid, props_str))

        # Without a filter the set itself is handed back
        if not filter:
            return found

        # With a filter, keep only the relations carrying the requested label
        return [rel for rel in found if rel[1].lower() == filter.lower()]
Пример #19
0
    def digest(self):
        """
        Rebuild ``self.deduced`` from scratch out of all sources and report changes.

        :return: True when there was no previous deduced data, or when the rebuilt data
                 differs from it; False when nothing was added, removed or modified.
        """
        # Snapshot the current data so it can be compared with the rebuilt one at the end
        previous = copy.deepcopy(self.deduced) if self.deduced else None

        # Digesting always starts from a *clean* 'deduced'
        me = self.deduced = {}

        self._digest_name()

        # Go over data of all providers
        for ds in self.sources():

            # Collect related investors from providers
            for investor in ds.get(C.RELATED_INVESTORS, []) if C.RELATED_INVESTORS in ds else []:
                self._append_to_deduced(C.RELATED_INVESTORS, investor)

            # TODO: revisit this code. We currently have only one provider, so we just copy the attributes values
            copied_attrs = (
                "company_type", "crunchbase_url", "domain", "homepage_url",
                "stock_symbol", "short_description", "image_url",
                "facebook_url", "twitter_url", "linkedin_url", C.ADVISORS,
                C.FOUNDERS, C.CATEGORIES, C.TEAM, C.FOUNDING_YEAR, C.WEBSITE,
                C.CRUNCHBASE_PERMALINK, C.BLOOMBERG_URL,
            )
            for attr in copied_attrs:
                if attr in ds:
                    me[attr] = ds[attr]

        # Remaining digest steps, run strictly in this order
        steps = (
            self._digest_logos,  # select the company logo
            self._digest_domain,
            self._digest_email_domains,
            self._digest_phones,
            self._digest_investors,
            self._digest_related_investors,  # related people - check if they are investors
            self._digest_portfolio_companies,
            self._digest_related_vcs,
            self._digest_aliases,
            self._digest_employees_range,
            self._digest_exits,
            self._digest_organization_type,
            self._digest_email_convention,
        )
        for step in steps:
            step()

        # Nothing to compare against - report as changed
        if previous is None:
            return True

        # Changed if the comparison reports anything added, removed or modified
        added, removed, modified, _same = AcureRateUtils.dict_compare(
            self.deduced, previous)
        unchanged = len(added) == 0 and len(removed) == 0 and len(modified) == 0
        return not unchanged
Пример #20
0
    def enrich_person(self):
        """
        Map the person record returned by ``self._get_person_info()`` onto enrichment data.

        Stores the likelihood score, names, gender, general location, photos, jobs and
        social-profile URLs that are present in the record.

        :return: ``[P.JOBS]``
        """
        info = self._get_person_info()

        self.set_data("score", info['likelihood'])

        # Names
        contact = info.get('contactInfo', None)
        if contact:
            for source_key, target_key in (('givenName', P.FIRST_NAME),
                                           ('familyName', P.LAST_NAME)):
                if source_key in contact:
                    self.set_data(target_key, contact[source_key])

        # Gender and general location
        demographics = info.get('demographics', None)
        if demographics:
            gender = demographics.get('gender', None)
            if gender:
                self.add_data(P.GENDER, gender.lower())
            general_location = demographics.get('locationGeneral', None)
            if general_location:
                self.add_data(P.LOCATIONS, general_location)

        # Photos
        for photo in info.get('photos', None) or []:
            converted_photo = {}
            AcureRateUtils.dict2dict(
                photo, converted_photo,
                {"url": P.PHOTO_URL, "typeName": P.PHOTO_SOURCE})
            self.add_data(P.PHOTOS, converted_photo)

        # Jobs
        for org in info.get('organizations', None) or []:
            job = {}
            job_mapping = {
                "name": P.JOB_NAME,
                "title": P.JOB_TITLE,
                "current": P.JOB_CURRENT,
                "isPrimary": P.JOB_PRIMARY
            }
            AcureRateUtils.dict2dict(org, job, job_mapping)
            # Start/end dates keep only their first four characters (the year - the month is dropped)
            for date_key, job_key in (('startDate', P.JOB_STARTED),
                                      ('endDate', P.JOB_ENDED)):
                if date_key in org:
                    job[job_key] = org[date_key][0:4]
            self.add_data(P.JOBS, job)

        # Social profiles: the profile's type name selects the attribute its URL is stored under
        social_profiles = info.get('socialProfiles', None)
        if social_profiles:
            attr_by_type = {
                'Twitter': P.TWITTER_URL,
                'LinkedIn': P.LINKEDIN_URL,
                'GooglePlus': P.GOOGLEPLUS_URL,
                'Facebook': P.FACEBOOK_URL,
                'Gravatar': P.GRAVATAR_URL,
                'Foursquare': P.FOURSQUARE_URL,
                'Pinterest': P.PINTEREST_URL,
                'Klout': P.KLOUT_URL,
                'AngelList': P.ANGELLIST_URL,
            }
            for profile in social_profiles:
                type_name = profile.get('typeName', '')
                if type_name in attr_by_type:
                    self.set_data(attr_by_type[type_name], profile['url'])
                else:
                    print('Something else...')

        # TODO: add all other attributes received from FullContact

        return [P.JOBS]
    def _company_exists(company_name, cb_url=None, permalink=None):
        """
        Search the CrunchBase (Algolia) index for a company and return the single match.

        A hit matches when its permalink equals ``permalink`` (case-insensitively), when
        ``cb_url`` ends with the hit's url, or when its cleaned name equals the cleaned
        ``company_name`` (case-insensitively). A permalink or url match stops the scan.

        NOTE(review): the function takes no ``self`` - presumably it is decorated as a
        static method outside this view; confirm.

        :param company_name: name of the company to look for.
        :param cb_url: optional CrunchBase url of the company; a query string is ignored.
        :param permalink: optional CrunchBase permalink of the company.
        :return: list holding exactly one matching hit (dict).
        :raises EngagementException: on a non-200 response (fatal), when the search
            returns no hits, when no hit matches, or when more than one hit matches.
        """
        # Truncate possible parameters on url
        if cb_url and cb_url.find('?') > 0:
            cb_url = cb_url[:cb_url.index('?')]

        company_name_clean = AcureRateUtils.clean_company_name(company_name)

        url = 'https://a0ef2haqr0-3.algolia.io/1/indexes/main_production/query'
        # '&' would otherwise terminate the query parameter
        query = 'query=%s&facetFilters=' % company_name_clean.replace('&', '%26')
        payload = {
            "params": query,
            "apiKey": CrunchBaseScraperEngager.THE_KEY,
            "appID": CrunchBaseScraperEngager.APP_ID
        }
        headers = {
            'contentType': 'application/json; charset=utf-8',
            'X-Algolia-API-Key': CrunchBaseScraperEngager.THE_KEY,
            'X-Algolia-Application-Id': CrunchBaseScraperEngager.APP_ID
        }
        # The search must hit the server, never a cached response
        with requests_cache.disabled():
            response = requests.post(url, json=payload, headers=headers)
        # @@@ fatal
        if response.status_code == 429:
            raise EngagementException(
                "%s. Exceeded requests quota. Error: %s." %
                (response.status_code, response.text),
                fatal=True)
        if response.status_code != 200:
            raise EngagementException(
                "%s. %s." % (response.status_code, response.text),
                fatal=True)

        # Parse the body once and reuse it
        result = response.json()
        if result['nbHits'] == 0:
            raise EngagementException(
                "CrunchBaseScraper: No hits returned when searching for %s (%s)."
                % (company_name_clean, company_name))

        # Compare both sides of the permalink check in lower case
        permalink_lower = permalink.lower() if permalink else None
        company_name_lower = company_name_clean.lower()

        # Check how many matches we have (if any)
        matches = []
        for company in result.get('hits', []):
            is_organization = (company.get('type', '') == 'Organization'
                               and company.get('organization', False)
                               and 'name' in company)
            if not is_organization:
                continue
            # Compare permalinks
            if ('permalink' in company and permalink_lower
                    and company['permalink'].lower() == permalink_lower):
                matches.append(company)
                break
            # Compare URLs
            if 'url' in company and cb_url and cb_url.endswith(company['url']):
                matches.append(company)
                break
            # Check by name
            result_company_name_clean = AcureRateUtils.clean_company_name(
                company.get('name'))
            if result_company_name_clean.lower() == company_name_lower:
                matches.append(company)

        if len(matches) == 0:
            raise EngagementException(
                "CrunchBaseScraper: No match for %s (%s)" %
                (company_name_clean, company_name))
        if len(matches) > 1:
            raise EngagementException(
                "CrunchBaseScraper: Ambiguous results - got %d hits for %s (%s)"
                % (len(matches), company_name_clean, company_name))
        return matches
Пример #22
0
 def _get_person(self, full_name):
     """Look up a stored person by the first and last name tokenized out of ``full_name``.

     :return: the result of ``AcureRatePerson().reconstruct`` on the record found,
              or None when the lookup returned nothing.
     """
     first, _middle, last = AcureRateUtils.tokenize_full_name(full_name)
     query = {"deduced.first_name": first, "deduced.last_name": last}
     record = DBWrapper.get_persons(query, True)
     if not record:
         return None
     return AcureRatePerson().reconstruct(record)
Пример #23
0
    def enrich_person(self):
        """
        Enrich a person from their angel.co profile page.

        When the entity has no AngelList URL yet, Google is searched (restricted to
        angel.co) for ``self.enrich_key`` and exactly one angel.co result is required;
        that result's URL and the person id (its last path segment) are stored. The
        profile page is then fetched and parsed for name, social links, jobs,
        investments and references. Each of those sections is best-effort: a failure
        only logs a warning.

        :return: ``[P.FULL_NAME]``
        :raises EngagementException: when zero or more than one angel.co result is
            found, or when the profile page does not answer with HTTP 200.
        """
        try:
            if P.ANGELLIST_URL not in self.enriched_entity.deduced:
                # Search google for the person, restricted to angel.co
                url_prefix_1 = 'http://angel.co/'
                query = 'site:angel.co "%s"' % self.enrich_key
                res = search(query,
                             tld='com',
                             lang='en',
                             num=3,
                             start=0,
                             stop=2,
                             pause=2.0)
                # Keep the matching URLs themselves (not just a count), so the URL used
                # below is the one that matched and not merely the last result iterated
                matching_urls = [
                    candidate for candidate in res
                    if candidate.lower().replace('https', 'http').startswith(url_prefix_1)
                ]
                if not matching_urls:
                    raise EngagementException(
                        'Unable to locate information in angel.co on %s' %
                        self.enrich_key)
                if len(matching_urls) > 1:
                    # TODO: we can improve search that will also consult working places and determine which person is the one we need... (try: Ariel Cohen)
                    raise EngagementException(
                        'Unable to locate information in angel.co - more than one match on %s'
                        % self.enrich_key)
                url = matching_urls[0]

                # Grab person id from url: the last path segment, ignoring any query
                # string and trailing slash
                person_id = url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
                self.set_data(P.ANGELLIST_ID, person_id)
                # TODO: look into the full url google returns - what is capId?
                self.set_data(P.ANGELLIST_URL, url)
            else:
                url = self.enriched_entity.deduced[P.ANGELLIST_URL]

            # -----------------
            # CHECK: https://angel.co/alberto-roman
            # -----------------

            headers = requests.utils.default_headers()
            headers.update({
                'User-Agent':
                'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36'
            })

            # Get the person's page for parsing
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                s = 'Unable to load page in Angel.co on %s. Error: %s. (url=%s)' % (
                    self.enrich_key, response.status_code, url)
                raise EngagementException(s)

            soup = BeautifulSoup(response.content, 'html.parser')

            # Get name
            try:
                elem = soup.find("h1", {"itemprop": "name"})
                if elem:
                    self.set_data(P.FULL_NAME, elem.text.strip())
            except Exception:
                self.logger.warning('Unable to locate name attribute for %s',
                                    self.enrich_key)

            # Get photo

            # Studied at...

            # Get socials: each link is an <a> tagged with a data-field attribute
            social_fields = (
                ("linkedin_url", P.LINKEDIN_URL),
                ("twitter_url", P.TWITTER_URL),
                ("facebook_url", P.FACEBOOK_URL),
                ("blog_url", P.BLOG_URL),
            )
            for data_field, target_key in social_fields:
                try:
                    elem = soup.find("a", {"data-field": data_field})
                    if elem:
                        self.set_data(target_key, elem['href'])
                except Exception:
                    self.logger.warning('Unable to locate social attribute for %s',
                                        self.enrich_key)

            # Get experience
            try:
                experience_elem = soup.find("div",
                                            {"class": "experience_container"})
                startup_roles = experience_elem.findAll(
                    "div", {"class": "startup_roles"})
                for review in startup_roles:
                    current_job = {}

                    # Get details of job. The job photo and the startup's link were
                    # looked up but never used, and a missing one aborted job parsing,
                    # so they are no longer read here.
                    startup_text_elem = review.find("div", {"class": "text"})
                    startup_elem = startup_text_elem.find(
                        "a", {"data-type": "Startup"})
                    current_job[P.JOB_NAME] = startup_elem.text.strip()

                    # Get other details: first span is the title, second the years
                    more_details_elems = startup_text_elem.findAll("span")
                    if len(more_details_elems) > 0:
                        current_job[
                            P.JOB_TITLE] = more_details_elems[0].text.strip()
                    if len(more_details_elems) > 1:
                        role_years = more_details_elems[1].text.strip()
                        started, ended, current = AcureRateUtils.parse_date_range(role_years)
                        if started:
                            current_job[P.JOB_STARTED] = started
                        if ended:
                            current_job[P.JOB_ENDED] = ended
                        if current:
                            current_job[P.JOB_CURRENT] = current

                    self.add_data(P.JOBS, current_job)
            except Exception:
                self.logger.warning(
                    'Unable to locate job title/name attribute for %s',
                    self.enrich_key)

            # Get education records
            # NOTE(review): school/degree are parsed but not stored anywhere yet - TODO
            # decide under which attribute they should be added.
            try:
                education_elem = soup.find("div", {"class": "education"})
                education_orgs = education_elem.findAll(
                    "div", {"class": "college-row-view"})
                for review in education_orgs:
                    school = review.find("div", {
                        "class": "school"
                    }).text.strip()
                    degree = review.find("div", {
                        "class": "degree"
                    }).text.strip()

            except Exception:
                self.logger.warning(
                    'Unable to locate education attribute for %s',
                    self.enrich_key)

            # Get investments
            try:
                investments_list_elem = soup.find("div",
                                                  {"class": "investment_list"})
                investments = investments_list_elem.findAll(
                    "div", {"class": "investment"})
                for investment in investments:
                    company_name = investment.find("div", {
                        "class": "company-link"
                    }).text.strip()
                    self.add_data(P.INVESTMENTS, company_name)

            except Exception:
                self.logger.warning(
                    'Unable to locate investments attribute for %s',
                    self.enrich_key)

            # Get references/reviews
            try:
                reviews_section_elem = soup.find("div", {"class": "reviews"})
                reviews_elem = reviews_section_elem.findAll(
                    "li", {"class": "review"})
                for review in reviews_elem:
                    reference = {}
                    reference[P.REFERER_REVIEW] = review.find(
                        "div", {
                            "class": "review-content"
                        }).text.strip()
                    referencing_person_elem = review.find(
                        "div", {
                            "class": "annotation"
                        }).find("a", {"class": "profile-link"})
                    reference[
                        P.REFERER_NAME] = referencing_person_elem.text.strip()
                    reference[
                        P.REFERER_ANGELLIST_URL] = referencing_person_elem[
                            'href']
                    self.add_data(P.REFERENCES, reference)
            except Exception:
                self.logger.warning(
                    'Unable to locate references attribute for %s',
                    self.enrich_key)

            # Get business locations
            # TODO..

            # Get business markets
            # TODO..

        except Exception as e:
            self.logger.error('Unable to enrich person %s. %s',
                              self.enriched_entity, e)
            raise
        return [P.FULL_NAME]