Example #1
 def test(self, company_name):
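     # RQ worker callback: when every job tagged with this queue_name has
     # completed, enqueue the Jigsaw CSV upload for the company.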
     job = rq.get_current_job()
     print job.meta.keys()
     if "queue_name" in job.meta.keys():
         print RQueue()._has_completed(job.meta["queue_name"])
         print RQueue()._has_completed("queue_name")
         if RQueue()._has_completed(job.meta["queue_name"]):
             q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
Example #2
    def search_sources(self, domain, api_key, name=""):
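        # Try a known email pattern from Toofr first; otherwise fan out one
        # crawl job per source and tag them all with a shared
        # "<domain>_<api_key>" queue key so completion can be tracked later.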
        pattern = Toofr().get(domain)
        if pattern:
            ptn = {
                "domain": domain,
                "company_email_pattern": [{
                    "source": "toofr",
                    "pattern": pattern
                }]
            }
            self._find_if_object_exists('EmailPattern', 'domain', domain, ptn)
            Webhook()._update_company_email_pattern(ptn)
            return pattern

        # synchronous jigsaw search
        # job_5 = q.enqueue(Sources()._jigsaw_search, domain)
        job_1 = q.enqueue(Sources()._whois_search, domain)
        job_2 = q.enqueue(Sources()._google_span_search, domain)
        job_3 = q.enqueue(Sources()._press_search, domain, api_key)
        job_4 = q.enqueue(Sources()._zoominfo_search, domain)
        jobs = [job_1, job_2, job_3, job_4]
        if name != "":
            job_5 = q.enqueue(Sources()._mx_server_check, name, domain)
            job_6 = q.enqueue(Sources()._linkedin_login_search, name, domain)
            jobs = jobs + [job_5, job_6]

        for job in jobs:
            RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
Example #3
    def _email_pattern(self, domain, api_key=""):
        ''' Score email pattern based on number of occurrences '''
        qry = {'where': json.dumps({'domain': domain}), 'limit': 1000}
        crawls = Parse().get('CompanyEmailPatternCrawl', qry)
        crawls = pd.DataFrame(crawls.json()['results'])

        df = crawls[crawls.pattern.notnull()].drop_duplicates('email')
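        # mx_check rows are handled separately: they are de-duplicated by
        # pattern and dropped altogether once they yield more than two
        # distinct patterns.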
        _df = df[df.crawl_source != "mx_check"]
        df = df[df.crawl_source == "mx_check"].drop_duplicates('pattern')
        if len(df.pattern) > 2: df = df[df.crawl_source != "mx_check"]
        df = _df.append(df)
        df = df.pattern.value_counts()

        score = pd.DataFrame()
        score['pattern'], score['freq'] = df.index, df.values
        score['score'] = [freq / float(score.freq.sum()) for freq in score['freq']]
        score['source'], score['tried'] = 'clearspark', False
        score = score.fillna("")
        score = score.to_dict('records')
        #print score, api_key
        print "SCORE"
        print score
        score = {'domain':domain, 'company_email_pattern':score}
        self._find_if_object_exists('EmailPattern','domain', domain, score)

        # TODO - add date crawled to score
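        # Once every crawl job sharing this "<domain>_<api_key>" queue key has
        # finished, push the scored patterns to the webhook (falling back to a
        # random guess when no pattern was found).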
        if RQueue()._has_completed("{0}_{1}".format(domain, api_key)):
            if score['company_email_pattern'] == []:
                score['email_guess'] = EmailGuess()._random()
                #q.enqueue(Sources()._jigsaw_search, domain)
            Webhook()._update_company_email_pattern(score)
Example #4
    def employee_webhook(self,
                         company_name,
                         company_list,
                         qry="",
                         limit=5,
                         list_id="",
                         _report=""):
        _user, _company = company_list['user'], company_list['company']
        employees = Companies()._employees(company_name, qry)
        company = Companies()._get_info(company_name)
        _company_list = company_list['objectId']
        for index, row in employees.iterrows():
            data = row.to_dict()
            company['user'], company['company'] = _user, _company
            prospect = company
            prospect['name'], prospect['pos'] = row['name'], row['title']
            prospect['city'] = row['locale']
            prospect['linkedin_url'] = row['linkedin_url']
            prospect['lists'] = [Parse()._pointer('ProspectList', list_id)]
            if type(company['industry']) is list:
                company['industry'] = company['industry'][0]
            prospect['company_profile'] = company_list['profile']
            r = Prospecter().create('Prospect', prospect)
            print "prospect_create_result", r.json()

        if RQueue()._has_completed("{0}_{1}".format(_company_list, list_id)):
            data = {'done': arrow.now().timestamp}
            r = Prospecter().update("SignalReport/" + _report, data)
            print "employee_webhook_has_completed -->", r.json()
Example #5
    def _press_search(self, domain, api_key):
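        # Google-search prnewswire.com and businesswire.com for the domain and
        # enqueue an email-extraction job for every press-release link found.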
        pw = Google().search('"{0}" site:prnewswire.com'.format(domain))
        bw = Google().search('"{0}" site:businesswire.com'.format(domain))
        #job_queue_lol = objectId+str(arrow.now().timestamp)
        print bw, pw
        pw = pw if not pw.empty else pd.DataFrame(columns=["link"])
        bw = bw if not bw.empty else pd.DataFrame(columns=["link"])
        queue = "press-check-" + domain
        for link in pw.link:
            job = q.enqueue(PRNewsWire()._email, domain, link, timeout=3600)
            RQueue()._meta(job, "{0}_{1}".format(domain, api_key))

        for link in bw.link:
            job = q.enqueue(BusinessWire()._email, domain, link, timeout=3600)
            RQueue()._meta(job, "{0}_{1}".format(domain, api_key))
Example #6
 def _domain_research(self, domain, api_key="", name="", prospect_name=""):
     # Primary Research
     if name == "": name = domain
     x = 6000
     j1 = q.enqueue(Zoominfo()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j2 = q.enqueue(Linkedin()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j3 = q.enqueue(YellowPages()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j4 = q.enqueue(Yelp()._domain_search, domain, api_key, name, timeout=x)
     j5 = q.enqueue(Forbes()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j6 = q.enqueue(GlassDoor()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j7 = q.enqueue(Hoovers()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j8 = q.enqueue(Crunchbase()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j9 = q.enqueue(Facebook()._domain_search,
                    domain,
                    api_key,
                    name,
                    timeout=x)
     j10 = q.enqueue(Twitter()._domain_search,
                     domain,
                     api_key,
                     name,
                     timeout=x)
     j11 = q.enqueue(Indeed()._domain_search,
                     domain,
                     api_key,
                     name,
                     timeout=x)
     jobs = [j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11]
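     # Tag every job with a shared "<name>_<api_key>" queue key (plus the
     # prospect name) so RQueue can tell when the whole batch has finished.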
     for job in jobs:
         RQueue()._meta(job, "{0}_{1}".format(name, api_key), prospect_name)
Example #7
    def parse(self, url, company_name):
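        # Parse a cached Google result page of LinkedIn profile blocks into a
        # DataFrame, keep rows whose company fuzzy-matches company_name, and
        # persist them as an "employees" crawl.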
        cache = Google().cache(url)
        soup = BeautifulSoup(cache)
        p = []

        for i in soup.find_all("div", {"class": "entityblock"}):
            try:
                img = i.find("img")["data-delayed-url"]
            except:
                img = i.find("img")["src"]
            profile = i.find("a")["href"]
            name = i.find("h3", {"class": "name"})
            name = name.text if name else ""
            title = i.find("p", {"class": "headline"})
            title = title.text if title else ""
            company = title.split(" at ")[-1]
            title = title.split(" at ")[0]
            city = i.find("dd")
            city = city.text if city else ""
            cols = ["img", "profile", "name", "title", "city", "company"]
            vals = [img, profile, name, title, city, company]
            print vals
            p.append(dict(zip(cols, vals)))
        print p
        results = pd.DataFrame(p)
        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(company_name, company)
                for company in results.company
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(company_name, company)
                for company in results.company
            ]
        results = results[(results.company_score > 64)]
        data = {'data': results.to_dict("r"), 'company_name': company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        job = rq.get_current_job()
        print job.meta.keys()
        if "queue_name" in job.meta.keys():
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])

        return p
Example #8
    def _employees(self, domain, api_key="", company_name="", keyword=""):
        ''' Linkedin Scrape '''
        # TODO - add linkedin directory search
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args + ' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        results = Google().search(qry, 10)
        if results.empty:
            if domain == "":
                ''' return results '''
            else:
                results = Google().search(qry.format(domain, args, keyword))
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        if " " in company_name:
            results['company_score'] = [
                fuzz.partial_ratio(_name, company)
                for company in results.company
            ]
        else:
            results['company_score'] = [
                fuzz.ratio(_name, company) for company in results.company
            ]
        if keyword != "":
            results['score'] = [
                fuzz.ratio(keyword, title) for title in results.title
            ]
            results = results[results.score > 75]

        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        data = {'data': results.to_dict('r'), 'company_name': company_name}
        data["domain"] = domain
        CompanyExtraInfoCrawl()._persist(data, "employees", api_key)

        job = rq.get_current_job()
        if "queue_name" in job.meta.keys():
            if RQueue()._has_completed(job.meta["queue_name"]):
                q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
Example #9
    def _daily_secondary_research(self, name, domain, api_key=""):
        # Secondary Research - sometimes requires a location or domain
        if name == "": name = domain
        x = 6000
        j0 = q.enqueue(Companies()._company_blog,
                       domain,
                       api_key,
                       name,
                       timeout=x)
        j2 = q.enqueue(GlassDoor()._reviews, domain, api_key, name, timeout=x)
        j3 = q.enqueue(Companies()._press_releases,
                       domain,
                       api_key,
                       name,
                       timeout=x)
        j4 = q.enqueue(Companies()._news, domain, api_key, name, timeout=x)
        j5 = q.enqueue(Companies()._hiring, domain, api_key, name, timeout=x)
        j6 = q.enqueue(Twitter()._daily_news, domain, api_key, name, timeout=x)
        j7 = q.enqueue(Facebook()._daily_news,
                       domain,
                       api_key,
                       name,
                       timeout=x)
        j8 = q.enqueue(Linkedin()._daily_news,
                       domain,
                       api_key,
                       name,
                       timeout=x)

        # TODO - general pages on their site
        jobs = [j0, j2, j3, j4, j5, j6, j7, j8]
        for job in jobs:
            RQueue()._meta(job, "{0}_{1}".format(name, api_key))
Example #10
 def _research(self, name, api_key="", prospect_name=""):
     # Primary Research
     j9 = q.enqueue(Facebook()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j10 = q.enqueue(Twitter()._company_profile,
                     name,
                     api_key,
                     timeout=6000)
     j11 = q.enqueue(Indeed()._company_profile, name, api_key, timeout=6000)
     j0 = q.enqueue(BusinessWeek()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j1 = q.enqueue(Zoominfo()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j2 = q.enqueue(Linkedin()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j3 = q.enqueue(YellowPages()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j4 = q.enqueue(Yelp()._company_profile, name, api_key, timeout=6000)
     j5 = q.enqueue(Forbes()._company_profile, name, api_key, timeout=6000)
     j6 = q.enqueue(GlassDoor()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     j7 = q.enqueue(Hoovers()._company_profile, name, api_key, timeout=6000)
     j8 = q.enqueue(Crunchbase()._company_profile,
                    name,
                    api_key,
                    timeout=6000)
     jobs = [j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11]
     for job in jobs:
         RQueue()._meta(job, "{0}_{1}".format(name, api_key), prospect_name)
Example #11
    def _company_info(self, company_name, api_key=""):
        #TODO - company_name = self._remove_non_ascii(company_name) add to save
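        # Merge every CompanyInfoCrawl result for this company: score sources
        # and logos, pick the best value per field, persist the merged record,
        # and fire the webhook once all crawl jobs have completed.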
        qry = {
            'where': json.dumps({'company_name': company_name}),
            'limit': 1000
        }
        qry['order'] = '-createdAt'
        crawls = Parse().get('CompanyInfoCrawl', qry).json()['results']

        if not crawls:
            # start crawls
            return company_name
        crawls = self._source_score(pd.DataFrame(crawls))
        crawls = self._logo_score(crawls)
        #crawls = crawls[crawls.api_key == api_key]
        crawls['name_score'] = [
            fuzz.token_sort_ratio(row['name'], row.company_name)
            for index, row in crawls.iterrows()
        ]
        crawls = crawls[crawls.name_score > 70].append(
            crawls[crawls.name.isnull()])
        logo = crawls.sort("logo_score", ascending=False)

        #logo=logo[(logo.logo != "") & (logo.logo.notnull())][["source","logo"]]
        logo = logo[(logo.logo != "") & (logo.logo.notnull())].logo.tolist()
        logo = logo[0] if logo else ""

        #crawls = crawls[["press", 'source_score', 'source', 'createdAt', 'domain']]
        final = {}
        #print crawls.press.dropna()
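        # For each crawled field, keep the value reported by the
        # highest-scoring source (the most recent crawl wins within a source).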
        for col in crawls.columns:
            if col in ['source_score', 'source', 'createdAt']: continue
            df = crawls[[col, 'source_score', 'source', 'createdAt']]
            if df[col].dropna().empty: continue
            if type(list(df[col].dropna())[0]) == list:
                df[col] = df[col].dropna().apply(tuple)
            try:
                df = df[df[col] != ""]
            except:
                "lol"
            try:
                df = df[df[col].notnull()]
                df = [
                    source[1].sort('createdAt').drop_duplicates(col, True)
                    for source in df.groupby(col)
                ]
                df = [_df for _df in df if _df is not None]
                df = [pd.DataFrame(
                    columns=['source_score', col])] if len(df) == 0 else df
                df = pd.concat(df).sort('source_score')[col]
                if list(df): final[col] = list(df)[-1]
            except:
                "lol"

        if 'industry' in final.keys():
            try:
                final['industry'] = final['industry'][0]
            except:
                final["industry"] = ""

        try:
            final['industry_keywords'] = list(
                set(crawls.industry.dropna().sum()))
        except:
            final['industry_keywords'] = []

        if 'address' in final.keys():
            final['address'] = FullContact()._normalize_location(
                final['address'])
        try:
            final['handles'] = crawls[['source', 'handle']].dropna()
            final['handles'] = final['handles'].drop_duplicates().to_dict('r')
        except:
            "lol"

        tmp = crawls[['source', 'logo']].dropna()
        #print tmp
        #print "THE LOGO", logo
        final["logo"] = logo
        final['logos'] = tmp.drop_duplicates().to_dict('r')

        try:
            tmp = crawls[['source', 'phone']].dropna()
            final['phones'] = tmp.drop_duplicates().to_dict('r')
        except:
            """ """
        # TODO - if company_name exists update
        # TODO - find if domain exists under different company_name then update
        final = self._prettify_fields(final)
        if "name_score" in final.keys(): del final["name_score"]
        #print json.dumps(final)
        self._add_to_clearspark_db('Company', 'company_name', company_name,
                                   final)

        # TODO - find main domain from domain -> ie canon.ca should be canon.com
        # clean data - ie titleify fields, and lowercase domain
        # TODO - start a domain search with the deduced domain and the company_name
        #print "RQUEUE CHECK"
        if "domain" in final.keys():
            domain = final["domain"]
        '''
        if len(RQueue()._results("{0}_{1}".format(company_name, api_key))) == 1:
            q.enqueue(Companies()._domain_research, domain, api_key, company_name)
            q.enqueue(Companies()._secondary_research, company_name, domain, api_key)
        '''

        if RQueue()._has_completed("{0}_{1}".format(company_name, api_key)):
            #q.enqueue(Companies()._domain_research, domain, api_key, company_name)
            #q.enqueue(Companies()._secondary_research, company_name, domain, api_key)

            print "WEBHOOK <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
            if "company_name" in final.keys():
                Webhook()._update_company_info(final)
            '''
            job = q.enqueue(EmailGuess().search_sources, final["domain"],api_key,"")
            job.meta["{0}_{1}".format(company_name, api_key)] = True
            job.save()
            for domain in crawls.domain.dropna().drop_duplicates():
                job = q.enqueue(EmailGuess().search_sources, domain, api_key, "")
                RQueue()._meta(job, "{0}_{1}".format(company_name, api_key))
            '''
        return final