Exemplo n.º 1
0
    def _company_blog(self, domain, api_key="", name=""):
        """Locate a company's blog via a Google search and persist post data.

        :param domain: company web domain to restrict the search to
        :param api_key: caller's API key, forwarded to the persist call
        :param name: company name stored alongside the crawl result
        """
        #TODO get blog url
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
        print(df)
        if df.empty: return
        # The shortest result link is assumed to be the blog's root URL.
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        # Result snippets look like "<date> ... <text>"; keep the date part.
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except (ValueError, AttributeError):
                # First token is not a month abbreviation (e.g. "2 days ago"
                # or an empty snippet): no absolute date to parse.
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except Exception:
                # BUG FIX: the original referenced the leaked comprehension
                # variable `i` here and called int() on a list -- a
                # NameError/TypeError. Parse "N day(s) ago" from _date.
                if "day" in _date:
                    try:
                        days = int(_date.split(" ")[1])
                        timestamps.append(
                            arrow.utcnow().replace(days=-days).timestamp)
                    except (ValueError, IndexError):
                        timestamps.append(0)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps

        data = {'data': df.to_dict('r'), 'blog_url': url}
        data["domain"] = domain
        data["api_key"] = api_key
        data["company_name"] = name
        CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
Exemplo n.º 2
0
    def _company_blog(self, domain, api_key="", name=""):
        """Locate a company's blog via a Google search and persist post data.

        :param domain: company web domain to restrict the search to
        :param api_key: caller's API key, forwarded to the persist call
        :param name: company name stored alongside the crawl result
        """
        #TODO get blog url
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
        print(df)
        if df.empty: return
        # Shortest result link is treated as the blog root URL.
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        # Snippets look like "<date> ... <text>"; keep only the date part.
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except (ValueError, AttributeError):
                # Not a month-prefixed snippet ("2 days ago", empty, ...).
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except Exception:
                # BUG FIX: original used the leaked list-comp variable `i`
                # and int() on a list (NameError/TypeError). Handle relative
                # "N day(s) ago" snippets from _date instead.
                if "day" in _date:
                    try:
                        days = int(_date.split(" ")[1])
                        timestamps.append(
                            arrow.utcnow().replace(days=-days).timestamp)
                    except (ValueError, IndexError):
                        timestamps.append(0)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps

        data = {'data': df.to_dict('r'), 'blog_url': url}
        data["domain"] = domain
        data["api_key"] = api_key
        data["company_name"] = name
        CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
Exemplo n.º 3
0
    def _press_releases(self, qry, company_domain=None, period=None):
        """Search major newswire sites for press releases about *qry* and
        insert them into the ``events`` table as CompanyPressEvent rows.

        :param qry: company name / query string (exact-quoted in the search)
        :param company_domain: domain stored on every event row
        :param period: currently unused; kept for interface compatibility
        :returns: list of event dicts that were inserted
        """
        queries = [
            '"{0}" site:prnewswire.com'.format(qry),
            '"{0}" site:businesswire.com'.format(qry),
            '"{0}" site:marketwired.com'.format(qry),
            '"{0}" site:newswire.ca'.format(qry),
            '"{0}" site:reuters.com'.format(qry)
        ]

        p = Google()._multi_get(queries)
        try:
            p = p.drop_duplicates()
        except Exception:
            # best effort -- some result frames can't be de-duplicated
            pass
        # Result snippets look like "<date> ... <description>".
        p['description'] = [
            "".join(span.split('...')[1:]).strip() for span in p.link_span
        ]
        p["domain"] = company_domain
        p['date'] = [span.split('...')[0].strip() for span in p.link_span]
        p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
        p['title'] = p['link_text']

        p = p.drop('link_text', 1)
        p = p.drop('url', 1)
        p = p.drop('link_span', 1)
        data = p
        # BUG FIX: "domain" was assigned twice in the original; once suffices.
        data["domain"] = company_domain
        data["event_type"] = "CompanyPressEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        # De-dup key: first 124 chars of the row's concatenated values.
        data["event_key"] = [
            "".join(map(str, _data.values()))[:124]
            for _data in data.to_dict("r")
        ]
        # BUG FIX: the original built a `_df = data.to_dict("r")` copy,
        # deleted None-valued keys from it, then never used it (and mutated
        # dicts while iterating keys) -- dead code, removed. Dropping NaN
        # fields per row below achieves the intended result.
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
Exemplo n.º 4
0
    def _press_releases(self, qry, company_domain=None, period=None):
        """Search major newswire sites for press releases about *qry* and
        insert them into the ``events`` table as CompanyPressEvent rows.

        :param qry: company name / query string (exact-quoted in the search)
        :param company_domain: domain stored on every event row
        :param period: currently unused; kept for interface compatibility
        :returns: list of event dicts that were inserted
        """
        queries = ['"{0}" site:prnewswire.com'.format(qry),
                   '"{0}" site:businesswire.com'.format(qry),
                   '"{0}" site:marketwired.com'.format(qry),
                   '"{0}" site:newswire.ca'.format(qry),
                   '"{0}" site:reuters.com'.format(qry)]

        p = Google()._multi_get(queries)
        try:
            p = p.drop_duplicates()
        except Exception:
            # best effort -- some result frames can't be de-duplicated
            pass
        # Snippets look like "<date> ... <description>".
        p['description'] = ["".join(span.split('...')[1:]).strip()
                            for span in p.link_span]
        p["domain"] = company_domain
        p['date'] = [span.split('...')[0].strip() for span in p.link_span]
        p["timestamp"] = [Helper()._str_to_timestamp(i) for i in p.date]
        p['title'] = p['link_text']

        p = p.drop('link_text', 1)
        p = p.drop('url', 1)
        p = p.drop('link_span', 1)
        data = p
        # BUG FIX: "domain" was assigned twice in the original; once suffices.
        data["domain"] = company_domain
        data["event_type"] = "CompanyPressEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        # De-dup key: first 124 chars of the row's concatenated values.
        data["event_key"] = ["".join(map(str, _data.values()))[:124]
                             for _data in data.to_dict("r")]
        # BUG FIX: removed the dead `_df` block that stripped None values
        # from a throwaway copy (and mutated dicts while iterating keys).
        # Dropping NaN fields per row below achieves the intended result.
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
Exemplo n.º 5
0
    def _company_blog(self, domain, period=None):
        """Find a company's blog via Google and insert its posts as
        CompanyBlogEvent rows in the ``events`` table.

        :param domain: company web domain to restrict the search to
        :param period: if truthy, limit the search to the last day ("d")
        :returns: list of event dicts that were inserted (None if no results)
        """
        #TODO get blog url
        if period:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
        else:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1)

        if df.empty: return
        # The shortest result link is assumed to be the blog's root URL.
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        # Result snippets look like "<date> ... <text>"; keep the date part.
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except (ValueError, AttributeError):
                # Not a month-prefixed snippet ("2 days ago", empty, ...).
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except Exception:
                # BUG FIX: original referenced the leaked comprehension
                # variable `i` and called int() on a list. Parse relative
                # "N day(s) ago" snippets from _date instead.
                if "day" in _date:
                    try:
                        days = int(_date.split(" ")[1])
                        timestamps.append(
                            arrow.utcnow().replace(days=-days).timestamp)
                    except (ValueError, IndexError):
                        timestamps.append(0)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps
        data = df
        print(data)
        data["domain"] = domain
        data["event_type"] = "CompanyBlogEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        # De-dup key: first 124 chars of the row's concatenated values.
        data["event_key"] = [
            "".join(map(str, _data.values()))[:124]
            for _data in data.to_dict("r")
        ]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data
Exemplo n.º 6
0
    def _company_blog(self, domain, period=None):
        """Find a company's blog via Google and insert its posts as
        CompanyBlogEvent rows in the ``events`` table.

        :param domain: company web domain to restrict the search to
        :param period: if truthy, limit the search to the last day ("d")
        :returns: list of event dicts that were inserted (None if no results)
        """
        #TODO get blog url
        if period:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
        else:
            df = Google().search('inurl:blog site:{0}'.format(domain), 1)

        if df.empty: return
        # Shortest result link is treated as the blog root URL.
        df["count"] = [len(url) for url in df.link]
        df = df.reset_index().drop('index', 1)
        df = df.drop('title', 1)
        url = df.sort('count').url.ix[0]
        # Snippets look like "<date> ... <text>"; keep only the date part.
        df["timestamp"] = [i.split("...")[0].strip() for i in df.link_span]
        months = list(calendar.month_abbr)
        timestamps = []
        for _date in df.timestamp:
            try:
                num = months.index(_date.split(" ")[0])
            except (ValueError, AttributeError):
                # Not a month-prefixed snippet ("2 days ago", empty, ...).
                timestamps.append(0)
                continue
            _date = str(num) + " " + " ".join(_date.split(" ")[1:])
            try:
                timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
            except Exception:
                # BUG FIX: original used the leaked list-comp variable `i`
                # and int() on a list (NameError/TypeError). Handle relative
                # "N day(s) ago" snippets from _date instead.
                if "day" in _date:
                    try:
                        days = int(_date.split(" ")[1])
                        timestamps.append(
                            arrow.utcnow().replace(days=-days).timestamp)
                    except (ValueError, IndexError):
                        timestamps.append(0)
                else:
                    timestamps.append(0)
        df["timestamp"] = timestamps
        data = df
        print(data)
        data["domain"] = domain
        data["event_type"] = "CompanyBlogEvent"
        data = data.applymap(lambda x: self._remove_non_ascii(x))
        # De-dup key: first 124 chars of the row's concatenated values.
        data["event_key"] = ["".join(map(str, _data.values()))[:124]
                             for _data in data.to_dict("r")]
        data = [row.dropna().to_dict() for i, row in data.iterrows()]
        r.table("events").insert(data).run(conn)
        return data