def _company_blog(self, domain, api_key="", name=""):
    """Crawl Google for blog posts on *domain* and persist the results.

    Searches ``inurl:blog site:<domain>``, guesses the blog root as the
    shortest result URL, converts each result's snippet date into a unix
    timestamp, and hands everything to CompanyExtraInfoCrawl for storage.

    :param domain: company website domain, e.g. ``example.com``
    :param api_key: caller's API key, stored with the crawl record
    :param name: company name, stored with the crawl record
    :return: None (results are persisted as a side effect)
    """
    # TODO get blog url
    df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    if df.empty:
        return
    df["count"] = [len(url) for url in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # Shortest URL is assumed to be the blog's root page.
    url = df.sort('count').url.ix[0]
    # Snippets begin with a date, e.g. "Mar 3, 2015 ..." or "2 days ago ...".
    df["timestamp"] = [span.split("...")[0].strip() for span in df.link_span]
    months = list(calendar.month_abbr)
    timestamps = []
    for _date in df.timestamp:
        first = _date.split(" ")[0]
        try:
            num = months.index(first)
        except ValueError:
            # Not a month abbreviation; handle relative dates ("2 days ago").
            # (Previously this path referenced an undefined/stale `i` and
            # called int() on a list, and was unreachable for relative dates.)
            if "day" in _date:
                try:
                    timestamps.append(
                        arrow.utcnow().replace(days=-int(first)).timestamp)
                except ValueError:
                    timestamps.append(0)
            else:
                timestamps.append(0)
            continue
        # Rewrite "Mar 3, 2015" as "3 3, 2015" so arrow can parse "M D, YYYY".
        _date = str(num) + " " + " ".join(_date.split(" ")[1:])
        try:
            timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
        except Exception:
            timestamps.append(0)
    df["timestamp"] = timestamps
    data = {'data': df.to_dict('r'), 'blog_url': url}
    data["domain"] = domain
    data["api_key"] = api_key
    data["company_name"] = name
    CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
def _company_blog(self, domain, api_key="", name=""):
    """Crawl Google for blog posts on *domain* and persist the results.

    Searches ``inurl:blog site:<domain>``, guesses the blog root as the
    shortest result URL, converts each result's snippet date into a unix
    timestamp, and hands everything to CompanyExtraInfoCrawl for storage.

    :param domain: company website domain, e.g. ``example.com``
    :param api_key: caller's API key, stored with the crawl record
    :param name: company name, stored with the crawl record
    :return: None (results are persisted as a side effect)
    """
    # TODO get blog url
    df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    if df.empty:
        return
    df["count"] = [len(url) for url in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # Shortest URL is assumed to be the blog's root page.
    url = df.sort('count').url.ix[0]
    # Snippets begin with a date, e.g. "Mar 3, 2015 ..." or "2 days ago ...".
    df["timestamp"] = [span.split("...")[0].strip() for span in df.link_span]
    months = list(calendar.month_abbr)
    timestamps = []
    for _date in df.timestamp:
        first = _date.split(" ")[0]
        try:
            num = months.index(first)
        except ValueError:
            # Not a month abbreviation; handle relative dates ("2 days ago").
            # (Previously this path referenced an undefined/stale `i` and
            # called int() on a list, and was unreachable for relative dates.)
            if "day" in _date:
                try:
                    timestamps.append(
                        arrow.utcnow().replace(days=-int(first)).timestamp)
                except ValueError:
                    timestamps.append(0)
            else:
                timestamps.append(0)
            continue
        # Rewrite "Mar 3, 2015" as "3 3, 2015" so arrow can parse "M D, YYYY".
        _date = str(num) + " " + " ".join(_date.split(" ")[1:])
        try:
            timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
        except Exception:
            timestamps.append(0)
    df["timestamp"] = timestamps
    data = {'data': df.to_dict('r'), 'blog_url': url}
    data["domain"] = domain
    data["api_key"] = api_key
    data["company_name"] = name
    CompanyExtraInfoCrawl()._persist(data, "blog_data", api_key)
def _press_releases(self, qry, company_domain=None, period=None):
    """Search newswire sites for press releases about *qry* and store them.

    Queries five press-release sites via Google, normalizes each result
    row (date, description, title, timestamp, event key) and inserts the
    rows into the ``events`` table.

    :param qry: company name or query string (quoted in the search)
    :param company_domain: domain recorded on each event row
    :param period: unused; kept for interface compatibility
    :return: list of event dicts that were inserted
    """
    sites = ["prnewswire.com", "businesswire.com", "marketwired.com",
             "newswire.ca", "reuters.com"]
    queries = ['"{0}" site:{1}'.format(qry, site) for site in sites]
    p = Google()._multi_get(queries)
    try:
        p = p.drop_duplicates()
    except Exception:
        # Best-effort dedupe: keep whatever _multi_get returned.
        pass
    # Snippet format is "<date> ... <description>".
    p['description'] = ["".join(span.split('...')[1:]).strip()
                        for span in p.link_span]
    p["domain"] = company_domain
    p['date'] = [span.split('...')[0].strip() for span in p.link_span]
    p["timestamp"] = [Helper()._str_to_timestamp(d) for d in p.date]
    p['title'] = p['link_text']
    p = p.drop('link_text', 1)
    p = p.drop('url', 1)
    p = p.drop('link_span', 1)
    data = p
    data["event_type"] = "CompanyPressEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    # Deterministic per-row key: concatenated row values, truncated to 124.
    data["event_key"] = ["".join(map(str, row.values()))[:124]
                        for row in data.to_dict("r")]
    # dropna() strips missing values per row, so no manual None-pruning.
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
def _press_releases(self, qry, company_domain=None, period=None):
    """Search newswire sites for press releases about *qry* and store them.

    Queries five press-release sites via Google, normalizes each result
    row (date, description, title, timestamp, event key) and inserts the
    rows into the ``events`` table.

    :param qry: company name or query string (quoted in the search)
    :param company_domain: domain recorded on each event row
    :param period: unused; kept for interface compatibility
    :return: list of event dicts that were inserted
    """
    sites = ["prnewswire.com", "businesswire.com", "marketwired.com",
             "newswire.ca", "reuters.com"]
    queries = ['"{0}" site:{1}'.format(qry, site) for site in sites]
    p = Google()._multi_get(queries)
    try:
        p = p.drop_duplicates()
    except Exception:
        # Best-effort dedupe: keep whatever _multi_get returned.
        pass
    # Snippet format is "<date> ... <description>".
    p['description'] = ["".join(span.split('...')[1:]).strip()
                        for span in p.link_span]
    p["domain"] = company_domain
    p['date'] = [span.split('...')[0].strip() for span in p.link_span]
    p["timestamp"] = [Helper()._str_to_timestamp(d) for d in p.date]
    p['title'] = p['link_text']
    p = p.drop('link_text', 1)
    p = p.drop('url', 1)
    p = p.drop('link_span', 1)
    data = p
    data["event_type"] = "CompanyPressEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    # Deterministic per-row key: concatenated row values, truncated to 124.
    data["event_key"] = ["".join(map(str, row.values()))[:124]
                        for row in data.to_dict("r")]
    # dropna() strips missing values per row, so no manual None-pruning.
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
def _company_blog(self, domain, period=None):
    """Crawl Google for blog posts on *domain* and insert them as events.

    Searches ``inurl:blog site:<domain>`` (restricted to the last day when
    *period* is truthy), converts each result's snippet date into a unix
    timestamp, and inserts the rows into the ``events`` table.

    :param domain: company website domain, e.g. ``example.com``
    :param period: when truthy, limit the Google search to the last day
    :return: list of event dicts inserted, or None when no results
    """
    # TODO get blog url
    if period:
        df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
    else:
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    if df.empty:
        return
    df["count"] = [len(url) for url in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # Shortest URL is assumed to be the blog root (currently unused here).
    url = df.sort('count').url.ix[0]
    # Snippets begin with a date, e.g. "Mar 3, 2015 ..." or "2 days ago ...".
    df["timestamp"] = [span.split("...")[0].strip() for span in df.link_span]
    months = list(calendar.month_abbr)
    timestamps = []
    for _date in df.timestamp:
        first = _date.split(" ")[0]
        try:
            num = months.index(first)
        except ValueError:
            # Not a month abbreviation; handle relative dates ("2 days ago").
            # (Previously this path referenced an undefined/stale `i` and
            # called int() on a list, and was unreachable for relative dates.)
            if "day" in _date:
                try:
                    timestamps.append(
                        arrow.utcnow().replace(days=-int(first)).timestamp)
                except ValueError:
                    timestamps.append(0)
            else:
                timestamps.append(0)
            continue
        # Rewrite "Mar 3, 2015" as "3 3, 2015" so arrow can parse "M D, YYYY".
        _date = str(num) + " " + " ".join(_date.split(" ")[1:])
        try:
            timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
        except Exception:
            timestamps.append(0)
    df["timestamp"] = timestamps
    data = df
    data["domain"] = domain
    data["event_type"] = "CompanyBlogEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    # Deterministic per-row key: concatenated row values, truncated to 124.
    data["event_key"] = ["".join(map(str, row.values()))[:124]
                        for row in data.to_dict("r")]
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data
def _company_blog(self, domain, period=None):
    """Crawl Google for blog posts on *domain* and insert them as events.

    Searches ``inurl:blog site:<domain>`` (restricted to the last day when
    *period* is truthy), converts each result's snippet date into a unix
    timestamp, and inserts the rows into the ``events`` table.

    :param domain: company website domain, e.g. ``example.com``
    :param period: when truthy, limit the Google search to the last day
    :return: list of event dicts inserted, or None when no results
    """
    # TODO get blog url
    if period:
        df = Google().search('inurl:blog site:{0}'.format(domain), 1, "d")
    else:
        df = Google().search('inurl:blog site:{0}'.format(domain), 1)
    if df.empty:
        return
    df["count"] = [len(url) for url in df.link]
    df = df.reset_index().drop('index', 1)
    df = df.drop('title', 1)
    # Shortest URL is assumed to be the blog root (currently unused here).
    url = df.sort('count').url.ix[0]
    # Snippets begin with a date, e.g. "Mar 3, 2015 ..." or "2 days ago ...".
    df["timestamp"] = [span.split("...")[0].strip() for span in df.link_span]
    months = list(calendar.month_abbr)
    timestamps = []
    for _date in df.timestamp:
        first = _date.split(" ")[0]
        try:
            num = months.index(first)
        except ValueError:
            # Not a month abbreviation; handle relative dates ("2 days ago").
            # (Previously this path referenced an undefined/stale `i` and
            # called int() on a list, and was unreachable for relative dates.)
            if "day" in _date:
                try:
                    timestamps.append(
                        arrow.utcnow().replace(days=-int(first)).timestamp)
                except ValueError:
                    timestamps.append(0)
            else:
                timestamps.append(0)
            continue
        # Rewrite "Mar 3, 2015" as "3 3, 2015" so arrow can parse "M D, YYYY".
        _date = str(num) + " " + " ".join(_date.split(" ")[1:])
        try:
            timestamps.append(arrow.get(_date, "M D, YYYY").timestamp)
        except Exception:
            timestamps.append(0)
    df["timestamp"] = timestamps
    data = df
    data["domain"] = domain
    data["event_type"] = "CompanyBlogEvent"
    data = data.applymap(lambda x: self._remove_non_ascii(x))
    # Deterministic per-row key: concatenated row values, truncated to 124.
    data["event_key"] = ["".join(map(str, row.values()))[:124]
                        for row in data.to_dict("r")]
    data = [row.dropna().to_dict() for _, row in data.iterrows()]
    r.table("events").insert(data).run(conn)
    return data