def get_tweets(args):
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]
    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities", "id_str",
        "retweet_count", "favorited", "user", "lang", "created_at", "place",
        "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            # Get the max id of the stored tweets so only tweets with a higher id are extracted
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],
                                          parameters["DESTINATION_TABLE"],
                                          constituent_id, language)

            try:
                sinceId = int(
                    storage.get_bigquery_data(q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))

            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                # Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(constituent_id)

                    # Remove bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other
                    # unmodified
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    # Add additional fields
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(tagged_text)
                    clean_tweet["relevance"] = -1

                    # mongo
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(t_mongo['created_at'],
                                                        '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    # modified
                    tagged_tweet = dict((k, clean_tweet[k]) for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                # send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)

                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)

                try:
                    storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                 parameters["DESTINATION_TABLE"],
                                                 tweets_modified)
                except Exception as e:
                    print(e)

                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for {} in {}".format(tweetCount,
                                                        constituent_name,
                                                        language))

            if parameters["LOGGING"]:
                doc = [{
                    "date": time.strftime('%Y-%m-%d %H:%M:%S',
                                          datetime.now().date().timetuple()),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_tweets": tweetCount,
                    "language": language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"

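# Illustrative only: none of the collectors in this module parse their own CLI
# arguments; they all expect an ``args`` object exposing ``google_key_path``,
# ``param_connection_string`` and ``environment``. A minimal sketch of such an
# object, assuming those three attributes are all that is needed (the real
# argument wiring lives outside this section); the values are placeholders.
def _example_args():
    from types import SimpleNamespace
    return SimpleNamespace(
        google_key_path="/path/to/service_account.json",              # hypothetical key file
        param_connection_string="mssql+pymssql://user:pass@host/db",  # hypothetical SQL DSN
        environment="production",                                     # hypothetical environment name
    )
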
def extract_historical_data(url, driver, storage_client, dataset_name,
                            table_name, constituent=None):
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah

    if isinstance(constituent, tuple):
        constituent_id = constituent[0]
        constituent_name = constituent[1]
        constituent_old_name = tah.get_old_constituent_name(constituent_id)

    driver.get(url)
    rows = []
    time.sleep(20)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find(id=re.compile(r'grid-table-.*'))

    if table is None:
        print("Not able to load the data for " + str(constituent))
        return

    table_body = table.find('tbody')
    table_row = table_body.find('tr')

    if constituent == 'DAX':
        while table_row.find_next_sibling('tr'):
            data = table_row.find_all('td')
            try:
                date = datetime.strptime(data[0].string, "%d/%m/%Y")
                date_string = date.strftime('%Y-%m-%d %H:%M:%S')
                rows.append({
                    'constituent': 'DAX',
                    'date': date_string,
                    'opening_price': float(data[3].string.replace(',', '')),
                    'closing_price': float(data[2].string.replace(',', '')),
                    'daily_high': float(data[4].string.replace(',', '')),
                    'daily_low': float(data[5].string.replace(',', '')),
                    'turnover': None,
                    'volume': float(data[1].string.replace(',', '')),
                    'constituent_id': "DAX",
                    'constituent_name': "DAX"
                })
            except AttributeError as e:
                pass
            table_row = table_row.find_next_sibling('tr')

        # The last row has no next sibling, so handle it separately
        data = table_row.find_all('td')
        try:
            date = datetime.strptime(data[0].string, "%d/%m/%Y")
            date_string = date.strftime('%Y-%m-%d %H:%M:%S')
            rows.append({
                'constituent': 'DAX',
                'date': date_string,
                'opening_price': float(data[3].string.replace(',', '')),
                'closing_price': float(data[2].string.replace(',', '')),
                'daily_high': float(data[4].string.replace(',', '')),
                'daily_low': float(data[5].string.replace(',', '')),
                'turnover': None,
                'volume': float(data[1].string.replace(',', '')),
                'constituent_id': "DAX",
                'constituent_name': "DAX"
            })
        except Exception as e:
            print(e)
    else:
        while table_row.find_next_sibling('tr'):
            data = table_row.find_all('td')
            try:
                date = datetime.strptime(data[0].string, "%d/%m/%Y")
                date_string = date.strftime('%Y-%m-%d %H:%M:%S')
                rows.append({
                    'constituent': constituent_old_name,
                    'date': date_string,
                    'opening_price': float(data[1].string.replace(',', '')),
                    'closing_price': float(data[2].string.replace(',', '')),
                    'daily_high': float(data[3].string.replace(',', '')),
                    'daily_low': float(data[4].string.replace(',', '')),
                    'turnover': float(data[5].string.replace(',', '')),
                    'volume': float(data[6].string.replace(',', '')),
                    'constituent_name': constituent_name,
                    'constituent_id': constituent_id
                })
            except AttributeError as e:
                print(e)
            table_row = table_row.find_next_sibling('tr')

        # The last row has no next sibling, so handle it separately
        data = table_row.find_all('td')
        try:
            date = datetime.strptime(data[0].string, "%d/%m/%Y")
            date_string = date.strftime('%Y-%m-%d %H:%M:%S')
            rows.append({
                'constituent': constituent_old_name,
                'date': date_string,
                'opening_price': float(data[1].string.replace(',', '')),
                'closing_price': float(data[2].string.replace(',', '')),
                'daily_high': float(data[3].string.replace(',', '')),
                'daily_low': float(data[4].string.replace(',', '')),
                'turnover': float(data[5].string.replace(',', '')),
                'volume': float(data[6].string.replace(',', '')),
                'constituent_name': constituent_name,
                'constituent_id': constituent_id
            })
        except AttributeError as e:
            print(e)

    try:
        storage_client.insert_bigquery_data(dataset_name, table_name, rows)
    except Exception as e:
        print(e)

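# Illustrative only: extract_historical_data() expects an already-constructed
# Selenium driver and a Storage client. A minimal sketch of that wiring with a
# headless Chrome driver; the URL, dataset, table and constituent tuple below
# are placeholders, not values taken from this module.
def _example_extract_historical_data(storage_client):
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')        # scrape without opening a browser window
    driver = webdriver.Chrome(options=options)
    try:
        extract_historical_data(
            url="https://example.com/historical-prices",  # hypothetical source page
            driver=driver,
            storage_client=storage_client,
            dataset_name="my_dataset",                    # hypothetical BigQuery dataset
            table_name="historical_prices",               # hypothetical table
            constituent=("CON_0001", "Example AG"))       # hypothetical (id, name) tuple
    finally:
        driver.quit()
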
def get_daily_orbis_news(args):
    if __name__ != "__main__":
        from utils.Storage import Storage
        from utils.Storage import MongoEncoder
        from utils.SOAPUtils import SOAPUtils
        from utils import twitter_analytics_helpers as tah
        from utils.TaggingUtils import TaggingUtils as TU

    from utils.Storage import Storage
    from utils.Storage import MongoEncoder
    from utils.SOAPUtils import SOAPUtils
    from utils import logging_utils as logging_utils

    # get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "BVD_USERNAME", "BVD_PASSWORD",
                       "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Orbis'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, bvdid in constituents:
        # get the id of the most recent Orbis news item stored for the constituent
        query = """
        SELECT max(news_id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}' and news_origin = "Orbis"
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        start = 0
        max_count = 20
        print("Constituent: {},{}".format(constituent_name, bvdid))
        stop = False

        try:
            token = soap.get_token(parameters["BVD_USERNAME"],
                                   parameters["BVD_PASSWORD"], 'orbis')
            selection_token, selection_count = soap.find_by_bvd_id(token, bvdid, 'orbis')
        except Exception as e:
            print(e)
            return

        while not stop:
            try:
                query = (
                    "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, "
                    "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title,"
                    "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, "
                    "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, "
                    "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics,"
                    "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country,"
                    "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region,"
                    "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language,"
                    "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source,"
                    "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication,"
                    "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id "
                    "FROM RemoteAccess.A").format(start, max_count)

                get_data_result = soap.get_data(token, selection_token,
                                                selection_count, query, 'orbis',
                                                timeout=None)
            except Exception as e:
                print(str(e))
                if token:
                    soap.close_connection(token, 'orbis')
                    token = None
                break

            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text
            TESTDATA = StringIO(csv_result)

            try:
                df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"])
            except Exception as e:
                print(e)
                break

            if df.shape[0] == 0:
                print("No records in df")
                break

            initial_shape = df.shape[0]

            # Keep only news items newer than the ones already stored
            df = df.astype({"news_id": int}, copy=False, errors='ignore')
            df = df.loc[df["news_id"] > max_id]

            if df.shape[0] == 0:
                print("No new data")
                break

            if df.shape[0] != initial_shape:
                stop = True

            # Make news_title column a string
            df = df.astype({"news_title": str}, copy=False, errors='ignore')

            # Remove duplicate rows (same news_title)
            df.drop_duplicates(["news_title"], inplace=True)

            # Get sentiment score
            df["score"] = df.apply(
                lambda row: tah.get_nltk_sentiment(str(row["news_article_txt"])), axis=1)

            # get sentiment word
            df["sentiment"] = df.apply(
                lambda row: tah.get_sentiment_word(row["score"]), axis=1)

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            # add URL
            df["url"] = None

            # add show
            df["show"] = True

            # get entity tags
            entity_tags = []
            for text in df["news_title"]:
                tags = tah.get_spacey_tags(tagger.get_spacy_entities(str(text)))
                entity_tags.append(tags)

            fields = ["news_date", "news_title", "news_article_txt",
                      "news_companies", "news_source", "news_publication",
                      "news_topics", "news_country", "news_region",
                      "news_language", "news_id", "score", "sentiment",
                      "constituent_id", "constituent_name", "constituent",
                      "url", "show"]

            # Keep only the needed fields
            df_bigquery = df[fields]
            bigquery_data = json.loads(
                df_bigquery.to_json(orient="records", date_format="iso"))

            # set entity_tags and normalise field types for BigQuery
            for i in range(0, len(bigquery_data)):
                bigquery_data[i]["entity_tags"] = entity_tags[i]

                # set news_date
                if "news_date" in bigquery_data[i] and bigquery_data[i]["news_date"]:
                    date = bigquery_data[i]["news_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    bigquery_data[i]["news_date"] = ts
                else:
                    bigquery_data[i]["news_date"] = None

                if bigquery_data[i]["news_title"]:
                    bigquery_data[i]["news_title"] = str(bigquery_data[i]["news_title"])

                if bigquery_data[i]["news_article_txt"]:
                    bigquery_data[i]["news_article_txt"] = str(bigquery_data[i]["news_article_txt"])

                if bigquery_data[i]["news_companies"]:
                    try:
                        bigquery_data[i]["news_companies"] = [
                            c.strip() for c in bigquery_data[i]["news_companies"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_companies"] = []
                else:
                    bigquery_data[i]["news_companies"] = []

                if bigquery_data[i]["news_topics"]:
                    try:
                        if bigquery_data[i]["news_topics"].isdigit():
                            bigquery_data[i]["news_topics"] = []
                        elif bigquery_data[i]["news_topics"] == 'None':
                            bigquery_data[i]["news_topics"] = []
                        else:
                            bigquery_data[i]["news_topics"] = [
                                t.strip() for t in bigquery_data[i]["news_topics"].split(";")
                            ]
                    except Exception as e:
                        bigquery_data[i]["news_topics"] = []
                else:
                    bigquery_data[i]["news_topics"] = []

                if bigquery_data[i]["news_country"]:
                    try:
                        bigquery_data[i]["news_country"] = [
                            c.strip() for c in bigquery_data[i]["news_country"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_country"] = []
                else:
                    bigquery_data[i]["news_country"] = []

                if bigquery_data[i]["news_region"]:
                    try:
                        bigquery_data[i]["news_region"] = [
                            r.strip() for r in bigquery_data[i]["news_region"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_region"] = []
                else:
                    bigquery_data[i]["news_region"] = []

                if bigquery_data[i]["news_language"]:
                    bigquery_data[i]["news_language"] = str(bigquery_data[i]["news_language"])

                if bigquery_data[i]["news_source"]:
                    bigquery_data[i]["news_source"] = str(bigquery_data[i]["news_source"])

                if bigquery_data[i]["news_publication"]:
                    bigquery_data[i]["news_publication"] = str(bigquery_data[i]["news_publication"])

                if bigquery_data[i]["news_id"]:
                    try:
                        bigquery_data[i]["news_id"] = int(bigquery_data[i]["news_id"])
                    except Exception as e:
                        bigquery_data[i]["news_id"] = str(bigquery_data[i]["news_id"])

                if bigquery_data[i]["sentiment"]:
                    bigquery_data[i]["sentiment"] = str(bigquery_data[i]["sentiment"])

                if bigquery_data[i]["constituent_id"]:
                    bigquery_data[i]["constituent_id"] = str(bigquery_data[i]["constituent_id"])

                if bigquery_data[i]["constituent_name"]:
                    bigquery_data[i]["constituent_name"] = str(bigquery_data[i]["constituent_name"])

                if bigquery_data[i]["constituent"]:
                    bigquery_data[i]["constituent"] = str(bigquery_data[i]["constituent"])

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             bigquery_data)
            except Exception as e:
                print(e)

            start += max_count
            print("Records saved: {}".format(df.shape[0]))

            log = [{
                "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                "constituent_name": constituent_name,
                "constituent_id": constituent_id,
                "downloaded_news": df.shape[0],
                "source": "Orbis"
            }]

            if parameters["LOGGING"] and bigquery_data:
                logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

        if token:
            soap.close_connection(token, 'orbis')

def get_rss_feed(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage

    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'RSS'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    for constituent_id, constituent_name, symbol in all_constituents:
        # get last news date for the constituent
        query = """
        SELECT max(news_date) as max_date FROM `{}.{}`
        WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS"
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
            max_date = result[0]["max_date"]
        except Exception as e:
            max_date = None
            continue

        print("Getting RSS Feed for {}".format(constituent_name))
        d = feedparser.parse(
            'http://finance.yahoo.com/rss/headline?s={}'.format(symbol + ".DE"))

        to_insert = []

        for post in d.entries:
            #print(post)
            date = datetime(post.published_parsed[0], post.published_parsed[1],
                            post.published_parsed[2], post.published_parsed[3],
                            post.published_parsed[4])

            if max_date:
                max_date = str(max_date)[:19]
                max_date = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S')
                if date < max_date:
                    continue

            doc = {
                "url": post.link,
                "news_date": date.strftime('%Y-%m-%d %H:%M:%S'),
                "news_language": post.summary_detail["language"],
                "news_title": post.title,
                "news_article_txt": post.summary,
                "news_origin": "Yahoo Finance RSS",
                "show": True
            }

            # Get sentiment score
            doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"]))

            # get sentiment word
            doc["sentiment"] = tah.get_sentiment_word(doc["score"])

            # add constituent name, id and old name
            doc["constituent_id"] = constituent_id
            doc["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            doc["constituent"] = old_constituent_name

            to_insert.append(doc)

        try:
            print("Inserting into BQ")
            storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                parameters["DESTINATION_TABLE"],
                                                to_insert)
        except Exception as e:
            print(e)

        log = [{
            "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
            "constituent_name": constituent_name,
            "constituent_id": constituent_id,
            "downloaded_news": len(to_insert),
            "source": "Yahoo Finance RSS"
        }]

        if parameters["LOGGING"] and to_insert:
            logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                  parameters["LOGGING_TABLE"], storage_client)

def get_stocktwits(args):
    from utils import logging_utils as logging_utils
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU

    # get constituents
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]
    table = "PARAM_STOCKTWITS_KEYS"
    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_STOCKTWITS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, url_key in all_constituents:
        # Get the last id
        query = """
        SELECT max(id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}' and source = 'StockTwits'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        print("Getting StockTwits for {}:{}".format(constituent_name, url_key))
        response = requests.get(
            "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format(url_key))

        to_insert = []

        if response.status_code == 200:
            for item in response.json()["messages"]:
                if max_id:
                    if int(item['id']) < max_id:
                        continue

                doc = tah.create_tweet_skelleton()
                doc['text'] = item['body']
                doc['created_at'] = item['created_at']
                doc['date'] = item['created_at']
                doc['id'] = item['id']
                doc['id_str'] = str(item['id'])

                if "symbols" in item:
                    for symbol in item['symbols']:
                        doc['entities']['symbols'].append({
                            'indices': [],
                            'text': symbol['symbol']
                        })

                if 'entities' in item:
                    if 'sentiment' in item['entities'] and item['entities']['sentiment']:
                        if 'basic' in item['entities']['sentiment']:
                            doc['user']['description'] = item['entities']['sentiment']['basic']

                doc['source'] = 'StockTwits'
                doc['constituent_name'] = constituent_name
                doc['constituent_id'] = constituent_id
                doc['search_term'] = ''
                doc["constituent"] = tah.get_old_constituent_name(constituent_id)
                doc['relevance'] = 1
                doc["sentiment_score"] = tah.get_nltk_sentiment(str(doc["text"]))
                tagged_text = tagger.get_spacy_entities(str(doc["text"]))
                doc["entity_tags"] = tah.get_spacey_tags(tagged_text)

                to_insert.append(doc)

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             to_insert)
            except Exception as e:
                print(e)

            if parameters["LOGGING"]:
                doc = [{
                    "date": time.strftime('%Y-%m-%d %H:%M:%S',
                                          datetime.now().date().timetuple()),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_tweets": len(to_insert),
                    "language": 'StockTwits'
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)
        else:
            print(response.text)

def get_bloomberg_news(args):
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU
    from utils import logging_utils as logging_utils

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Bloomberg'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_BLOOMBERG_KEYS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "PAGES"])

    # Define random user agent object
    #ua = UserAgent()

    # Base url for search queries
    base = 'https://bloomberg.com/search?query={}&sort=time:desc&page={}'

    # Iterate over companies
    for constituent_id, constituent_name, url_key, pages in all_constituents:
        # Get date of latest news article for this constituent for Bloomberg
        query = """
        SELECT max(news_date) as last_date FROM `{}.{}`
        WHERE constituent_id = '{}' AND news_origin = 'Bloomberg'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
            last_date_bq = result[0]["last_date"]
        except Exception as e:
            last_date_bq = None

        print('Scraping for {} up to {}.'.format(constituent_name, str(last_date_bq)))

        # iterate through the result pages and save articles to the database
        in_target = True
        i = 1
        while in_target:
            print('Scraping for page', i)

            # format url with company name and current page number and create a bs4 object from the response
            url = base.format(url_key.replace('.', '_').replace(' ', '_'), i)
            i += 1
            main_soup = BeautifulSoup(requests.get(url).content, 'html.parser')

            # check if there are any results on the page and finish the loop if none
            try:
                if 'No results for' in main_soup.find('div', {'class': 'content-no-results'}).text:
                    print('No more articles...')
                    break
            except AttributeError:
                pass

            # iterate through article elements on the page
            results = main_soup.findAll('div', {'class': 'search-result'})
            times = [x.find('time', {'class': 'published-at'}).text.strip()
                     for x in results]
            times = [datetime.strptime(x, '%b %d, %Y') for x in times]

            print("Looping times")
            for published_at in times:
                if last_date_bq:
                    if published_at < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"),
                                                        "%Y-%m-%d"):
                        print('Target date reached')
                        in_target = False

            to_insert = []
            print("Looping articles...")
            for art in results:
                art_type = art.find('article')['class']

                # check if the result is an article
                if 'type-article' in art_type:
                    url = art.find('h1').find('a')['href']
                    d = get_article(url, None)
                    if d is None:
                        continue

                    if last_date_bq:
                        if datetime.strptime(d["news_date"], "%Y-%m-%d %H:%M:%S") < \
                                datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                            continue

                    # set extra fields:
                    # score
                    if d["news_article_txt"]:
                        d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))

                    # sentiment
                    d['sentiment'] = tah.get_sentiment_word(d["score"])

                    # constituent fields
                    d["constituent_id"] = constituent_id
                    d["constituent_name"] = constituent_name
                    d["constituent"] = tah.get_old_constituent_name(constituent_id)

                    # url
                    d["url"] = url

                    # show
                    d["show"] = True

                    # entity_tags
                    d["entity_tags"] = tah.get_spacey_tags(
                        tagger.get_spacy_entities(str(d["news_title"])))

                    to_insert.append(d)

            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{
                        "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                        "constituent_name": constituent_name,
                        "constituent_id": constituent_id,
                        "downloaded_news": len(to_insert),
                        "source": "Bloomberg"
                    }]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage_client)
            else:
                break

def get_daily_zephyr_ma_deals(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage
    from utils.Storage import MongoEncoder
    from utils.SOAPUtils import SOAPUtils

    zephyr_query = """
    DEFINE P1 AS [Parameters.Currency=SESSION;],
           P2 AS [Parameters.RepeatingDimension=NrOfTargets],
           P3 AS [Parameters.RepeatingDimension=NrOfBidders],
           P5 AS [Parameters.RepeatingDimension=NrOfVendors];
    SELECT LINE RECORD_ID as record_id,
           LINE DEAL_OVERVIEW.DEAL_HEADLINE AS deal_headline,
           LINE DEAL_OVERVIEW.TGNAME USING P2 AS target,
           LINE DEAL_OVERVIEW.BDNAME USING P3 AS acquiror,
           LINE DEAL_OVERVIEW.VDNAME USING P5 AS vendor,
           LINE DEAL_OVERVIEW.DEALTYPE AS deal_type,
           LINE DEAL_OVERVIEW.DEAL_STATUS AS deal_status,
           LINE DEAL_STRUCTURE_AND_DATES.COMPLETION_DATE AS completion_date,
           LINE DEAL_STRUCTURE_AND_DATES.RUMOUR_DATE AS rumour_date,
           LINE DEAL_STRUCTURE_AND_DATES.ANNOUNCE_DATE AS announced_date,
           LINE DEAL_STRUCTURE_AND_DATES.EXPECTED_COMPLETION_DATE AS expected_completion_date,
           LINE DEAL_STRUCTURE_AND_DATES.ASSUMED_COMPLETION_DATE AS assumed_completion_date,
           LINE DEAL_STRUCTURE_AND_DATES.POSTPONED_DATE AS postponed_date,
           LINE DEAL_STRUCTURE_AND_DATES.WITHDRAWN_DATE AS withdrawn_date,
           LINE DEAL_OVERVIEW.ALLDLVALUE USING P1 AS deal_value
    FROM RemoteAccess.U ORDERBY 1 DESCENDING
    """

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ['BVD_USERNAME', 'BVD_PASSWORD', "LOGGING",
                       "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Zephyr'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # Get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)
    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "STRATEGY"]
    table = "PARAM_NEWS_ZEPHYR_STRATEGIES"
    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    fields = ["record_id", "deal_headline", "target", "acquiror", "vendor",
              "deal_type", "deal_status", "completion_date", "deal_value",
              "rumour_date", "announced_date", "expected_completion_date",
              "assumed_completion_date", "postponed_date", "withdrawn_date"]

    date_fields = ["completion_date", "rumour_date", "announced_date",
                   "expected_completion_date", "assumed_completion_date",
                   "postponed_date", "withdrawn_date"]

    for constituent_id, constituent_name, strategy in constituents:
        # get the id of the most recent deal stored for the constituent
        query = """
        SELECT max(record_id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}';
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        token = None
        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]

            print("Getting M&A deals for {}".format(constituent_name))
            token = soap.get_token(parameters['BVD_USERNAME'],
                                   parameters['BVD_PASSWORD'], 'zephyr')
            selection_token, selection_count = soap.find_with_strategy(token, strategy, "zephyr")
            get_data_result = soap.get_data(token, selection_token, selection_count,
                                            zephyr_query, 'zephyr', timeout=None,
                                            number_of_records=100)

            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text
            TESTDATA = StringIO(csv_result)

            df = pd.read_csv(TESTDATA, sep=",", parse_dates=date_fields)
            df = df.astype({"record_id": int}, copy=False, errors='ignore')
            print("Retrieved {} items".format(df.shape[0]))

            # Keep only deals newer than the ones already stored
            df = df.loc[df["record_id"] > max_id]
            print("New records {} items".format(df.shape[0]))

            if df.shape[0] == 0:
                if token:
                    soap.close_connection(token, 'zephyr')
                continue

            df = df[fields]

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            data = json.loads(df.to_json(orient="records", date_format="iso"))
            print(len(data))

            # Convert the ISO date strings to the BigQuery timestamp format
            for item in data:
                for field in date_fields:
                    if field in item and item[field]:
                        ts = time.strptime(item[field], "%Y-%m-%dT%H:%M:%S.%fZ")
                        item[field] = time.strftime('%Y-%m-%d %H:%M:%S', ts)

            if data:
                print("Inserting records to BQ")
                try:
                    #open("ma_deals.json", 'w').write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data))
                    storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                 parameters["DESTINATION_TABLE"],
                                                 data)
                except Exception as e:
                    print(e)

                soap.close_connection(token, 'zephyr')
                token = None

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_deals": len(data),
                            "source": "Zephyr"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage)
        except Exception as e:
            print(e)
            if token:
                soap.close_connection(token, 'zephyr')
            continue

        if token:
            soap.close_connection(token, 'zephyr')

def get_reuters_news(args, driver):
    from utils import logging_utils as logging_utils
    from utils.TaggingUtils import TaggingUtils as TU
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Reuters'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_REUTERS_KEYS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"])

    for constituent_id, constituent_name, url_key in all_constituents:
        to_insert = []

        # Get date of latest news article for this constituent for Reuters
        query = """
        SELECT max(news_date) as last_date FROM `{}.{}`
        WHERE constituent_id = '{}' AND news_origin = 'Reuters'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            try:
                result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
                last_date_bq = result[0]["last_date"]
            except Exception as e:
                last_date_bq = None

            company_name = url_key
            print('Scraping articles for {}'.format(company_name))
            query_url = 'https://reuters.com/search/news?blob={}&sortBy=date&dateRange=all'.format(company_name)

            # Find last article with date specified
            print(query_url)
            driver.get(query_url)
            sleep(2)

            skip = False
            if last_date_bq:
                try:
                    last_date = driver.find_elements_by_xpath(
                        '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[3]/div/div/h5'
                    )[0].get_attribute('innerHTML')
                except:
                    last_date = BeautifulSoup(driver.page_source).findAll(
                        'h5', {'class': 'search-result-timestamp'})[0].text.strip()

                last_date = ' '.join(last_date.split(' ')[:-1])
                month = [last_date.split(' ')[0][:3]]
                last_date = ' '.join(month + last_date.split(' ')[1:]).strip().replace(',', '')
                last_date = datetime.strptime(last_date, '%b %d %Y %I:%M%p')

                if last_date <= datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                    print("No newer articles")
                    skip = True

            if skip:
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            search = soup.find('div', {'class': 'search-result-list news-search'}).findAll(
                'div', {'class': 'search-result-indiv'})

            print("Getting articles")
            for article in search:
                title_elem = article.find('h3', {'class': 'search-result-title'})
                title = title_elem.text.strip()
                url = "https://reuters.com" + title_elem.find('a')['href']

                date_published = article.find('h5', {'class': 'search-result-timestamp'}).text
                date_published = ' '.join(date_published.split(' ')[:-1])
                month = [date_published.split(' ')[0][:3]]
                date_published = ' '.join(month + date_published.split(' ')[1:]).strip().replace(',', '')
                date_published_dtype = datetime.strptime(date_published, '%b %d %Y %I:%M%p')
                date_published = date_published_dtype.strftime("%Y-%m-%d %H:%M:%S")

                if last_date_bq:
                    if date_published_dtype < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"),
                                                                "%Y-%m-%d"):
                        print("Skipping article")
                        continue

                try:
                    soup = BeautifulSoup(
                        requests.get(url, headers={'User-Agent': random.choice(AGENTS)}).content,
                        'html.parser')
                    article_body = soup.find('div', {'class': 'ArticleBody_body_2ECha'}).text.strip()
                    source = soup.find('p', {'class': 'Attribution_content_27_rw'}).text.strip()
                except AttributeError:
                    source = None
                    continue

                d = {'news_title': title,
                     'news_date': date_published,
                     'news_source': source,
                     'news_origin': "Reuters",
                     "news_article_txt": article_body}

                # set extra fields:
                # score
                if d["news_article_txt"]:
                    d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))

                # sentiment
                d['sentiment'] = tah.get_sentiment_word(d["score"])

                # constituent fields
                d["constituent_id"] = constituent_id
                d["constituent_name"] = constituent_name
                d["constituent"] = tah.get_old_constituent_name(constituent_id)

                # url
                d["url"] = url

                # show
                d["show"] = True

                # entity_tags
                d["entity_tags"] = tah.get_spacey_tags(
                    tagger.get_spacy_entities(str(d["news_title"])))

                to_insert.append(d)
                print('Article scraped...')
                #break

            # Save data to BigQuery
            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                        parameters["DESTINATION_TABLE"],
                                                        to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_news": len(to_insert),
                            "source": "Reuters"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage_client)
        except Exception as e:
            print(e)
            continue

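# Illustrative only: a possible command-line entry point for the collectors above,
# assuming the three attributes they read from ``args``. The flag names and the
# job dispatch are assumptions and only a subset of jobs is shown; the function is
# defined but not invoked, since the real entry point may live elsewhere.
def _example_cli():
    import argparse

    parser = argparse.ArgumentParser(description="Run one of the data collection jobs")
    parser.add_argument("job", choices=["tweets", "rss", "stocktwits"],
                        help="which collector to run")
    parser.add_argument("google_key_path",
                        help="path to the Google service account key file")
    parser.add_argument("param_connection_string",
                        help="SQL connection string for the PARAM_* tables")
    parser.add_argument("environment",
                        help="environment name used to select the active BQ_DATASET")
    cli_args = parser.parse_args()

    if cli_args.job == "tweets":
        get_tweets(cli_args)
    elif cli_args.job == "rss":
        get_rss_feed(cli_args)
    elif cli_args.job == "stocktwits":
        get_stocktwits(cli_args)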