def get_search_string(constituent_id, connection_string, table_keywords, table_exclusions):
    if __name__ != "__main__":
        from utils.TwitterDownloader import TwitterDownloader
        from utils.Storage import Storage
        from utils import twitter_analytics_helpers as tap
        from utils.TaggingUtils import TaggingUtils as TU

    storage = Storage()

    # Only use active keyword/exclusion rows for this constituent.
    where = lambda x: and_((x["ACTIVE_STATE"] == 1), (x["CONSTITUENT_ID"] == constituent_id))

    keywords = storage.get_sql_data(sql_connection_string=connection_string,
                                    sql_table_name=table_keywords,
                                    sql_column_list=["KEYWORD"],
                                    sql_where=where)
    keywords_list = ['"' + key[0] + '"' for key in keywords]
    keywords_string = " OR ".join(keywords_list)

    exclusions = storage.get_sql_data(sql_connection_string=connection_string,
                                      sql_table_name=table_exclusions,
                                      sql_column_list=["EXCLUSIONS"],
                                      sql_where=where)
    exclusions_list = ["-" + key[0] for key in exclusions]
    exclusions_string = " ".join(exclusions_list)

    # Join with a space so the first exclusion does not fuse with the last keyword.
    all_words = keywords_string + " " + exclusions_string
    return all_words
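# Hedged usage sketch (illustration only, not part of the original pipeline): shows the
# shape of the query string get_search_string builds for the Twitter collector. The
# connection string below is a hypothetical placeholder; the keyword/exclusion table
# names are the ones get_tweets passes in. With keywords ["BMW", "BMW AG"] and the
# exclusion "careers", the result looks like '"BMW" OR "BMW AG" -careers'.
def _example_get_search_string():
    example_connection_string = "mssql+pymssql://user:password@host/db"  # hypothetical
    return get_search_string("BMWDE8170003036",  # constituent id format used elsewhere in this repo
                             example_connection_string,
                             "PARAM_TWITTER_KEYWORDS",
                             "PARAM_TWITTER_EXCLUSIONS")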
def main(arguments):
    storage = Storage()

    result = storage.get_sql_data(
        sql_connection_string=arguments.connection_string,
        sql_table_name=arguments.table_name,
        sql_column_list=["EMAIL_USERNAME"])

    print(result)
def get_parameters(connection_string, table, column_list):
    storage = Storage()

    # The parameter table is expected to hold a single row; map it to a dict keyed by column name.
    data = storage.get_sql_data(connection_string, table, column_list)[0]

    parameters = {}
    for i in range(len(column_list)):
        parameters[column_list[i]] = data[i]

    return parameters
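# Hedged usage sketch (illustration only): reading a few Twitter collection settings
# into a dict keyed by column name with the three-argument get_parameters above.
# The connection string is a hypothetical placeholder.
def _example_get_parameters():
    example_connection_string = "mssql+pymssql://user:password@host/db"  # hypothetical
    params = get_parameters(example_connection_string,
                            "PARAM_TWITTER_COLLECTION",
                            ["TWITTER_API_KEY", "TWITTER_API_SECRET", "MAX_TWEETS"])
    # e.g. {"TWITTER_API_KEY": "...", "TWITTER_API_SECRET": "...", "MAX_TWEETS": 2000}
    return params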
def main(args):
    storage = Storage()
    data = {}

    # Get parameters
    parameters = storage.get_sql_data(
        sql_connection_string=args.connection_string,
        sql_table_name="PARAM_METADATA_COLLECTION",
        sql_column_list=["BVD_USERNAME", "BVD_PWD", "BVD_DATABASE", "FRANKFURT_FIELDS"])

    frankfurt_fields = parameters[3].split(',')

    # Get constituent metadata
    all_constituents = storage.get_sql_data(
        sql_connection_string=args.connection_string,
        sql_table_name="PARAM_METADATA_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "BVDID"])

    print(all_constituents)

    username = getpass.getuser()

    to_insert = {
        "CONSTITUENT_ID": "",
        "CONSTITUENT_NAME": "",
        "ISIN": "",
        "BVDID": "",
        "INDUSTRY": "",
        "COUNTRY": "",
        "SYMBOL": "",
        "ACTIVE_STATE": True,
        "LISTED_SINCE": datetime.datetime.today(),
        "MARKET_SECTOR": "",
        "MARKET_SUBSECTOR": "",
        "TRADING_MODEL": "",
        "REUTERS_INSTRUMENT_CODE": "",
        "SHARE_TYPE": "",
        "MINIMUM_TRADE_UNIT": 0.0,
        "WEBSITE": "",
        "LAST_UPDATE_BY": username
    }

    for constituent_id, _, url_key, bvdid in all_constituents:
        print(constituent_id)

        # Combine the Frankfurt exchange website data with the BvD company record.
        fr = website_data(url_key, frankfurt_fields)
        bvd = get_bvd_data(parameters[0], parameters[1], parameters[2], bvdid)

        data.update(fr)
        data.update(bvd)

        to_insert["CONSTITUENT_ID"] = data["SD_TICKER"] + data['BVDID']
        to_insert["CONSTITUENT_NAME"] = data['NAME']
        to_insert["ISIN"] = data['SD_ISIN']
        to_insert["BVDID"] = data['BVDID']
        to_insert["INDUSTRY"] = data['MAJOR_SECTOR']
        to_insert["COUNTRY"] = data['COUNTRY']
        to_insert["SYMBOL"] = data['SD_TICKER']
        if isinstance(data['IPO_DATE'], str):
            to_insert["LISTED_SINCE"] = datetime.datetime.strptime(
                data['IPO_DATE'], "%Y/%m/%d")
        to_insert["MARKET_SECTOR"] = data["Sector"]
        to_insert["MARKET_SUBSECTOR"] = data["Subsector"]
        to_insert["TRADING_MODEL"] = data['Trading Model']
        to_insert["REUTERS_INSTRUMENT_CODE"] = data['Reuters Instrument Code']
        to_insert["SHARE_TYPE"] = data['TYPE_SHARE']
        to_insert["MINIMUM_TRADE_UNIT"] = data['Minimum tradeable Unit']
        to_insert["WEBSITE"] = data["WEBSITE"]

        storage.insert_to_sql(sql_connection_string=args.connection_string,
                              sql_table_name="MASTER_CONSTITUENTS",
                              data=to_insert)
def get_tweets(args):
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]
    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities", "id_str",
        "retweet_count", "favorited", "user", "lang", "created_at", "place",
        "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            # Get the max id already stored so only tweets with a higher id are extracted.
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],
                                          parameters["DESTINATION_TABLE"],
                                          constituent_id, language)

            try:
                sinceId = int(
                    storage.get_bigquery_data(q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))

            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                # Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(constituent_id)

                    # Removing bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other
                    # unmodified
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    # Add additional fields
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(tagged_text)
                    clean_tweet["relevance"] = -1

                    # mongo
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(
                        t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    # modified
                    tagged_tweet = dict((k, clean_tweet[k]) for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                # send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)

                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)

                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], tweets_modified)
                except Exception as e:
                    print(e)

                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for {} in {}".format(tweetCount,
                                                        constituent_name,
                                                        language))

            if parameters["LOGGING"]:
                doc = [{
                    "date": time.strftime('%Y-%m-%d %H:%M:%S',
                                          datetime.now().date().timetuple()),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_tweets": tweetCount,
                    "language": language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"
def update_from_bigquery_split(args):
    # load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)
    #constituents = [("BMWDE8170003036", )]

    start_time = time.time()
    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))
        try:
            query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                    "id_str,retweet_count,favorited,user,lang,created_at,place," \
                    "constituent_name,constituent_id,search_term, relevance " \
                    "FROM `pecten_dataset.tweets_unmodified` " \
                    "WHERE constituent_id = '{}' ".format(constituent_id)

            tweets = storage.get_bigquery_data(query)
            operations = []
            records = 0

            for tweet in tweets:
                row = {}
                row["text"] = tweet["text"]
                row["id"] = tweet["id"]
                row["favorite_count"] = tweet["favorite_count"]
                row["source"] = tweet["source"]
                row["retweeted"] = tweet["retweeted"]
                row["entities"] = tweet["entities"]
                row["id_str"] = tweet["id_str"]
                row["retweet_count"] = tweet["retweet_count"]
                row["favorited"] = tweet["favorited"]
                row["user"] = tweet["user"]
                row["lang"] = tweet["lang"]
                row["created_at"] = tweet["created_at"]
                row["place"] = tweet["place"]
                row["constituent_name"] = tweet["constituent_name"]
                row["constituent_id"] = tweet["constituent_id"]
                row["search_term"] = tweet["search_term"]
                row["relevance"] = tweet["relevance"]

                # Additional fields
                if isinstance(tweet["created_at"], str):
                    date = tweet["created_at"]
                    ts = time.strptime(date, "%a %b %d %H:%M:%S %z %Y")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    row['date'] = ts

                # sentiment score
                row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                # entity tags
                tagged_text = tagger.get_spacy_entities(tweet["text"])
                row["entity_tags"] = get_spacey_tags(tagged_text)

                operations.append(row)

                # Flush to BigQuery in batches of 1000 rows.
                if len(operations) == 1000:
                    result = storage.insert_bigquery_data(
                        'pecten_dataset', 'tweets', operations)
                    records += 1000
                    print("Performed bulk write of {} records".format(records))
                    if not result:
                        print("Records not inserted")
                    operations = []

            # Flush the final partial batch.
            if len(operations) > 0:
                result = storage.insert_bigquery_data('pecten_dataset',
                                                      'tweets', operations)
                records += len(operations)
                if not result:
                    print("Records not inserted")
        except Exception as e:
            print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
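# Suggestion sketch (not used by the functions in this module): both
# update_from_bigquery_split and update_from_bigquery repeat the "flush every 1000
# rows" pattern. A helper along these lines could factor it out; it assumes
# storage.insert_bigquery_data behaves as in the code above (falsy return on failure).
def insert_in_batches(storage, dataset, table, rows, batch_size=1000):
    """Insert rows into BigQuery in fixed-size batches and return how many were sent."""
    inserted = 0
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        result = storage.insert_bigquery_data(dataset, table, batch)
        if not result:
            print("Records not inserted")
        inserted += len(batch)
        print("Performed bulk write of {} records".format(inserted))
    return inserted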
def get_daily_orbis_news(args): if __name__ != "__main__": from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import logging_utils as logging_utils # get constituents soap = SOAPUtils() storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = [ "LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE", "LOGGING_TABLE" ] where = lambda x: x["SOURCE"] == 'Orbis' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) for constituent_id, constituent_name, bvdid in constituents: #get last news date for the constituent query = """ SELECT max(news_id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' and news_origin = "Orbis" """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] except Exception as e: max_id = None continue start = 0 max_count = 20 print("Constituent: {},{}".format(constituent_name, bvdid)) stop = False try: token = soap.get_token(parameters["BVD_USERNAME"], parameters["BVD_PASSWORD"], 'orbis') selection_token, selection_count = soap.find_by_bvd_id( token, bvdid, 'orbis') except Exception as e: print(e) return while not stop: try: query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \ "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \ "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \ "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \ "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \ "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \ "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \ "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \ "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \ "LINE 
BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \ "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format( start, max_count) get_data_result = soap.get_data(token, selection_token, selection_count, query, 'orbis', timeout=None) except Exception as e: print(str(e)) if token: soap.close_connection(token, 'orbis') token = None break result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) try: df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"]) except Exception as e: print(e) break if df.shape[0] == 0: print("No records in df") break initial_shape = df.shape[0] df.astype({"news_id": int}, copy=False, errors='ignore') df = df.loc[df["news_id"] > max_id] if df.shape[0] == 0: print("No new data") break if df.shape[0] != initial_shape: stop = True # Make news_title column a string df.astype({"news_title": str}, copy=False, errors='ignore') # Remove duplicate columns df.drop_duplicates(["news_title"], inplace=True) # Get sentiment score df["score"] = df.apply(lambda row: tah.get_nltk_sentiment( str(row["news_article_txt"])), axis=1) # get sentiment word df["sentiment"] = df.apply( lambda row: tah.get_sentiment_word(row["score"]), axis=1) # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) df["constituent"] = old_constituent_name # add URL df["url"] = None # add show df["show"] = True # get entity tags entity_tags = [] for text in df["news_title"]: tags = tah.get_spacey_tags(tagger.get_spacy_entities( str(text))) entity_tags.append(tags) fields = [ "news_date", "news_title", "news_article_txt", "news_companies", "news_source", "news_publication", "news_topics", "news_country", "news_region", "news_language", "news_id", "score", "sentiment", "constituent_id", "constituent_name", "constituent", "url", "show" ] # Get needed fields df_bigquery = df[fields] bigquery_data = json.loads( df_bigquery.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(bigquery_data)): bigquery_data[i]["entity_tags"] = entity_tags[i] # set news_date if "news_date" in bigquery_data[i] and bigquery_data[i][ "news_date"]: date = bigquery_data[i]["news_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) bigquery_data[i]["news_date"] = ts else: bigquery_data[i]["news_date"] = None if bigquery_data[i]["news_title"]: bigquery_data[i]["news_title"] = str( bigquery_data[i]["news_title"]) if bigquery_data[i]["news_article_txt"]: bigquery_data[i]["news_article_txt"] = str( bigquery_data[i]["news_article_txt"]) if bigquery_data[i]["news_companies"]: try: bigquery_data[i]["news_companies"] = [ i.strip() for i in bigquery_data[i] ["news_companies"].split(";") ] except Exception as e: bigquery_data[i]["news_companies"] = [] else: bigquery_data[i]["news_companies"] = [] if bigquery_data[i]["news_topics"]: try: if bigquery_data[i]["news_topics"].isdigit(): bigquery_data[i]["news_topics"] = [] elif bigquery_data[i]["news_topics"] == 'None': bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [ i.strip() for i in bigquery_data[i] ["news_topics"].split(";") ] except Exception as e: bigquery_data[i]["news_topics"] = 
[] else: bigquery_data[i]["news_topics"] = [] if bigquery_data[i]["news_country"]: try: bigquery_data[i]["news_country"] = [ i.strip() for i in bigquery_data[i] ["news_country"].split(";") ] except Exception as e: bigquery_data[i]["news_country"] = [] else: bigquery_data[i]["news_country"] = [] if bigquery_data[i]["news_region"]: try: bigquery_data[i]["news_region"] = [ i.strip() for i in bigquery_data[i]["news_region"].split(";") ] except Exception as e: bigquery_data[i]["news_region"] = [] else: bigquery_data[i]["news_region"] = [] if bigquery_data[i]["news_language"]: bigquery_data[i]["news_language"] = str( bigquery_data[i]["news_language"]) if bigquery_data[i]["news_source"]: bigquery_data[i]["news_source"] = str( bigquery_data[i]["news_source"]) if bigquery_data[i]["news_publication"]: bigquery_data[i]["news_publication"] = str( bigquery_data[i]["news_publication"]) if bigquery_data[i]["news_id"]: try: bigquery_data[i]["news_id"] = int( bigquery_data[i]["news_id"]) except Exception as e: bigquery_data[i]["news_id"] = str( bigquery_data[i]["news_id"]) if bigquery_data[i]["sentiment"]: bigquery_data[i]["sentiment"] = str( bigquery_data[i]["sentiment"]) if bigquery_data[i]["constituent_id"]: bigquery_data[i]["constituent_id"] = str( bigquery_data[i]["constituent_id"]) if bigquery_data[i]["constituent_name"]: bigquery_data[i]["constituent_name"] = str( bigquery_data[i]["constituent_name"]) if bigquery_data[i]["constituent"]: bigquery_data[i]["constituent"] = str( bigquery_data[i]["constituent"]) try: storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], bigquery_data) except Exception as e: print(e) start = start + 20 print("Records saved: {}".format(df.shape[0])) log = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": df.shape[0], "source": "Orbis" }] if parameters["LOGGING"] and bigquery_data: logging_utils.logging(log, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) if token: soap.close_connection(token, 'orbis')
def get_historical_orbis_news(user, pwd, database, google_key_path, param_connection_string): #get parameters soap = SOAPUtils() storage = Storage(google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=param_connection_string, sql_table_name=table, sql_column_list=columns)[2:] to_skip = [ "BAYERISCHE MOTOREN WERKE AG", "DEUTSCHE BANK AG", "SIEMENS AG", "SAP SE", "VOLKSWAGEN AG" ] for constituent_id, constituent_name, bvdid in constituents: limit = get_number_of_news_items(constituent_name) if constituent_name in to_skip: print("Skipping") continue records = 0 start = 0 max_count = 20 filename = "bq_news_{}.json".format(constituent_id) print("Constituent: {},{}".format(constituent_name, bvdid)) failed = 0 if constituent_name == "BASF SE": start = 4280 try: token = soap.get_token(user, pwd, database) selection_token, selection_count = soap.find_by_bvd_id( token, bvdid, database) except Exception as e: print(e) return with open(filename, "a") as f: while limit > records: try: query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \ "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \ "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \ "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \ "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \ "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \ "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \ "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \ "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \ "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \ "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format( start, max_count) get_data_result = soap.get_data(token, selection_token, selection_count, query, database, timeout=None) except Exception as e: print(str(e)) continue finally: pass #if token: # soap.close_connection(token, database) result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) try: df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"]) except Exception as e: print(e) continue if df.shape[0] == 0: print("No records in df") continue #Make news_title column a string df.astype({"news_title": str}, copy=False, errors='ignore') # Remove duplicate columns df.drop_duplicates(["news_title"], inplace=True) # 
Get sentiment score df["score"] = df.apply(lambda row: tah.get_nltk_sentiment( str(row["news_article_txt"])), axis=1) #df["score"] = df.apply(lambda row: sia.polarity_scores(str(row["news_article_txt"]))['compound'] , axis=1) # get sentiment word df["sentiment"] = df.apply( lambda row: tah.get_sentiment_word(row["score"]), axis=1) # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name( constituent_id) df["constituent"] = old_constituent_name # add URL df["url"] = None # add show df["show"] = True # get entity tags entity_tags = [] for text in df["news_title"]: tags = tah.get_spacey_tags( tagger.get_spacy_entities(str(text))) entity_tags.append(tags) fields = [ "news_date", "news_title", "news_article_txt", "news_companies", "news_source", "news_publication", "news_topics", "news_country", "news_region", "news_language", "news_id", "score", "sentiment", "constituent_id", "constituent_name", "constituent", "url", "show" ] ''' # Save to MongoDB filter = ["NEWS_DATE_NewsDim", "NEWS_TITLE_NewsDim", "NEWS_ARTICLE_TXT_NewsDim", "NEWS_SOURCE_NewsDim", "NEWS_PUBLICATION_NewsDim", "categorised_tag", "score", "sentiment", "constituent_id", "constituent_name", "constituent","url", "show"] df_mongo = df[fields] df_mongo.columns = filter mongo_data = json.loads(df_mongo.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(mongo_data)): mongo_data[i]["entity_tags"] = entity_tags[i] date = mongo_data[i]["NEWS_DATE_NewsDim"] #ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") mongo_data[i]["NEWS_DATE_NewsDim"] = ts #storage.save_to_mongodb(mongo_data, "dax_gcp", "all_news") ''' #Get needed fields df_bigquery = df[fields] bigquery_data = json.loads( df_bigquery.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(bigquery_data)): bigquery_data[i]["entity_tags"] = entity_tags[i] #set news_date if "news_date" in bigquery_data[i] and bigquery_data[i][ "news_date"]: date = bigquery_data[i]["news_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) bigquery_data[i]["news_date"] = ts else: bigquery_data[i]["news_date"] = None if bigquery_data[i]["news_title"]: bigquery_data[i]["news_title"] = str( bigquery_data[i]["news_title"]) if bigquery_data[i]["news_article_txt"]: bigquery_data[i]["news_article_txt"] = str( bigquery_data[i]["news_article_txt"]) if bigquery_data[i]["news_companies"]: try: bigquery_data[i]["news_companies"] = [ i.strip() for i in bigquery_data[i] ["news_companies"].split(";") ] except Exception as e: bigquery_data[i]["news_companies"] = [] else: bigquery_data[i]["news_companies"] = [] if bigquery_data[i]["news_topics"]: try: if bigquery_data[i]["news_topics"].isdigit(): bigquery_data[i]["news_topics"] = [] elif bigquery_data[i]["news_topics"] == 'None': bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [ i.strip() for i in bigquery_data[i] ["news_topics"].split(";") ] except Exception as e: bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [] if bigquery_data[i]["news_country"]: try: bigquery_data[i]["news_country"] = [ i.strip() for i in bigquery_data[i] ["news_country"].split(";") ] except Exception as e: bigquery_data[i]["news_country"] = [] else: bigquery_data[i]["news_country"] = [] if bigquery_data[i]["news_region"]: try: 
bigquery_data[i]["news_region"] = [ i.strip() for i in bigquery_data[i] ["news_region"].split(";") ] except Exception as e: bigquery_data[i]["news_region"] = [] else: bigquery_data[i]["news_region"] = [] if bigquery_data[i]["news_language"]: bigquery_data[i]["news_language"] = str( bigquery_data[i]["news_language"]) if bigquery_data[i]["news_source"]: bigquery_data[i]["news_source"] = str( bigquery_data[i]["news_source"]) if bigquery_data[i]["news_publication"]: bigquery_data[i]["news_publication"] = str( bigquery_data[i]["news_publication"]) if bigquery_data[i]["news_id"]: try: bigquery_data[i]["news_id"] = int( bigquery_data[i]["news_id"]) except Exception as e: bigquery_data[i]["news_id"] = str( bigquery_data[i]["news_id"]) if bigquery_data[i]["sentiment"]: bigquery_data[i]["sentiment"] = str( bigquery_data[i]["sentiment"]) if bigquery_data[i]["constituent_id"]: bigquery_data[i]["constituent_id"] = str( bigquery_data[i]["constituent_id"]) if bigquery_data[i]["constituent_name"]: bigquery_data[i]["constituent_name"] = str( bigquery_data[i]["constituent_name"]) if bigquery_data[i]["constituent"]: bigquery_data[i]["constituent"] = str( bigquery_data[i]["constituent"]) f.write( json.dumps(bigquery_data[i], cls=MongoEncoder) + '\n') # storage.insert_bigquery_data("pecten_dataset", "news", bigquery_data) start = start + 20 #end = start + 10 records += 20 print("Records saved: {}".format(records)) if token: soap.close_connection(token, database)
def get_rss_feed(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage

    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'RSS'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    for constituent_id, constituent_name, symbol in all_constituents:
        # get last news date for the constituent
        query = """
        SELECT max(news_date) as max_date FROM `{}.{}`
        WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS"
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query,
                                                      iterator_flag=False)
            max_date = result[0]["max_date"]
        except Exception as e:
            max_date = None
            continue

        print("Getting RSS Feed for {}".format(constituent_name))
        d = feedparser.parse(
            'http://finance.yahoo.com/rss/headline?s={}'.format(symbol + ".DE"))

        to_insert = []

        for post in d.entries:
            #print(post)
            date = datetime(post.published_parsed[0], post.published_parsed[1],
                            post.published_parsed[2], post.published_parsed[3],
                            post.published_parsed[4])

            # Skip items already stored in BigQuery.
            if max_date:
                max_date = str(max_date)[:19]
                max_date = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S')
                if date < max_date:
                    continue

            doc = {
                "url": post.link,
                "news_date": date.strftime('%Y-%m-%d %H:%M:%S'),
                "news_language": post.summary_detail["language"],
                "news_title": post.title,
                "news_article_txt": post.summary,
                "news_origin": "Yahoo Finance RSS",
                "show": True
            }

            # Get sentiment score
            doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"]))

            # get sentiment word
            doc["sentiment"] = tah.get_sentiment_word(doc["score"])

            # add constituent name, id and old name
            doc["constituent_id"] = constituent_id
            doc["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            doc["constituent"] = old_constituent_name

            to_insert.append(doc)

        try:
            print("Inserting into BQ")
            storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                parameters["DESTINATION_TABLE"],
                                                to_insert)
        except Exception as e:
            print(e)

        log = [{
            "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
            "constituent_name": constituent_name,
            "constituent_id": constituent_id,
            "downloaded_news": len(to_insert),
            "source": "Yahoo Finance RSS"
        }]

        if parameters["LOGGING"] and to_insert:
            logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                  parameters["LOGGING_TABLE"], storage_client)
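# Note on the date handling above (illustration only): feedparser exposes
# post.published_parsed as a time.struct_time, so the same datetime can be built from
# its leading fields in one call. A minimal sketch, assuming the same feed entries:
def _example_rss_entry_date(post):
    from datetime import datetime
    # Equivalent to datetime(year, month, day, hour, minute) used above,
    # but also keeps the seconds field.
    return datetime(*post.published_parsed[:6])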
def get_twitter_analytics_top_orgs_bq(args, from_date, to_date): param_table = "PARAM_TWITTER_COLLECTION" parameters_list = ["CONNECTION_STRING"] parameters = get_parameters(args.param_connection_string, param_table, parameters_list) storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"]) client = MongoClient(parameters["CONNECTION_STRING"]) db = client["dax_gcp"] collection = db["tweets"] twitter_analytics_top_orgs = db["twitter_analytics_top_orgs"] for constituent_id, constituent_name in all_constituents: print(constituent_name) pipeline = [{ "$match": { "constituent_id": constituent_id, "date": { "$gte": from_date, "$lte": to_date } } }, { "$project": { "constituent_id": "$constituent_id", "constituent_name": "$constituent_name", "org_tags": "$entity_tags.ORG", "constituent": "$constituent" } }, { "$unwind": "$org_tags" }, { "$group": { "_id": { "constituent_id": "$constituent_id", "org_tags": "$org_tags", "constituent_name": "$constituent_name" }, "count": { "$sum": 1 } } }, { "$sort": SON([("count", -1)]) }, { "$project": { "_id": 0, "constituent_id": "$_id.constituent_id", "constituent_name": "$_id.constituent_name", "org_tags": "$_id.org_tags", "count": "$count", "from_date": from_date, "to_date": to_date, "date": datetime.now() } }] results = list(collection.aggregate(pipeline)) to_return = [] for r in results: if r["constituent_name"].lower() in r["org_tags"].lower() or \ r["org_tags"].lower() in r["constituent_name"].lower() or \ " AG".lower() in r["org_tags"].lower() or \ len(r["org_tags"].split(" ")) > 3 or \ r["count"] < 10 or \ r["org_tags"].lower() == "Trends".lower() or \ r["org_tags"].lower() == "Share".lower(): continue else: to_return.append(r) if to_return: twitter_analytics_top_orgs.insert_many(to_return) else: if results: twitter_analytics_top_orgs.insert_many(results)
def get_twitter_analytics_latest_price_tweets(args, from_date, to_date): param_table = "PARAM_TWITTER_COLLECTION" parameters_list = ["CONNECTION_STRING"] parameters = get_parameters(args.param_connection_string, param_table, parameters_list) storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"]) client = MongoClient(parameters["CONNECTION_STRING"]) db = client["dax_gcp"] collection = db["tweets"] twitter_analytics_latest_price_tweets = db[ "twitter_analytics_latest_price_tweets"] for constituent_id, constituent_name, symbol in all_constituents: symbol = '$' + symbol print(symbol) to_return = [] final = [] results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "entities.symbols": { "$exists": True, "$ne": [] } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).sort([("date", -1)])) if results: print("A") for item in results: if symbol in item["text"].split(" "): if item["entity_tags"]["MONEY"]: text = item["text"].replace("$", "") text = text.replace("€", "") text = text.replace("EUR", " ") tokens = text.split(" ") contains_number = False for t in tokens: try: float(t) contains_number = True except Exception as e: pass if contains_number: clean_tokens = [ t for t in item["text"].split(" ") if "http" not in t ] item["text"] = " ".join(clean_tokens) item["tweet_date"] = item["date"] item["date"] = datetime.now() item["from_date"] = from_date item["to_date"] = to_date to_return.append(item) if to_return: df = pd.DataFrame(to_return) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = datetime.strptime( f["from_date"], "%Y-%m-%dT%H:%M:%S.%fZ") f["to_date"] = datetime.strptime(f["to_date"], "%Y-%m-%dT%H:%M:%S.%fZ") else: print("A2") results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "user.followers_count": { "$gte": 200 } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("B2") final = [{ 'constituent': constituent_name, 'constituent_id': constituent_id, 'constituent_name': constituent_name, 'date': datetime.now(), 'entity_tags': { 'MONEY': [''] }, 'from_date': from_date, 'sentiment_score': 0.0, 'text': 'No new tweets.', 'to_date': to_date, 'tweet_date': None }] else: print("B") results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "user.followers_count": { "$gte": 200 } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = 
json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("C") results = list( collection.find({ "constituent_id": constituent_id }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("D") final = [{ 'constituent': constituent_name, 'constituent_id': constituent_id, 'constituent_name': constituent_name, 'date': datetime.now(), 'entity_tags': { 'MONEY': [''] }, 'from_date': from_date, 'sentiment_score': 0.0, 'text': 'No new tweets.', 'to_date': to_date, 'tweet_date': None }] if final: twitter_analytics_latest_price_tweets.insert_many(final) time.sleep(1)
def extract_ticker_data(args): if __name__ != "__main__": from utils import logging_utils as logging_utils from utils import twitter_analytics_helpers as tah from utils.Storage import Storage #ua = UserAgent() base_url = "http://charts.finanzen.net/ChartData.ashx?request=" #Get parameters param_table = "PARAM_TICKER_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list) #Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents storage = Storage(args.google_key_path) columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"] table = "PARAM_TICKER_KEYS" all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) i = 0 for constituent_id, constituent_name, url_key in all_constituents: if i == 10 or i == 29: time.sleep(900) # get last deal query = """ SELECT max(datetime) as max_date FROM `{}.{}` WHERE constituent_id = '{}'; """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) except Exception as e: print(e) return max_date = result[0]["max_date"] print("Getting ticker data for {}".format(constituent_name)) url = base_url + url_key + '6000' list_of_ticks = [] try: response = requests.get(url) if response.status_code == requests.codes.ok: data = response.text.splitlines() for item in data[1:-2]: fields = item.split(';') if max_date and max_date.date() == datetime.strptime( fields[0], "%Y-%m-%d-%H-%M-%S-%f").date(): print("Last date in ticker_data is same date as today") break document = { 'constituent_name': constituent_name, 'constituent_id': constituent_id, 'date': fields[0][0:10], 'time': fields[0][11:19].replace('-', ':'), 'datetime': datetime.strptime(fields[0], "%Y-%m-%d-%H-%M-%S-%f").strftime( '%Y-%m-%d %H:%M:%S'), 'open': float(fields[1].replace(',', '')), 'high': float(fields[2].replace(',', '')), 'low': float(fields[3].replace(',', '')), 'close': float(fields[4].replace(',', '')), 'volume': float(fields[5].replace(',', '')) } list_of_ticks.append(document) if list_of_ticks: try: print("Inserting into BQ") storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], list_of_ticks) # pprint(list_of_ticks) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime( '%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_ticks": len(list_of_ticks) }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) i += 1 else: print("Error making request, code {}. 
Retrying...".format( response.status_code)) time.sleep(20) response = requests.get(url) if response.status_code == requests.codes.ok: data = response.text.splitlines() for item in data[1:-2]: fields = item.split(';') document = { 'constituent_name': constituent_name, 'constituent_id': constituent_id, 'date': fields[0][0:10], 'time': fields[0][11:19].replace('-', ':'), 'datetime': datetime.strptime(fields[0], "%Y-%m-%d-%H-%M-%S-%f").strftime( '%Y-%m-%d %H:%M:%S'), 'open': float(fields[1].replace(',', '')), 'high': float(fields[2].replace(',', '')), 'low': float(fields[3].replace(',', '')), 'close': float(fields[4].replace(',', '')), 'volume': float(fields[5].replace(',', '')) } list_of_ticks.append(document) if list_of_ticks: try: print("Inserting into BQ") storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], list_of_ticks) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime( '%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_ticks": len(list_of_ticks) }] logging_utils.logging( doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) i += 1 else: return "Error making request, code {}. Skipping.".format( response.status_code) except Exception as ex: return str(ex) finally: time.sleep(5)
def update_from_bigquery(args):
    # load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    start_time = time.time()
    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))
        while True:
            try:
                # get max id inserted from target table
                q = "SELECT max(id) as max_id " \
                    "FROM `pecten_dataset.tweets` " \
                    "WHERE constituent_id = '{}'".format(constituent_id)

                max_id = storage.get_bigquery_data(q, iterator_flag=False)[0]["max_id"]

                if not max_id:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE constituent_id = '{}' " \
                            "ORDER BY id ASC".format(constituent_id)
                else:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE id > {} AND constituent_id = '{}' " \
                            "ORDER BY id ASC".format(max_id, constituent_id)

                tweets = storage.get_bigquery_data(query)

                if tweets.total_rows == 0:
                    print("Finished for {}".format(constituent_id))
                    break

                operations = []
                records = 0
                for tweet in tweets:
                    row = {}
                    row["text"] = tweet["text"]
                    row["id"] = tweet["id"]
                    row["favorite_count"] = tweet["favorite_count"]
                    row["source"] = tweet["source"]
                    row["retweeted"] = tweet["retweeted"]
                    row["entities"] = tweet["entities"]
                    row["id_str"] = tweet["id_str"]
                    row["retweet_count"] = tweet["retweet_count"]
                    row["favorited"] = tweet["favorited"]
                    row["user"] = tweet["user"]
                    row["lang"] = tweet["lang"]
                    row["created_at"] = tweet["created_at"]
                    row["place"] = tweet["place"]
                    row["constituent_name"] = tweet["constituent_name"]
                    row["constituent_id"] = tweet["constituent_id"]
                    row["search_term"] = tweet["search_term"]
                    row["relevance"] = tweet["relevance"]

                    # Additional fields
                    if isinstance(tweet["created_at"], str):
                        row['date'] = convert_timestamp(tweet["created_at"])

                    # sentiment score
                    row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                    # entity tags
                    tagged_text = tagger.get_spacy_entities(tweet["text"])
                    row["entity_tags"] = get_spacey_tags(tagged_text)

                    operations.append(row)

                    # Flush to BigQuery in batches of 1000 rows.
                    if len(operations) == 1000:
                        result = storage.insert_bigquery_data(
                            'pecten_dataset', 'tweets', operations)
                        records += 1000
                        print("Performed bulk write of {} records".format(records))
                        if not result:
                            print("Records not inserted")
                        operations = []

                # Flush the final partial batch.
                if len(operations) > 0:
                    result = storage.insert_bigquery_data(
                        'pecten_dataset', 'tweets', operations)
                    records += len(operations)
                    if not result:
                        print("Records not inserted")
            except Exception as e:
                print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
def get_stocktwits(args):
    from utils import logging_utils as logging_utils
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU

    # get constituents
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]
    table = "PARAM_STOCKTWITS_KEYS"
    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_STOCKTWITS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, url_key in all_constituents:
        # Get the last id
        query = """
        SELECT max(id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}' and source = 'StockTwits'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        print("Getting stocktwits for {}:{}".format(constituent_name, url_key))
        response = requests.get(
            "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format(url_key))

        to_insert = []

        if response.status_code == 200:
            for item in response.json()["messages"]:
                # Skip messages already stored in BigQuery.
                if max_id:
                    if int(item['id']) < max_id:
                        continue

                doc = tah.create_tweet_skelleton()
                doc['text'] = item['body']
                doc['created_at'] = item['created_at']
                doc['date'] = item['created_at']
                doc['id'] = item['id']
                doc['id_str'] = str(item['id'])

                if "symbols" in item:
                    for symbol in item['symbols']:
                        doc['entities']['symbols'].append({
                            'indices': [],
                            'text': symbol['symbol']
                        })

                if 'entities' in item:
                    if 'sentiment' in item['entities'] and item['entities']['sentiment']:
                        if 'basic' in item['entities']['sentiment']:
                            doc['user']['description'] = item['entities']['sentiment']['basic']

                doc['source'] = 'StockTwits'
                doc['constituent_name'] = constituent_name
                doc['constituent_id'] = constituent_id
                doc['search_term'] = ''
                doc["constituent"] = tah.get_old_constituent_name(constituent_id)
                doc['relevance'] = 1
                doc["sentiment_score"] = tah.get_nltk_sentiment(str(doc["text"]))
                tagged_text = tagger.get_spacy_entities(str(doc["text"]))
                doc["entity_tags"] = tah.get_spacey_tags(tagged_text)

                to_insert.append(doc)

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             to_insert)
            except Exception as e:
                print(e)

            if parameters["LOGGING"]:
                doc = [{
                    "date": time.strftime('%Y-%m-%d %H:%M:%S',
                                          datetime.now().date().timetuple()),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_tweets": len(to_insert),
                    "language": 'StockTwits'
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)
        else:
            print(response.text)
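# Suggestion sketch (not wired into get_stocktwits): the StockTwits request above is
# made once with no retry. A small helper with a fixed back-off could make the pull
# more robust; requests and time are the same libraries already used above, and the
# retry counts here are arbitrary placeholders.
def _get_with_retry(url, retries=3, wait_seconds=20):
    """Return the first successful response, or the last response after all retries."""
    response = requests.get(url)
    for _ in range(retries):
        if response.status_code == 200:
            break
        time.sleep(wait_seconds)
        response = requests.get(url)
    return response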
def get_bloomberg_news(args): from utils.Storage import Storage from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU from utils import logging_utils as logging_utils tagger = TU() storage_client = Storage(google_key_path=args.google_key_path) #Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'Bloomberg' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) #get constituents all_constituents = storage_client.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="PARAM_NEWS_BLOOMBERG_KEYS", sql_column_list=[ "CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "PAGES" ]) # Define random user agent object #ua = UserAgent() # Base url for search queries base = 'https://bloomberg.com/search?query={}&sort=time:desc&page={}' # Iterate over companies for constituent_id, constituent_name, url_key, pages in all_constituents: # Get date of latest news article for this constituent for Bloomberg query = """ SELECT max(news_date) as last_date FROM `{}.{}` WHERE constituent_id = '{}' AND news_origin = 'Bloomberg' """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage_client.get_bigquery_data(query=query, iterator_flag=False) last_date_bq = result[0]["last_date"] except Exception as e: last_date_bq = None print('Scraping for {} up to {}.'.format(constituent_name, str(last_date_bq))) # iterate through specified page range and save articles to database in_target = True i = 1 while in_target: print('Scraping for page', i) # format url with company name and current page number and create bs4 object from the response url = base.format(url_key.replace('.', '_').replace(' ', '_'), i) i += 1 main_soup = BeautifulSoup(requests.get(url).content, 'html.parser') # check if there are any results on the page and finish the loop if none try: if 'No results for' in main_soup.find( 'div', { 'class': 'content-no-results' }).text: print('No more articles...') break except AttributeError: pass # iterate through article elelements on the page results = main_soup.findAll('div', {'class': 'search-result'}) times = [ x.find('time', { 'class': 'published-at' }).text.strip() for x in results ] times = [datetime.strptime(x, '%b %d, %Y') for x in times] print("Looping times") for time in times: if last_date_bq: if time < datetime.strptime( last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): print('Target date reached') in_target = False to_insert = [] print("Looping articles...") for art in results: art_type = art.find('article')['class'] # check if the result is an article if 'type-article' in art_type: url = art.find('h1').find('a')['href'] d = get_article(url, None) if d is None: continue if last_date_bq: if datetime.strptime(d["news_date"], "%Y-%m-%d %H:%M:%S") < \ datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): continue #set extra fields: #score if d["news_article_txt"]: d['score'] = tah.get_nltk_sentiment( str(d["news_article_txt"])) # sentiment d['sentiment'] = tah.get_sentiment_word(d["score"]) # constituent fields d["constituent_id"] = constituent_id 
d["constituent_name"] = constituent_name d["constituent"] = tah.get_old_constituent_name( constituent_id) #url d["url"] = url #show d["show"] = True # entity_tags d["entity_tags"] = tah.get_spacey_tags( tagger.get_spacy_entities(str(d["news_title"]))) to_insert.append(d) if to_insert: print("Inserting records to BQ") try: storage_client.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], to_insert) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": len(to_insert), "source": "Bloomberg" }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage_client) else: break
def get_daily_zephyr_ma_deals(args): from utils import logging_utils as logging_utils from utils import twitter_analytics_helpers as tah from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils zephyr_query = """ DEFINE P1 AS [Parameters.Currency=SESSION;], P2 AS [Parameters.RepeatingDimension=NrOfTargets], P3 AS [Parameters.RepeatingDimension=NrOfBidders], P5 AS [Parameters.RepeatingDimension=NrOfVendors]; SELECT LINE RECORD_ID as record_id, LINE DEAL_OVERVIEW.DEAL_HEADLINE AS deal_headline, LINE DEAL_OVERVIEW.TGNAME USING P2 AS target, LINE DEAL_OVERVIEW.BDNAME USING P3 AS acquiror, LINE DEAL_OVERVIEW.VDNAME USING P5 AS vendor, LINE DEAL_OVERVIEW.DEALTYPE AS deal_type, LINE DEAL_OVERVIEW.DEAL_STATUS AS deal_status, LINE DEAL_STRUCTURE_AND_DATES.COMPLETION_DATE AS completion_date, LINE DEAL_STRUCTURE_AND_DATES.RUMOUR_DATE AS rumour_date, LINE DEAL_STRUCTURE_AND_DATES.ANNOUNCE_DATE AS announced_date, LINE DEAL_STRUCTURE_AND_DATES.EXPECTED_COMPLETION_DATE AS expected_completion_date, LINE DEAL_STRUCTURE_AND_DATES.ASSUMED_COMPLETION_DATE AS assumed_completion_date, LINE DEAL_STRUCTURE_AND_DATES.POSTPONED_DATE AS postponed_date, LINE DEAL_STRUCTURE_AND_DATES.WITHDRAWN_DATE AS withdrawn_date, LINE DEAL_OVERVIEW.ALLDLVALUE USING P1 AS deal_value FROM RemoteAccess.U ORDERBY 1 DESCENDING """ # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ['BVD_USERNAME', 'BVD_PASSWORD', "LOGGING","DESTINATION_TABLE","LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'Zephyr' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents soap = SOAPUtils() storage = Storage(args.google_key_path) columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "STRATEGY"] table = "PARAM_NEWS_ZEPHYR_STRATEGIES" constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) fields = ["record_id", "deal_headline", "target", "acquiror", "vendor", "deal_type", "deal_status", "completion_date", "deal_value", "rumour_date", "announced_date", "expected_completion_date", "assumed_completion_date", "postponed_date", "withdrawn_date"] for constituent_id, constituent_name, strategy in constituents: # get last deal query = """ SELECT max(record_id) as max_id FROM `{}.{}` WHERE constituent_id = '{}'; """.format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] print("Getting M&A deals for {}".format(constituent_name)) token = soap.get_token(parameters['BVD_USERNAME'], parameters['BVD_PASSWORD'], 'zephyr') selection_token, selection_count = soap.find_with_strategy(token, strategy, "zephyr") get_data_result = soap.get_data(token, selection_token, selection_count, zephyr_query, 'zephyr', timeout=None, number_of_records=100) result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) df = pd.read_csv(TESTDATA, sep=",", parse_dates=["completion_date", "rumour_date", "announced_date", "expected_completion_date", "assumed_completion_date", "postponed_date", "withdrawn_date"]) 
df.astype({"record_id": int}, copy=False, errors='ignore') print("Retrieved {} items".format(df.shape[0])) df = df.loc[df["record_id"] > max_id] print("New records {} items".format(df.shape[0])) if df.shape[0] == 0: if token: soap.close_connection(token, 'zephyr') continue df = df[fields] # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) df["constituent"] = old_constituent_name data = json.loads(df.to_json(orient="records", date_format="iso")) print(len(data)) for item in data: if "completion_date" in item and item["completion_date"]: date = item["completion_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["completion_date"] = ts if "rumour_date" in item and item["rumour_date"]: date = item["rumour_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["rumour_date"] = ts if "announced_date" in item and item["announced_date"]: date = item["announced_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["announced_date"] = ts if "expected_completion_dat" in item and item["expected_completion_dat"]: date = item["expected_completion_dat"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["expected_completion_date"] = ts if "assumed_completion_date" in item and item["assumed_completion_date"]: date = item["assumed_completion_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["assumed_completion_datee"] = ts if "postponed_date" in item and item["postponed_date"]: date = item["postponed_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["postponed_date"] = ts if "withdrawn_date" in item and item["withdrawn_date"]: date = item["withdrawn_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["withdrawn_date"] = ts if data: print("Inserting records to BQ") try: #open("ma_deals.json", 'w').write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data)) storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], data) except Exception as e: print(e) soap.close_connection(token, 'zephyr') if parameters["LOGGING"]: doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_deals": len(data), "source": "Zephyr"}] logging_utils.logging(doc,common_parameters["BQ_DATASET"],parameters["LOGGING_TABLE"],storage) except Exception as e: print(e) if token: soap.close_connection(token, 'zephyr') continue if token: soap.close_connection(token, 'zephyr')
def get_news_analytics_topic_articles(from_date, to_date):
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table, parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    all_news_landing = db["all_news_landing"]
    news_analytics_topic_sentiment = db["news_analytics_topic_sentiment"]
    news_analytics_topic_articles = db["news_analytics_topic_articles"]

    for constituent_id, constituent_name in all_constituents:
        print(constituent_name)

        # Get the topics for that constituent from news_analytics_topic_sentiment
        topics = list(news_analytics_topic_sentiment.find(
            {"constituent_id": constituent_id},
            {"categorised_tag": 1, "_id": 0}))

        if len(topics) > 5:
            topics = topics[:5]

        #pprint(topics)

        # Get the latest (5) articles per topic
        for t in topics:
            pipeline = [
                {"$match": {"constituent_id": constituent_id,
                            "NEWS_DATE_NewsDim": {"$gte": from_date, "$lte": to_date},
                            "show": True,
                            "categorised_tag": {"$nin": ["NA"]}}},
                {"$unwind": "$categorised_tag"},
                {"$match": {"categorised_tag": t['categorised_tag']}},
                {"$project": {"_id": 0,
                              "from_date": from_date,
                              "to_date": to_date,
                              "date": datetime.now(),
                              "sentiment": 1,
                              "score": 1,
                              "NEWS_ARTICLE_TXT_NewsDim": 1,
                              "categorised_tag": 1,
                              "NEWS_DATE_NewsDim": 1,
                              "constituent_name": 1,
                              "NEWS_TITLE_NewsDim": 1,
                              "NEWS_SOURCE_NewsDim": 1,
                              "constituent_id": 1,
                              "constituent": 1}},
                {"$sort": SON([("NEWS_DATE_NewsDim", -1)])},
                {"$limit": 5}
            ]

            result = list(all_news_landing.aggregate(pipeline))

            # Keep only English articles that carry a usable topic tag
            to_return = [a for a in result
                         if detect(a["NEWS_TITLE_NewsDim"]) == 'en'
                         and a["categorised_tag"] and a["categorised_tag"] != 'None']

            # Save result in a table
            if to_return:
                news_analytics_topic_articles.insert_many(to_return)

        time.sleep(3)
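
# detect() in the filter above appears to come from the langdetect package, whose
# detect() raises LangDetectException when a title has no detectable features (for
# instance an empty or punctuation-only string), which would abort the constituent
# loop. A minimal sketch of a guarded wrapper under that assumption; safe_detect is
# a hypothetical name, not part of the original code.
def safe_detect(text, default=""):
    """Return the detected language code, or `default` when detection fails."""
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException
    try:
        return detect(text)
    except LangDetectException:
        return default

# Usage inside the comprehension above: safe_detect(a["NEWS_TITLE_NewsDim"]) == 'en'
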
def get_reuters_news(args, driver):
    from utils import logging_utils as logging_utils
    from utils.TaggingUtils import TaggingUtils as TU
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Reuters'
    parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where)

    # Get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_REUTERS_KEYS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"])

    for constituent_id, constituent_name, url_key in all_constituents:
        to_insert = []

        # Get date of latest news article for this constituent from Reuters
        query = """
        SELECT max(news_date) as last_date
        FROM `{}.{}`
        WHERE constituent_id = '{}'
        AND news_origin = 'Reuters'
        """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id)

        try:
            try:
                result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
                last_date_bq = result[0]["last_date"]
            except Exception as e:
                last_date_bq = None

            company_name = url_key
            print('Scraping articles for {}'.format(company_name))

            query_url = 'https://reuters.com/search/news?blob={}&sortBy=date&dateRange=all'.format(company_name)

            # Find last article with date specified
            print(query_url)
            driver.get(query_url)
            sleep(2)

            skip = False
            if last_date_bq:
                try:
                    last_date = driver.find_elements_by_xpath(
                        '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[3]/div/div/h5')[0].get_attribute('innerHTML')
                except:
                    last_date = BeautifulSoup(driver.page_source).findAll(
                        'h5', {'class': 'search-result-timestamp'})[0].text.strip()

                # Normalise the timestamp: drop the trailing timezone token, shorten the
                # month to three letters and strip commas before parsing
                last_date = ' '.join(last_date.split(' ')[:-1])
                month = [last_date.split(' ')[0][:3]]
                last_date = ' '.join(month + last_date.split(' ')[1:]).strip().replace(',', '')
                last_date = datetime.strptime(last_date, '%b %d %Y %I:%M%p')

                if last_date <= datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                    print("No newer articles")
                    skip = True

            if skip:
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            search = soup.find('div', {'class': 'search-result-list news-search'}).findAll(
                'div', {'class': 'search-result-indiv'})

            print("Getting articles")
            for article in search:
                title_elem = article.find('h3', {'class': 'search-result-title'})
                title = title_elem.text.strip()
                url = "https://reuters.com" + title_elem.find('a')['href']

                date_published = article.find('h5', {'class': 'search-result-timestamp'}).text
                date_published = ' '.join(date_published.split(' ')[:-1])
                month = [date_published.split(' ')[0][:3]]
                date_published = ' '.join(month + date_published.split(' ')[1:]).strip().replace(',', '')
                date_published_dtype = datetime.strptime(date_published, '%b %d %Y %I:%M%p')
                date_published = date_published_dtype.strftime("%Y-%m-%d %H:%M:%S")

                if last_date_bq:
                    if date_published_dtype < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                        print("Skipping article")
                        continue

                try:
                    soup = BeautifulSoup(
                        requests.get(url, headers={'User-Agent': random.choice(AGENTS)}).content,
                        'html.parser')
                    article_body = soup.find('div', {'class': 'ArticleBody_body_2ECha'}).text.strip()
                    source = soup.find('p', {'class': 'Attribution_content_27_rw'}).text.strip()
                except AttributeError:
                    source = None
                    continue

                d = {'news_title': title,
                     'news_date': date_published,
                     'news_source': source,
                     'news_origin': "Reuters",
                     "news_article_txt": article_body}

                # Set extra fields:
                # score
                if d["news_article_txt"]:
                    d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))
                    # sentiment
                    d['sentiment'] = tah.get_sentiment_word(d["score"])

                # constituent fields
                d["constituent_id"] = constituent_id
                d["constituent_name"] = constituent_name
                d["constituent"] = tah.get_old_constituent_name(constituent_id)

                # url
                d["url"] = url

                # show
                d["show"] = True

                # entity_tags
                d["entity_tags"] = tah.get_spacey_tags(tagger.get_spacy_entities(str(d["news_title"])))

                to_insert.append(d)
                print('Article scraped...')
                #break

            # Save data to BQ
            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                        parameters["DESTINATION_TABLE"], to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_news": len(to_insert),
                            "source": "Reuters"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage_client)
        except Exception as e:
            print(e)
            continue
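
# The Reuters timestamp handling above (drop the trailing timezone token, shorten the
# month to three letters, strip commas, then parse with '%b %d %Y %I:%M%p') is repeated
# for the "last article" check and for every search result. A minimal sketch of it as a
# single helper; the function name and the sample string in the docstring are
# illustrative assumptions, not taken from the original code.
def parse_reuters_timestamp(raw):
    """E.g. 'September 12, 2018 10:30am EDT' -> datetime(2018, 9, 12, 10, 30)."""
    from datetime import datetime  # local import keeps the sketch self-contained
    raw = ' '.join(raw.split(' ')[:-1])              # drop the timezone token
    month = [raw.split(' ')[0][:3]]                  # 'September' -> 'Sep'
    raw = ' '.join(month + raw.split(' ')[1:]).strip().replace(',', '')
    return datetime.strptime(raw, '%b %d %Y %I:%M%p')
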