def get_twitter_sentiment_popularity(from_date, to_date, google_key_path):
    print("twitter_sentiment_popularity")
    columns = ["count", "constituent", "avg_sentiment_all", "constituent_name", "constituent_id", "date"]

    query = """
    SELECT constituent, avg(sentiment_score) as avg_sentiment_all, constituent_name, constituent_id, date,
           count(text) as count
    FROM `igenie-project.pecten_dataset.tweets`
    WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    GROUP BY constituent, constituent_name, constituent_id, date
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_sentiment_popularity_copy', to_insert)
    except Exception as e:
        print(e)
def get_news_all(from_date, to_date, google_key_path):
    print("news_all")
    columns = ["NEWS_DATE_NewsDim", "score", "NEWS_PUBLICATION_NewsDim", "categorised_tag", "constituent_id",
               "NEWS_ARTICLE_TXT_NewsDim", "sentiment", "news_Title_NewsDim", "entity_tags",
               "entity_tags.FACILITY", "entity_tags.QUANTITY", "entity_tags.EVENT", "entity_tags.PERSON",
               "entity_tags.DATE", "entity_tags.TIME", "entity_tags.CARDINAL", "entity_tags.PRODUCT", "count"]

    query = """
    SELECT news_date as NEWS_DATE_NewsDim, score, news_publication as NEWS_PUBLICATION_NewsDim,
           news_topics as categorised_tag, constituent_id, news_article_txt as NEWS_ARTICLE_TXT_NewsDim,
           sentiment, news_title as news_Title_NewsDim, entity_tags, entity_tags.FACILITY,
           entity_tags.QUANTITY, entity_tags.EVENT, entity_tags.PERSON, entity_tags.DATE, entity_tags.TIME,
           entity_tags.CARDINAL, entity_tags.PRODUCT, entity_tags.LOC, entity_tags.WORK_OF_ART,
           entity_tags.LAW, entity_tags.GPE, entity_tags.PERCENT, entity_tags.FAC, entity_tags.ORDINAL,
           entity_tags.ORG, entity_tags.NORP, entity_tags.LANGUAGE, entity_tags.MONEY, constituent_name,
           count(constituent_name) as count, url, news_language, news_id, news_country, news_companies,
           news_region, constituent
    FROM `pecten_dataset.all_news_bkp`
    WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'all_news', to_insert)
    except Exception as e:
        print(e)
def other_tables(args):
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]
    parameters = get_parameters(args.param_connection_string, param_table, parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    mongo_connection_string = parameters["CONNECTION_STRING"]
    client = MongoClient(mongo_connection_string)
    db = client["dax_gcp"]

    for collection_name in args.mongo_collections.split(","):
        collection = db[collection_name]
        cursor = collection.find({}, {"_id": 0})
        data = list(cursor)

        file_name = "{}.json".format(collection_name)
        open(file_name, 'w').write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data))
        cloud_file_name = "{}/{}".format(args.bucket, file_name)

        if os.path.isfile(file_name):
            if storage.upload_to_cloud_storage(args.google_key_path, args.bucket, file_name, file_name):
                print("File uploaded to Cloud Storage")
                os.remove(file_name)
            else:
                print("File not uploaded to Cloud storage.")
        else:
            print("File does not exist in the local filesystem.")
def get_news_analytics_topic_articles_bq(from_date, to_date, google_key_path):
    print("news_analytics_topic_articles")
    # Column list mirrors the SELECT list below (the original left this list empty).
    columns = ["constituent_name", "constituent_id", "sentiment", "News_Date_NewsDim", "Constituent",
               "News_source_News_Dim", "Score", "Categorised_tag", "news_Title_NewsDim", "Date",
               "NEWS_ARTICLE_TXT_NewsDim"]

    query = """
    SELECT constituent_name, constituent_id, sentiment, News_Date_NewsDim, Constituent, News_source_News_Dim,
           Score, news_topic as Categorised_tag, news_Title_NewsDim, Date, NEWS_ARTICLE_TXT_NewsDim
    FROM [igenie-project:pecten_dataset.news]
    where date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    """.format(from_date, to_date)

    #UPDATE twitter_analytics_latest_price_tweets SET From_date = TIMESTAMP() where date between TIMESTAMP() and TIMESTAMP()
    #UPDATE twitter_analytics_latest_price_tweets SET To_date = TIMESTAMP() where date between TIMESTAMP() and TIMESTAMP()

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', '', to_insert)
    except Exception as e:
        print(e)
def get_search_string(constituent_id, connection_string, table_keywords, table_exclusions):
    if __name__ != "__main__":
        from utils.TwitterDownloader import TwitterDownloader
        from utils.Storage import Storage
        from utils import twitter_analytics_helpers as tap
        from utils.TaggingUtils import TaggingUtils as TU

    storage = Storage()
    where = lambda x: and_((x["ACTIVE_STATE"] == 1), (x["CONSTITUENT_ID"] == constituent_id))

    keywords = storage.get_sql_data(sql_connection_string=connection_string,
                                    sql_table_name=table_keywords,
                                    sql_column_list=["KEYWORD"],
                                    sql_where=where)
    keywords_list = ['"' + key[0] + '"' for key in keywords]
    keywords_string = " OR ".join(keywords_list)

    exclusions = storage.get_sql_data(sql_connection_string=connection_string,
                                      sql_table_name=table_exclusions,
                                      sql_column_list=["EXCLUSIONS"],
                                      sql_where=where)
    exclusions_list = ["-" + key[0] for key in exclusions]
    exclusions_string = " ".join(exclusions_list)

    all_words = keywords_string + " " + exclusions_string

    return all_words
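# Hedged illustration (not part of the original code): given keywords ["BMW", "BMW AG"] and
# exclusions ["spam"], get_search_string() is expected to produce a Twitter search query of the
# form '"BMW" OR "BMW AG" -spam'. A self-contained sketch of the same string construction:
def build_search_string_example(keywords, exclusions):
    keywords_string = " OR ".join('"' + k + '"' for k in keywords)
    exclusions_string = " ".join("-" + e for e in exclusions)
    return keywords_string + " " + exclusions_string

# build_search_string_example(["BMW", "BMW AG"], ["spam"]) == '"BMW" OR "BMW AG" -spam'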
def get_news_analytics_topic_articles(from_date, to_date, google_key_path):
    print("news_analytics_topic_articles")
    columns = ["constituent_name", "constituent_id", "sentiment", "News_Date_NewsDim", "constituent",
               "News_source_NewsDim", "To_Date", "Score", "Categorised_tag", "News_Title_NewsDim", "Date",
               "From_Date", "NEWS_ARTICLE_TXT_NewsDim"]

    query = """
    SELECT constituent_name, constituent_id, sentiment, news_date as News_Date_NewsDim, constituent,
           news_source as News_source_NewsDim, Score, news_topics as Categorised_tag,
           news_title as News_Title_NewsDim, news_date as Date, news_article_txt as NEWS_ARTICLE_TXT_NewsDim,
           TIMESTAMP('{0}') as From_Date, TIMESTAMP('{1}') as To_Date
    FROM `pecten_dataset.all_news_bkp`
    WHERE news_date between TIMESTAMP ('{0}') and TIMESTAMP ('{1}')
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'news_analytics_topic_articles_copy', to_insert)
    except Exception as e:
        print(e)
def get_news_analytics_daily_sentiment_bq(from_date, to_date, google_key_path):
    print("news_analytics_daily_sentiment")
    columns = ["constituent_id", "avg_sentiment", "constituent_name", "date", "constituent"]

    query = """
    SELECT constituent_id, AVG(score) as avg_sentiment, constituent_name, news_date as date, constituent
    FROM `pecten_dataset.all_news`
    WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    GROUP BY constituent_id, constituent_name, date, constituent
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'news_analytics_daily_sentiment_copy', to_insert)
    except Exception as e:
        print(e)
def main(arguments):
    storage = Storage()
    result = storage.get_sql_data(sql_connection_string=arguments.connection_string,
                                  sql_table_name=arguments.table_name,
                                  sql_column_list=["EMAIL_USERNAME"])
    print(result)
def update_result(table_store, args):
    storage = Storage(google_key_path=args.service_key_path)

    query = 'UPDATE `' + table_store + '` SET Status = "inactive" WHERE Status = "active"'

    try:
        result = storage.get_bigquery_data(query)
    except Exception as e:
        print(e)
def get_parameters(connection_string, table, column_list):
    storage = Storage()
    data = storage.get_sql_data(connection_string, table, column_list)[0]

    parameters = {}
    for i in range(0, len(column_list)):
        parameters[column_list[i]] = data[i]

    return parameters
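# A minimal usage sketch of get_parameters(); the connection string below is a placeholder and the
# table/column names are only examples of the PARAM_* tables referenced elsewhere in this code.
if __name__ == "__main__":
    example_connection_string = "mssql+pymssql://user:password@host/dbname"  # placeholder, not a real endpoint
    params = get_parameters(example_connection_string,
                            "PARAM_TWITTER_COLLECTION",
                            ["CONNECTION_STRING", "LOGGING"])
    # Returns a dict keyed by column name, e.g. params["CONNECTION_STRING"]
    print(params)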
def get_twitter_top_organisations(from_date, to_date, google_key_path):
    print("twitter_top_organisations")
    columns = ["count", "constituent", "trend", "date_of_analysis", "status", "constituent_name", "constituent_id"]

    query = """
    SELECT a.constituent, b.entity_tags as trend, COUNT(b.entity_tags) AS count, a.constituent_name,
           a.constituent_id,
           CASE WHEN date > '2017-11-01 00:00:00' THEN 'active' ELSE 'inactive' END AS status,
           CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC' END AS date_of_analysis
    FROM `pecten_dataset.tweets` a,
         (SELECT x.id_str, entity_tags
          FROM `pecten_dataset.tweets` AS x, UNNEST(entity_tags.ORG) AS entity_tags) b
    WHERE a.id_str = b.id_str
      AND a.date BETWEEN TIMESTAMP ('{}') AND TIMESTAMP ('{}')
    GROUP BY a.constituent, a.constituent_name, a.constituent_id, date, b.entity_tags;
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_top_organisations_copy', to_insert)
    except Exception as e:
        print(e)
def get_news_tags_bq(from_date, to_date, google_key_path):
    print("news_tags")
    columns = ["Date", "constituent", "Tags", "Count", "constituent_name", "constituent_id"]

    query = """
    SELECT a.news_date AS Date, a.constituent, b.news_topics as Tags, COUNT(b.news_topics) AS Count,
           a.constituent_name, a.constituent_id
    FROM `pecten_dataset.all_news_bkp` a,
         (SELECT x.news_id, news_topics
          FROM `pecten_dataset.all_news_bkp` AS x, UNNEST(news_topics) AS news_topics) b
    WHERE a.news_id = b.news_id
      AND a.news_date BETWEEN TIMESTAMP ('{}') AND TIMESTAMP ('{}')
    GROUP BY Date, a.constituent, b.news_topics, a.constituent_name, a.constituent_id;
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'news_tag_copy', to_insert)
    except Exception as e:
        print(e)
def get_news_analytics_daily_sentiment_bq(args):
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("news_analytics_daily_sentiment")
    columns = ["constituent_id", "avg_sentiment", "constituent_name", "date", "constituent",
               "To_date", "From_date"]

    query = """
    SELECT constituent_id, AVG(score) as avg_sentiment, constituent_name, news_date as date, constituent,
           TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date
    FROM `{0}.all_news`
    WHERE news_date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}')
    GROUP BY constituent_id, constituent_name, date, constituent
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'news_analytics_daily_sentiment', to_insert)
    except Exception as e:
        print(e)
def get_twitter_analytics_latest_price_tweets(from_date, to_date, google_key_path):
    print("twitter_analytics_latest_price_tweets")
    columns = ["tweet_date", "constituent_name", "from_date", "date", "text", "MONEY",
               "sentiment_score", "constituent", "constituent_id", "to_date"]

    query = """
    SELECT date as tweet_date, constituent_name, text, entity_tags.MONEY, sentiment_score, constituent,
           constituent_id,
           CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-09 00:00:00 UTC' END AS date,
           CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC' END AS from_date,
           CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-11 00:00:00 UTC' END AS to_date
    FROM `igenie-project.pecten_dataset.tweets`
    WHERE text LIKE '%rating%' and text LIKE '%€%'
      and date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_analytics_latest_price_tweets_copy',
                                            to_insert)
    except Exception as e:
        print(e)
def file_to_mongodb(args):
    operations = []
    storage = Storage(mongo_connection_string="mongodb://*****:*****@35.189.89.82:27017/dax_gcp")

    with open(args.file1, "r") as f1:
        records = 0
        for line in f1:
            data = json.loads(line)

            # change column names
            try:
                data["NEWS_TITLE_NewsDim"] = data.pop("news_title")
                data["NEWS_DATE_NewsDim"] = datetime.strptime(data.pop("news_date"), '%Y-%m-%d %H:%M:%S')
                data["NEWS_ARTICLE_TXT_NewsDim"] = data.pop("news_article_txt")
                data["NEWS_SOURCE_NewsDim"] = data.pop("news_source")
                data["NEWS_PUBLICATION_NewsDim"] = data.pop("news_publication")
                data["categorised_tag"] = data.pop("news_topics")

                if data["constituent"] == "BMW":
                    data["constituent"] = "bmw"
            except Exception as e:
                print(e)
                continue

            operations.append(data)
            records += 1

            if len(operations) == 1000:
                print("Saving {} records".format(records))
                storage.save_to_mongodb(operations, "dax_gcp", "all_news")
                operations = []

    if len(operations) > 0:
        storage.save_to_mongodb(operations, "dax_gcp", "all_news")
def get_twitter_sentiment_count_daily(from_date, to_date, google_key_path):
    print("twitter_sentiment_count_daily")
    columns = ["constituent_name", "line", "date", "count", "constituent_id", "constituent"]

    query = """
    SELECT constituent_name, DATE(date) as date, count(date) as count, constituent_id, constituent,
           (CASE WHEN sentiment_score > 0.25 THEN 'positive'
                 WHEN sentiment_score < -0.25 THEN 'negative'
                 ELSE 'neutral' END) AS line
    FROM `pecten_dataset.tweets`
    WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    GROUP BY constituent_name, constituent, constituent_id, date, line
    ORDER BY date
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_sentiment_count_daily_copy', to_insert)
    except Exception as e:
        print(e)
def get_twitter_sentiment_count_daily(args):
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("twitter_sentiment_count_daily")
    columns = ["constituent_name", "line", "date", "count", "constituent_id", "constituent",
               "From_date", "To_date"]

    query = """
    SELECT constituent_name, DATE(date) as date, count(date) as count, constituent_id, constituent,
           TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date,
           (CASE WHEN sentiment_score > 0.25 THEN 'positive'
                 WHEN sentiment_score < -0.25 THEN 'negative'
                 ELSE 'neutral' END) AS line
    FROM `{0}.tweets`
    WHERE date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}')
    GROUP BY constituent_name, constituent, constituent_id, date, line
    ORDER BY date
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], date)
                              else (k, item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'twitter_sentiment_count_daily', to_insert)
    except Exception as e:
        print(e)
def main(args):
    # connection string + database + table for storage
    all_constituents_dict_bi = {
        'Allianz': 'Allianz', 'Adidas': 'Adidas', 'BASF': 'BASF', 'Bayer': 'Bayer',
        'Beiersdorf': 'Beiersdorf', 'BMW': 'BMW', 'Commerzbank': 'Commerzbank',
        'Continental': 'Continental', 'Daimler': 'Daimler', 'Deutsche Bank': 'Deutsche_Bank',
        'Deutsche Börse': 'Deutsche_Boerse', 'Deutsche Post': 'Deutsche_Post',
        'Deutsche Telekom': 'Deutsche_Telekom', 'EON': 'EON',
        'Fresenius Medical Care': 'Fresenius_Medical_Care', 'Fresenius': 'Fresenius',
        'HeidelbergCement': 'HeidelbergCement', 'Infineon': 'Infineon', 'Linde': 'Linde_6',
        'Lufthansa': 'Lufthansa', 'Merck': 'Merck',
        'Münchener Rückversicherungs-Gesellschaft': 'Munich_Re',
        'ProSiebenSat1 Media': 'ProSiebenSat1_Media', 'RWE': 'RWE', 'Siemens': 'Siemens',
        'Thyssenkrupp': 'thyssenkrupp', 'Volkswagen (VW) vz': 'Volkswagen_vz', 'Vonovia': 'Vonovia'
    }

    bi_analyst_table = analyst_businessinsider(all_constituents_dict_bi)
    ws_analyst_table = analyst_wallstreet()
    combined_analyst_table = combined_analyst(ws_analyst_table, bi_analyst_table)

    import json
    combined_analyst_json = json.loads(combined_analyst_table.to_json(orient='records'))
    bi_analyst_json = json.loads(bi_analyst_table.to_json(orient='records'))

    from utils.Storage import Storage
    storage = Storage()

    # save the result: combined - analyst_opinions, bi - analyst_opinions_all
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_selected,
                            data=combined_analyst_json)
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_all,
                            data=bi_analyst_json)
def get_bigquery(args):
    from utils.Storage import Storage
    storage_client = Storage(args.google_key_path)

    q1 = """SELECT date AS ds, closing_price AS y
            FROM `pecten_dataset.stockprice_historical_collection`
            where constituent_name = 'DAIMLER AG'
            order by date"""

    query1 = pd.read_gbq(q1, project_id='igenie-project', dialect='standard')
    # print(query1)
    args.query1 = query1
def get_country_data_bq(from_date, to_date, google_key_path):
    print("country_data")
    storage_client = Storage(google_key_path=google_key_path)

    query2 = """
    UPDATE `pecten_dataset.country_data` SET status = 'inactive' where status = 'active';
    """

    try:
        storage_client.get_bigquery_data(query2, iterator_flag=True)
    except Exception as e:
        print(e)
        return

    columns = ["constituent", "avg_sentiment", "count", "country_name", "constituent_name",
               "constituent_id", "date_of_analysis", "status"]

    query = """
    SELECT constituent, AVG(sentiment_score) as avg_sentiment, count(place.country_code) as count,
           place.country_code as country_name, constituent_name, constituent_id,
           CURRENT_TIMESTAMP() as date_of_analysis, 'active' as status
    FROM `pecten_dataset.tweets`
    WHERE date between TIMESTAMP('{}') and TIMESTAMP('{}')
    GROUP BY constituent_id, constituent, country_name, constituent_name
    HAVING country_name IS NOT NULL;
    """.format(from_date, to_date)

    try:
        result = storage_client.get_bigquery_data(query, iterator_flag=True)
    except Exception as e:
        print(e)
        return None

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'country_data', to_insert)
    except Exception as e:
        print(e)
def get_twitter_sentiment_popularity(args):
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("twitter_sentiment_popularity")
    columns = ["count", "constituent", "avg_sentiment_all", "constituent_name", "constituent_id", "date",
               "from_date", "to_date"]

    query = """
    SELECT constituent_name, constituent_id, constituent, count(text) as count,
           avg(sentiment_score) as avg_sentiment_all,
           TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date,
           CASE WHEN date > '2017-11-17 00:00:00' THEN '2017-12-13 00:00:00 UTC' END AS date
    FROM `{0}.tweets`
    WHERE date between TIMESTAMP('{1}') and TIMESTAMP('{2}')
    group by date, constituent_name, constituent_id, constituent
    order by count DESC
    LIMIT 10
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], date)
                              else (k, item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"], 'twitter_sentiment_popularity',
                                            to_insert)
    except Exception as e:
        print(e)
def get_influencer_price_tweets(args):
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("influencer_price_tweets")
    columns = ["tweet_date", "constituent_name", "from_date", "date", "text", "MONEY", "sentiment_score",
               "constituent", "constituent_id", "to_date"]

    query = """
    SELECT date as tweet_date, constituent_name, text, entity_tags.MONEY, sentiment_score, constituent,
           constituent_id,
           TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date,
           CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-13 00:00:00 UTC' END AS date
    FROM `{0}.tweets`
    WHERE text LIKE '%rating%' and text LIKE '%€%'
      and date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}')
      and user.followers_count > 1000
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"], 'influencer_price_tweets',
                                            to_insert)
    except Exception as e:
        print(e)
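# A hedged sketch (not taken from the original repository) of how the PARAM_READ_DATE-driven
# analytics loads above might be wired together from the command line. The argument names mirror
# the attributes these functions read from `args`; the real entry point and module layout may differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run daily Pecten analytics loads")
    parser.add_argument("google_key_path", help="path to the GCP service account key")
    parser.add_argument("param_connection_string", help="SQL connection string for the PARAM_* tables")
    parser.add_argument("environment", help="environment used to filter PARAM_READ_DATE")
    args = parser.parse_args()

    for job in (get_news_analytics_daily_sentiment_bq,
                get_twitter_sentiment_count_daily,
                get_twitter_sentiment_popularity,
                get_influencer_price_tweets):
        job(args)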
def get_twitter_analytics_top_orgs_bq(args, from_date, to_date): param_table = "PARAM_TWITTER_COLLECTION" parameters_list = ["CONNECTION_STRING"] parameters = get_parameters(args.param_connection_string, param_table, parameters_list) storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"]) client = MongoClient(parameters["CONNECTION_STRING"]) db = client["dax_gcp"] collection = db["tweets"] twitter_analytics_top_orgs = db["twitter_analytics_top_orgs"] for constituent_id, constituent_name in all_constituents: print(constituent_name) pipeline = [{ "$match": { "constituent_id": constituent_id, "date": { "$gte": from_date, "$lte": to_date } } }, { "$project": { "constituent_id": "$constituent_id", "constituent_name": "$constituent_name", "org_tags": "$entity_tags.ORG", "constituent": "$constituent" } }, { "$unwind": "$org_tags" }, { "$group": { "_id": { "constituent_id": "$constituent_id", "org_tags": "$org_tags", "constituent_name": "$constituent_name" }, "count": { "$sum": 1 } } }, { "$sort": SON([("count", -1)]) }, { "$project": { "_id": 0, "constituent_id": "$_id.constituent_id", "constituent_name": "$_id.constituent_name", "org_tags": "$_id.org_tags", "count": "$count", "from_date": from_date, "to_date": to_date, "date": datetime.now() } }] results = list(collection.aggregate(pipeline)) to_return = [] for r in results: if r["constituent_name"].lower() in r["org_tags"].lower() or \ r["org_tags"].lower() in r["constituent_name"].lower() or \ " AG".lower() in r["org_tags"].lower() or \ len(r["org_tags"].split(" ")) > 3 or \ r["count"] < 10 or \ r["org_tags"].lower() == "Trends".lower() or \ r["org_tags"].lower() == "Share".lower(): continue else: to_return.append(r) if to_return: twitter_analytics_top_orgs.insert_many(to_return) else: if results: twitter_analytics_top_orgs.insert_many(results)
def get_tweets(args): from utils import logging_utils as logging_utils from utils.TwitterDownloader import TwitterDownloader from utils.Storage import Storage from utils import twitter_analytics_helpers as tap from utils.TaggingUtils import TaggingUtils as TU param_table = "PARAM_TWITTER_COLLECTION" parameters_list = [ "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING", "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME", "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET", "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE" ] parameters = tap.get_parameters(args.param_connection_string, param_table, parameters_list) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tap.get_parameters(args.param_connection_string, common_table, common_list, common_where) languages = parameters["LANGUAGE"].split(",") storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) tagger = TU() downloader = TwitterDownloader(parameters["TWITTER_API_KEY"], parameters["TWITTER_API_SECRET"]) downloader.load_api() all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"]) fields_to_keep = [ "text", "favorite_count", "source", "retweeted", "entities", "id_str", "retweet_count", "favorited", "user", "lang", "created_at", "place", "constituent_name", "constituent_id", "search_term", "id", "sentiment_score", "entity_tags", "relevance", "constituent" ] for language in languages: for constituent_id, constituent_name in all_constituents: search_query = get_search_string(constituent_id, args.param_connection_string, "PARAM_TWITTER_KEYWORDS", "PARAM_TWITTER_EXCLUSIONS") #Get max id of all tweets to extract tweets with id highe than that q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \ "AND lang = '{}';".format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"], constituent_id,language) try: sinceId = int( storage.get_bigquery_data( q, iterator_flag=False)[0]["max_id"]) except Exception as e: print(e) sinceId = None max_id = -1 tweetCount = 0 print("Downloading max {0} tweets for {1} in {2} on {3}".format( parameters["MAX_TWEETS"], constituent_name, language, str(datetime.now()))) while tweetCount < parameters["MAX_TWEETS"]: tweets_unmodified = [] tweets_modified = [] tweets_mongo = [] try: tweets, tmp_tweet_count, max_id = downloader.download( constituent_name, search_query, language, parameters["TWEETS_PER_QUERY"], sinceId, max_id) except Exception as e: continue if not tweets: break else: print("Downloaded {} tweets".format(tmp_tweet_count)) tweetCount += tmp_tweet_count #Add fields for both unmodified and modified tweets for tweet in tweets: tweet._json['source'] = "Twitter" tweet._json['constituent_name'] = constituent_name tweet._json['constituent_id'] = constituent_id tweet._json['search_term'] = search_query tweet._json["constituent"] = tap.get_old_constituent_name( constituent_id) #Removing bad fields clean_tweet = tap.scrub(tweet._json) # Separate the tweets that go to one topic or the other #unmodified t_unmodified = deepcopy(clean_tweet) t_unmodified["date"] = tap.convert_timestamp( t_unmodified["created_at"]) tweets_unmodified.append(t_unmodified) #Add additional fields clean_tweet["sentiment_score"] = 
tap.get_nltk_sentiment( str(clean_tweet["text"])) tagged_text = tagger.get_spacy_entities( str(clean_tweet["text"])) clean_tweet["entity_tags"] = tap.get_spacey_tags( tagged_text) clean_tweet["relevance"] = -1 #mongo t_mongo = deepcopy(clean_tweet) t_mongo['date'] = datetime.strptime( t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y') tweets_mongo.append(t_mongo) #modified tagged_tweet = dict((k, clean_tweet[k]) for k in fields_to_keep if k in clean_tweet) tagged_tweet['date'] = tap.convert_timestamp( clean_tweet["created_at"]) tweets_modified.append(tagged_tweet) #send to PubSub topic #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified) #ps_utils.publish("igenie-project", "tweets", tweets_modified) try: storage.insert_bigquery_data( common_parameters["BQ_DATASET"], '{}_unmodified'.format( parameters["DESTINATION_TABLE"]), tweets_unmodified) except Exception as e: print(e) try: storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], tweets_modified) except Exception as e: print(e) try: storage.save_to_mongodb(tweets_mongo, "dax_gcp", parameters["DESTINATION_TABLE"]) pass except Exception as e: print(e) time.sleep(1) print("Saved {} tweets for in {}".format(tweetCount, constituent_name, language)) if parameters["LOGGING"]: doc = [{ "date": time.strftime('%Y-%m-%d %H:%M:%S', datetime.now().date().timetuple()), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_tweets": tweetCount, "language": language }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) return "Downloaded tweets"
def get_daily_orbis_news(args): if __name__ != "__main__": from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import logging_utils as logging_utils # get constituents soap = SOAPUtils() storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = [ "LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE", "LOGGING_TABLE" ] where = lambda x: x["SOURCE"] == 'Orbis' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) for constituent_id, constituent_name, bvdid in constituents: #get last news date for the constituent query = """ SELECT max(news_id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' and news_origin = "Orbis" """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] except Exception as e: max_id = None continue start = 0 max_count = 20 print("Constituent: {},{}".format(constituent_name, bvdid)) stop = False try: token = soap.get_token(parameters["BVD_USERNAME"], parameters["BVD_PASSWORD"], 'orbis') selection_token, selection_count = soap.find_by_bvd_id( token, bvdid, 'orbis') except Exception as e: print(e) return while not stop: try: query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \ "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \ "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \ "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \ "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \ "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \ "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \ "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \ "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \ "LINE 
BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \ "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format( start, max_count) get_data_result = soap.get_data(token, selection_token, selection_count, query, 'orbis', timeout=None) except Exception as e: print(str(e)) if token: soap.close_connection(token, 'orbis') token = None break result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) try: df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"]) except Exception as e: print(e) break if df.shape[0] == 0: print("No records in df") break initial_shape = df.shape[0] df.astype({"news_id": int}, copy=False, errors='ignore') df = df.loc[df["news_id"] > max_id] if df.shape[0] == 0: print("No new data") break if df.shape[0] != initial_shape: stop = True # Make news_title column a string df.astype({"news_title": str}, copy=False, errors='ignore') # Remove duplicate columns df.drop_duplicates(["news_title"], inplace=True) # Get sentiment score df["score"] = df.apply(lambda row: tah.get_nltk_sentiment( str(row["news_article_txt"])), axis=1) # get sentiment word df["sentiment"] = df.apply( lambda row: tah.get_sentiment_word(row["score"]), axis=1) # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) df["constituent"] = old_constituent_name # add URL df["url"] = None # add show df["show"] = True # get entity tags entity_tags = [] for text in df["news_title"]: tags = tah.get_spacey_tags(tagger.get_spacy_entities( str(text))) entity_tags.append(tags) fields = [ "news_date", "news_title", "news_article_txt", "news_companies", "news_source", "news_publication", "news_topics", "news_country", "news_region", "news_language", "news_id", "score", "sentiment", "constituent_id", "constituent_name", "constituent", "url", "show" ] # Get needed fields df_bigquery = df[fields] bigquery_data = json.loads( df_bigquery.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(bigquery_data)): bigquery_data[i]["entity_tags"] = entity_tags[i] # set news_date if "news_date" in bigquery_data[i] and bigquery_data[i][ "news_date"]: date = bigquery_data[i]["news_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) bigquery_data[i]["news_date"] = ts else: bigquery_data[i]["news_date"] = None if bigquery_data[i]["news_title"]: bigquery_data[i]["news_title"] = str( bigquery_data[i]["news_title"]) if bigquery_data[i]["news_article_txt"]: bigquery_data[i]["news_article_txt"] = str( bigquery_data[i]["news_article_txt"]) if bigquery_data[i]["news_companies"]: try: bigquery_data[i]["news_companies"] = [ i.strip() for i in bigquery_data[i] ["news_companies"].split(";") ] except Exception as e: bigquery_data[i]["news_companies"] = [] else: bigquery_data[i]["news_companies"] = [] if bigquery_data[i]["news_topics"]: try: if bigquery_data[i]["news_topics"].isdigit(): bigquery_data[i]["news_topics"] = [] elif bigquery_data[i]["news_topics"] == 'None': bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [ i.strip() for i in bigquery_data[i] ["news_topics"].split(";") ] except Exception as e: bigquery_data[i]["news_topics"] = 
[] else: bigquery_data[i]["news_topics"] = [] if bigquery_data[i]["news_country"]: try: bigquery_data[i]["news_country"] = [ i.strip() for i in bigquery_data[i] ["news_country"].split(";") ] except Exception as e: bigquery_data[i]["news_country"] = [] else: bigquery_data[i]["news_country"] = [] if bigquery_data[i]["news_region"]: try: bigquery_data[i]["news_region"] = [ i.strip() for i in bigquery_data[i]["news_region"].split(";") ] except Exception as e: bigquery_data[i]["news_region"] = [] else: bigquery_data[i]["news_region"] = [] if bigquery_data[i]["news_language"]: bigquery_data[i]["news_language"] = str( bigquery_data[i]["news_language"]) if bigquery_data[i]["news_source"]: bigquery_data[i]["news_source"] = str( bigquery_data[i]["news_source"]) if bigquery_data[i]["news_publication"]: bigquery_data[i]["news_publication"] = str( bigquery_data[i]["news_publication"]) if bigquery_data[i]["news_id"]: try: bigquery_data[i]["news_id"] = int( bigquery_data[i]["news_id"]) except Exception as e: bigquery_data[i]["news_id"] = str( bigquery_data[i]["news_id"]) if bigquery_data[i]["sentiment"]: bigquery_data[i]["sentiment"] = str( bigquery_data[i]["sentiment"]) if bigquery_data[i]["constituent_id"]: bigquery_data[i]["constituent_id"] = str( bigquery_data[i]["constituent_id"]) if bigquery_data[i]["constituent_name"]: bigquery_data[i]["constituent_name"] = str( bigquery_data[i]["constituent_name"]) if bigquery_data[i]["constituent"]: bigquery_data[i]["constituent"] = str( bigquery_data[i]["constituent"]) try: storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], bigquery_data) except Exception as e: print(e) start = start + 20 print("Records saved: {}".format(df.shape[0])) log = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": df.shape[0], "source": "Orbis" }] if parameters["LOGGING"] and bigquery_data: logging_utils.logging(log, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) if token: soap.close_connection(token, 'orbis')
def get_historical_orbis_news(user, pwd, database, google_key_path, param_connection_string): #get parameters soap = SOAPUtils() storage = Storage(google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=param_connection_string, sql_table_name=table, sql_column_list=columns)[2:] to_skip = [ "BAYERISCHE MOTOREN WERKE AG", "DEUTSCHE BANK AG", "SIEMENS AG", "SAP SE", "VOLKSWAGEN AG" ] for constituent_id, constituent_name, bvdid in constituents: limit = get_number_of_news_items(constituent_name) if constituent_name in to_skip: print("Skipping") continue records = 0 start = 0 max_count = 20 filename = "bq_news_{}.json".format(constituent_id) print("Constituent: {},{}".format(constituent_name, bvdid)) failed = 0 if constituent_name == "BASF SE": start = 4280 try: token = soap.get_token(user, pwd, database) selection_token, selection_count = soap.find_by_bvd_id( token, bvdid, database) except Exception as e: print(e) return with open(filename, "a") as f: while limit > records: try: query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \ "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \ "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \ "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \ "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \ "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \ "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \ "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \ "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \ "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \ "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format( start, max_count) get_data_result = soap.get_data(token, selection_token, selection_count, query, database, timeout=None) except Exception as e: print(str(e)) continue finally: pass #if token: # soap.close_connection(token, database) result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) try: df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"]) except Exception as e: print(e) continue if df.shape[0] == 0: print("No records in df") continue #Make news_title column a string df.astype({"news_title": str}, copy=False, errors='ignore') # Remove duplicate columns df.drop_duplicates(["news_title"], inplace=True) # 
Get sentiment score df["score"] = df.apply(lambda row: tah.get_nltk_sentiment( str(row["news_article_txt"])), axis=1) #df["score"] = df.apply(lambda row: sia.polarity_scores(str(row["news_article_txt"]))['compound'] , axis=1) # get sentiment word df["sentiment"] = df.apply( lambda row: tah.get_sentiment_word(row["score"]), axis=1) # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name( constituent_id) df["constituent"] = old_constituent_name # add URL df["url"] = None # add show df["show"] = True # get entity tags entity_tags = [] for text in df["news_title"]: tags = tah.get_spacey_tags( tagger.get_spacy_entities(str(text))) entity_tags.append(tags) fields = [ "news_date", "news_title", "news_article_txt", "news_companies", "news_source", "news_publication", "news_topics", "news_country", "news_region", "news_language", "news_id", "score", "sentiment", "constituent_id", "constituent_name", "constituent", "url", "show" ] ''' # Save to MongoDB filter = ["NEWS_DATE_NewsDim", "NEWS_TITLE_NewsDim", "NEWS_ARTICLE_TXT_NewsDim", "NEWS_SOURCE_NewsDim", "NEWS_PUBLICATION_NewsDim", "categorised_tag", "score", "sentiment", "constituent_id", "constituent_name", "constituent","url", "show"] df_mongo = df[fields] df_mongo.columns = filter mongo_data = json.loads(df_mongo.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(mongo_data)): mongo_data[i]["entity_tags"] = entity_tags[i] date = mongo_data[i]["NEWS_DATE_NewsDim"] #ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") mongo_data[i]["NEWS_DATE_NewsDim"] = ts #storage.save_to_mongodb(mongo_data, "dax_gcp", "all_news") ''' #Get needed fields df_bigquery = df[fields] bigquery_data = json.loads( df_bigquery.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(bigquery_data)): bigquery_data[i]["entity_tags"] = entity_tags[i] #set news_date if "news_date" in bigquery_data[i] and bigquery_data[i][ "news_date"]: date = bigquery_data[i]["news_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) bigquery_data[i]["news_date"] = ts else: bigquery_data[i]["news_date"] = None if bigquery_data[i]["news_title"]: bigquery_data[i]["news_title"] = str( bigquery_data[i]["news_title"]) if bigquery_data[i]["news_article_txt"]: bigquery_data[i]["news_article_txt"] = str( bigquery_data[i]["news_article_txt"]) if bigquery_data[i]["news_companies"]: try: bigquery_data[i]["news_companies"] = [ i.strip() for i in bigquery_data[i] ["news_companies"].split(";") ] except Exception as e: bigquery_data[i]["news_companies"] = [] else: bigquery_data[i]["news_companies"] = [] if bigquery_data[i]["news_topics"]: try: if bigquery_data[i]["news_topics"].isdigit(): bigquery_data[i]["news_topics"] = [] elif bigquery_data[i]["news_topics"] == 'None': bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [ i.strip() for i in bigquery_data[i] ["news_topics"].split(";") ] except Exception as e: bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [] if bigquery_data[i]["news_country"]: try: bigquery_data[i]["news_country"] = [ i.strip() for i in bigquery_data[i] ["news_country"].split(";") ] except Exception as e: bigquery_data[i]["news_country"] = [] else: bigquery_data[i]["news_country"] = [] if bigquery_data[i]["news_region"]: try: 
bigquery_data[i]["news_region"] = [ i.strip() for i in bigquery_data[i] ["news_region"].split(";") ] except Exception as e: bigquery_data[i]["news_region"] = [] else: bigquery_data[i]["news_region"] = [] if bigquery_data[i]["news_language"]: bigquery_data[i]["news_language"] = str( bigquery_data[i]["news_language"]) if bigquery_data[i]["news_source"]: bigquery_data[i]["news_source"] = str( bigquery_data[i]["news_source"]) if bigquery_data[i]["news_publication"]: bigquery_data[i]["news_publication"] = str( bigquery_data[i]["news_publication"]) if bigquery_data[i]["news_id"]: try: bigquery_data[i]["news_id"] = int( bigquery_data[i]["news_id"]) except Exception as e: bigquery_data[i]["news_id"] = str( bigquery_data[i]["news_id"]) if bigquery_data[i]["sentiment"]: bigquery_data[i]["sentiment"] = str( bigquery_data[i]["sentiment"]) if bigquery_data[i]["constituent_id"]: bigquery_data[i]["constituent_id"] = str( bigquery_data[i]["constituent_id"]) if bigquery_data[i]["constituent_name"]: bigquery_data[i]["constituent_name"] = str( bigquery_data[i]["constituent_name"]) if bigquery_data[i]["constituent"]: bigquery_data[i]["constituent"] = str( bigquery_data[i]["constituent"]) f.write( json.dumps(bigquery_data[i], cls=MongoEncoder) + '\n') # storage.insert_bigquery_data("pecten_dataset", "news", bigquery_data) start = start + 20 #end = start + 10 records += 20 print("Records saved: {}".format(records)) if token: soap.close_connection(token, database)
def get_twitter_analytics_latest_price_tweets(args, from_date, to_date): param_table = "PARAM_TWITTER_COLLECTION" parameters_list = ["CONNECTION_STRING"] parameters = get_parameters(args.param_connection_string, param_table, parameters_list) storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"]) client = MongoClient(parameters["CONNECTION_STRING"]) db = client["dax_gcp"] collection = db["tweets"] twitter_analytics_latest_price_tweets = db[ "twitter_analytics_latest_price_tweets"] for constituent_id, constituent_name, symbol in all_constituents: symbol = '$' + symbol print(symbol) to_return = [] final = [] results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "entities.symbols": { "$exists": True, "$ne": [] } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).sort([("date", -1)])) if results: print("A") for item in results: if symbol in item["text"].split(" "): if item["entity_tags"]["MONEY"]: text = item["text"].replace("$", "") text = text.replace("€", "") text = text.replace("EUR", " ") tokens = text.split(" ") contains_number = False for t in tokens: try: float(t) contains_number = True except Exception as e: pass if contains_number: clean_tokens = [ t for t in item["text"].split(" ") if "http" not in t ] item["text"] = " ".join(clean_tokens) item["tweet_date"] = item["date"] item["date"] = datetime.now() item["from_date"] = from_date item["to_date"] = to_date to_return.append(item) if to_return: df = pd.DataFrame(to_return) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = datetime.strptime( f["from_date"], "%Y-%m-%dT%H:%M:%S.%fZ") f["to_date"] = datetime.strptime(f["to_date"], "%Y-%m-%dT%H:%M:%S.%fZ") else: print("A2") results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "user.followers_count": { "$gte": 200 } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("B2") final = [{ 'constituent': constituent_name, 'constituent_id': constituent_id, 'constituent_name': constituent_name, 'date': datetime.now(), 'entity_tags': { 'MONEY': [''] }, 'from_date': from_date, 'sentiment_score': 0.0, 'text': 'No new tweets.', 'to_date': to_date, 'tweet_date': None }] else: print("B") results = list( collection.find( { "date": { "$gte": from_date, "$lte": to_date }, "constituent_id": constituent_id, "user.followers_count": { "$gte": 200 } }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = 
json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("C") results = list( collection.find({ "constituent_id": constituent_id }, { "text": 1, "constituent_id": 1, "constituent_name": 1, "constituent": 1, "entity_tags.MONEY": 1, "sentiment_score": 1, "date": 1, "_id": 0 }).limit(5).sort([("date", -1)])) if results: df = pd.DataFrame(results) df.drop_duplicates("text", inplace=True) final = df.to_json(orient="records", date_format="iso") final = json.loads(final) for f in final: f["from_date"] = from_date f["to_date"] = to_date else: print("D") final = [{ 'constituent': constituent_name, 'constituent_id': constituent_id, 'constituent_name': constituent_name, 'date': datetime.now(), 'entity_tags': { 'MONEY': [''] }, 'from_date': from_date, 'sentiment_score': 0.0, 'text': 'No new tweets.', 'to_date': to_date, 'tweet_date': None }] if final: twitter_analytics_latest_price_tweets.insert_many(final) time.sleep(1)
def get_news_tags_bq(args):
    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("news_tags")
    columns = ["Date", "constituent", "Tags", "Count", "constituent_name", "constituent_id",
               "From_date", "To_date"]

    query = """
    SELECT a.news_date AS Date, a.constituent, b.news_topics as Tags, COUNT(b.news_topics) AS Count,
           a.constituent_name, a.constituent_id,
           TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date
    FROM `{0}.all_news` a,
         (SELECT x.news_id, news_topics
          FROM `{0}.all_news` AS x, UNNEST(news_topics) AS news_topics) b
    WHERE a.news_id = b.news_id
      AND a.news_date BETWEEN TIMESTAMP ('{1}') AND TIMESTAMP ('{2}')
    GROUP BY Date, a.constituent, b.news_topics, a.constituent_name, a.constituent_id;
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)

    to_insert = []
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime)
                              else (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"], 'news_tag', to_insert)
    except Exception as e:
        print(e)
def main(args): if __name__ != "__main__": sys.path.insert(0, args.python_path) from utils.Storage import Storage from utils import twitter_analytics_helpers as tah driver = webdriver.PhantomJS() # Get parameters param_table = "PARAM_HISTORICAL_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE"] parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents storage = Storage(google_key_path=args.google_key_path) query = """ SELECT a.CONSTITUENT_ID, a.CONSTITUENT_NAME, b.URL_KEY FROM MASTER_CONSTITUENTS a, PARAM_FINANCIAL_URL_KEYS b WHERE a.CONSTITUENT_ID = b.CONSTITUENT_ID """ all_constituents = storage.get_sql_data_text_query( args.param_connection_string, query) # Get dates query = """ SELECT max(date) as last_date FROM `{}.{}` """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"]) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) except Exception as e: print(e) return from_date = result[0]["last_date"] + timedelta(days=1) ts = from_date.strftime("%d.%m.%Y") from_date_parts = ts.split(".") if from_date_parts[0][0] == "0": from_date_parts[0] = from_date_parts[0][1:] if from_date_parts[1][0] == "0": from_date_parts[1] = from_date_parts[1][1:] from_date = ".".join(from_date_parts) to_date = datetime.now().strftime("%d.%m.%Y") to_date_parts = to_date.split(".") if to_date_parts[0][0] == "0": to_date_parts[0] = to_date_parts[0][1:] if to_date_parts[1][0] == "0": to_date_parts[1] = to_date_parts[1][1:] to_date = ".".join(to_date_parts) if from_date == to_date: return dax_url = 'http://en.boerse-frankfurt.de/index/pricehistory/DAX/{}_{}#History'.format( from_date, to_date) constituent_base_url = 'http://en.boerse-frankfurt.de/stock/pricehistory/' constituent_date_url = '-share/FSE/{}_{}#Price_History'.format( from_date, to_date) if args.all: extract_historical_data(dax_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent='DAX') for constituent_id, constituent_name, url_key in all_constituents: print("Extracting data for {} from {} to {}".format( constituent_name, from_date, to_date)) extract_historical_data(urljoin(constituent_base_url, url_key + constituent_date_url), driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent=(constituent_id, constituent_name)) time.sleep(10) else: if args.constituent == 'DAX': extract_historical_data(dax_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent='DAX') else: for constituent_id, constituent_name, url_key in all_constituents: if constituent_id == args.constituent: print("Extracting data for {} from {} to {}".format( constituent_name, from_date, to_date)) constituent_url = urljoin(constituent_base_url, url_key + constituent_date_url) extract_historical_data(constituent_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent=(constituent_id, constituent_name)) driver.quit()
def get_country_data(args):
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table,
                                       common_list, common_where)

    print("country_data")

    # Feature PECTEN-9: back up the destination table before touching it.
    backup_table_name = backup_table(args.google_key_path,
                                     common_parameters["BQ_DATASET"], "country_data")

    columns = ["count", "avg_sentiment", "constituent_name", "country_name",
               "constituent", "constituent_id", "from_date", "to_date"]

    query = """
    SELECT constituent, AVG(sentiment_score) as avg_sentiment,
      count(place.country_code) as count, place.country_code as country_name,
      constituent_name, constituent_id,
      TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date
    FROM `{0}.tweets`
    WHERE date BETWEEN TIMESTAMP ('{1}') and TIMESTAMP ('{2}')
      and place.country_code is not null
    GROUP BY constituent_id, constituent, country_name, constituent_name
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)
    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S'))
                 if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    # Feature PECTEN-9: make sure the result rows match the table schema.
    try:
        validate_data(args.google_key_path, to_insert,
                      common_parameters["BQ_DATASET"], 'country_data')
    except AssertionError as e:
        drop_backup_table(args.google_key_path, common_parameters["BQ_DATASET"], backup_table_name)
        e.args += ("Schema of results does not match table schema.",)
        raise

    # Feature PECTEN-9: refuse to insert if the date range is already loaded.
    from_date = common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S")
    to_date = common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S")
    try:
        before_insert(args.google_key_path, common_parameters["BQ_DATASET"],
                      "country_data", from_date, to_date, storage_client)
    except AssertionError as e:
        drop_backup_table(args.google_key_path, common_parameters["BQ_DATASET"], backup_table_name)
        e.args += ("Data already exists",)
        raise

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'country_data', to_insert)
    except Exception as e:
        print(e)
        raise

    # Feature PECTEN-9: verify that rows actually landed, then drop the backup.
    try:
        after_insert(args.google_key_path, common_parameters["BQ_DATASET"],
                     "country_data", from_date, to_date)
    except AssertionError as e:
        e.args += ("No data was inserted.",)
        raise
    finally:
        drop_backup_table(args.google_key_path, common_parameters["BQ_DATASET"], backup_table_name)
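# NOTE (added sketch, not part of the original source): the PECTEN-9 steps in
# get_country_data follow a backup -> validate -> check range -> insert ->
# verify pattern. A minimal, hypothetical wrapper showing the same flow with
# the helper signatures as they are used above; `safe_insert` itself does not
# exist in this codebase.
def safe_insert(google_key_path, dataset, table, rows, from_date, to_date, storage_client):
    backup_name = backup_table(google_key_path, dataset, table)
    try:
        # Abort if the rows do not match the schema or the range already exists.
        validate_data(google_key_path, rows, dataset, table)
        before_insert(google_key_path, dataset, table, from_date, to_date, storage_client)
    except AssertionError:
        drop_backup_table(google_key_path, dataset, backup_name)
        raise
    # If the insert itself fails, the backup is intentionally kept around,
    # mirroring the behaviour in get_country_data.
    storage_client.insert_bigquery_data(dataset, table, rows)
    try:
        # Verify that the new date range is now present, then drop the backup.
        after_insert(google_key_path, dataset, table, from_date, to_date)
    finally:
        drop_backup_table(google_key_path, dataset, backup_name)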
def write_to_file(args):
    from utils.Storage import Storage, MongoEncoder

    # Get tweets
    query = """
    SELECT *
    FROM pecten_dataset.tweets
    WHERE lang = 'de'
    """

    storage_client = Storage(args.google_key_path)

    dataset = 'pecten_dataset'
    if args.environment != 'production':
        dataset += "_" + args.environment

    it = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []
    total = 0

    # Resume support: collect the ids already written so re-runs skip them.
    ids = set()
    with open("tr_tweets.json", "r") as fo:
        for line in fo:
            ids.add(json.loads(line)['id'])

    with open("tr_tweets.json", "a") as f:
        for tweet in it:
            if tweet.id in ids:
                continue

            start_time = time.time()
            data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S'))
                        if isinstance(tweet[k], datetime) else (k, tweet[k])
                        for k in list(tweet._xxx_field_to_index.keys()))

            # Translate the German text through the translation service.
            response = None
            try:
                url = "http://{}:5674/translate".format(args.ip_address)
                payload = {'q': tweet['text']}
                print("Making request")
                response = requests.get(url, params=payload, timeout=11)
                response = json.loads(response.text)
            except Exception as e:
                print(e)
                continue

            translated_text = None
            if response:
                try:
                    translated_text = response['data']['translations'][0]['translatedText']
                except Exception as e:
                    print(e)
                    continue

            # Keep the original text alongside the translation and append the
            # record to the local JSON-lines file.
            if translated_text:
                data['text_original_language'] = data['text']
                data['text'] = translated_text
                f.write(json.dumps(data, cls=MongoEncoder) + '\n')
                print("--- {} seconds ---".format(time.time() - start_time))
                total += 1
                if total % 1000 == 0:
                    print("Saved {} records".format(total))
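# NOTE (added sketch, not part of the original source): write_to_file and the
# translation main() below both build the same request against the translation
# service on port 5674 and unwrap data.translations[0].translatedText. A
# minimal, hypothetical helper showing that call in isolation; it returns None
# on any failure, mirroring the "skip this tweet" behaviour above.
def translate_text(ip_address, text, timeout=11):
    try:
        url = "http://{}:5674/translate".format(ip_address)
        response = requests.get(url, params={'q': text}, timeout=timeout)
        payload = json.loads(response.text)
        return payload['data']['translations'][0]['translatedText']
    except Exception as e:
        print(e)
        return None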
def get_rss_feed(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage

    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'RSS'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    for constituent_id, constituent_name, symbol in all_constituents:
        # get last news date for the constituent
        query = """
        SELECT max(news_date) as max_date
        FROM `{}.{}`
        WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS"
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
            max_date = result[0]["max_date"]
        except Exception as e:
            max_date = None
            continue

        print("Getting RSS Feed for {}".format(constituent_name))
        d = feedparser.parse('http://finance.yahoo.com/rss/headline?s={}'.format(symbol + ".DE"))

        to_insert = []

        for post in d.entries:
            #print(post)
            date = datetime(post.published_parsed[0], post.published_parsed[1],
                            post.published_parsed[2], post.published_parsed[3],
                            post.published_parsed[4])

            if max_date:
                max_date = str(max_date)[:19]
                max_date = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S')
                if date < max_date:
                    continue

            doc = {
                "url": post.link,
                "news_date": date.strftime('%Y-%m-%d %H:%M:%S'),
                "news_language": post.summary_detail["language"],
                "news_title": post.title,
                "news_article_txt": post.summary,
                "news_origin": "Yahoo Finance RSS",
                "show": True
            }

            # Get sentiment score
            doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"]))

            # get sentiment word
            doc["sentiment"] = tah.get_sentiment_word(doc["score"])

            # add constituent name, id and old name
            doc["constituent_id"] = constituent_id
            doc["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            doc["constituent"] = old_constituent_name

            to_insert.append(doc)

        try:
            print("Inserting into BQ")
            storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                parameters["DESTINATION_TABLE"], to_insert)
        except Exception as e:
            print(e)

        log = [{
            "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
            "constituent_name": constituent_name,
            "constituent_id": constituent_id,
            "downloaded_news": len(to_insert),
            "source": "Yahoo Finance RSS"
        }]

        if parameters["LOGGING"] and to_insert:
            logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                  parameters["LOGGING_TABLE"], storage_client)
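# NOTE (added sketch, not part of the original source): post.published_parsed
# is a time.struct_time, so the field-by-field datetime(...) construction in
# get_rss_feed can also be written by unpacking its first six fields (this
# variant additionally keeps the seconds). A small, hypothetical equivalent,
# assuming the same feedparser entry object:
def entry_published_datetime(post):
    """Convert a feedparser entry's published_parsed into a datetime."""
    return datetime(*post.published_parsed[:6])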
def main(args):
    from utils.Storage import Storage

    # Get German tweets that have not been translated yet.
    query = """
    SELECT *
    FROM pecten_dataset.tweets_backup
    WHERE lang = 'de' and id NOT IN (
        SELECT distinct(id)
        FROM pecten_dataset.tweets
        WHERE text_original_language IS NOT NULL)
    """

    storage_client = Storage(args.google_key_path)

    dataset = 'pecten_dataset'
    if args.environment != 'production':
        dataset += "_" + args.environment

    it = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []
    total = 0

    for tweet in it:
        data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S'))
                    if isinstance(tweet[k], datetime) else (k, tweet[k])
                    for k in list(tweet._xxx_field_to_index.keys()))

        # Translate the tweet text through the translation service.
        response = None
        try:
            url = "http://{}:5674/translate".format(args.ip_address)
            payload = {'q': tweet['text']}
            response = requests.get(url, params=payload, timeout=11)
            response = json.loads(response.text)
        except Exception as e:
            print(e)
            continue

        translated_text = None
        if response:
            try:
                translated_text = response['data']['translations'][0]['translatedText']
            except Exception as e:
                print(e)
                continue

        if translated_text:
            data['text_original_language'] = data['text']
            data['text'] = translated_text
            to_insert.append(data)

            # Flush to both the production and test datasets in batches of 500 rows.
            if len(to_insert) == 500:
                print("Inserting to BQ production")
                try:
                    result = storage_client.insert_bigquery_data("pecten_dataset", "tweets", to_insert)
                    if result:
                        print("Data inserted")
                    else:
                        print("Data not inserted")
                except Exception as e:
                    print(e)

                print("Inserting to BQ test")
                try:
                    result = storage_client.insert_bigquery_data("pecten_dataset_test", "tweets", to_insert)
                    if result:
                        print("Data inserted")
                    else:
                        print("Data not inserted")
                except Exception as e:
                    print(e)

                to_insert = []

    # Insert any remaining rows that did not fill a complete batch of 500.
    if to_insert:
        try:
            storage_client.insert_bigquery_data("pecten_dataset", "tweets", to_insert)
            storage_client.insert_bigquery_data("pecten_dataset_test", "tweets", to_insert)
        except Exception as e:
            print(e)
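# NOTE (added sketch, not part of the original source): the functions above all
# read their settings from an argparse namespace. A minimal, hypothetical entry
# point showing the attributes they rely on (google_key_path,
# param_connection_string, environment, ip_address, python_path); the flag
# names and defaults are illustrative, not taken from the original scripts.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Pecten data collection / analytics job")
    parser.add_argument("--google_key_path", help="Path to the GCP service account key")
    parser.add_argument("--param_connection_string", help="SQL connection string for the PARAM_* tables")
    parser.add_argument("--environment", default="production", help="Environment used to pick the BQ dataset")
    parser.add_argument("--ip_address", help="Host running the translation service on port 5674")
    parser.add_argument("--python_path", help="Path inserted into sys.path when run as a module")
    args = parser.parse_args()

    main(args)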