def get_search_string(constituent_id, connection_string, table_keywords,
                      table_exclusions):
    if __name__ != "__main__":
        from utils.TwitterDownloader import TwitterDownloader
        from utils.Storage import Storage
        from utils import twitter_analytics_helpers as tap
        from utils.TaggingUtils import TaggingUtils as TU

    storage = Storage()

    where = lambda x: and_((x["ACTIVE_STATE"] == 1),
                           (x["CONSTITUENT_ID"] == constituent_id))

    keywords = storage.get_sql_data(sql_connection_string=connection_string,
                                    sql_table_name=table_keywords,
                                    sql_column_list=["KEYWORD"],
                                    sql_where=where)

    keywords_list = ['"' + key[0] + '"' for key in keywords]
    keywords_string = " OR ".join(keywords_list)

    exclusions = storage.get_sql_data(sql_connection_string=connection_string,
                                      sql_table_name=table_exclusions,
                                      sql_column_list=["EXCLUSIONS"],
                                      sql_where=where)

    exclusions_list = ["-" + key[0] for key in exclusions]
    exclusions_string = " ".join(exclusions_list)

    all_words = keywords_string + exclusions_string

    return all_words
示例#2
0
def main(arguments):
    storage = Storage()
    result = storage.get_sql_data(
        sql_connection_string=arguments.connection_string,
        sql_table_name=arguments.table_name,
        sql_column_list=["EMAIL_USERNAME"])

    print(result)
示例#3
0
def get_parameters(connection_string, table, column_list):
    storage = Storage()

    data = storage.get_sql_data(connection_string, table, column_list)[0]
    parameters = {}

    for i in range(0, len(column_list)):
        parameters[column_list[i]] = data[i]

    return parameters
示例#4
0
def get_parameters(connection_string, table, column_list):
    storage = Storage()

    data = storage.get_sql_data(connection_string, table, column_list)[0]
    parameters = {}

    for i in range(0, len(column_list)):
        parameters[column_list[i]] = data[i]

    return parameters
示例#5
0
def main(args):
    storage = Storage()
    data = {}
    #Get parameters
    parameters = storage.get_sql_data(
        sql_connection_string=args.connection_string,
        sql_table_name="PARAM_METADATA_COLLECTION",
        sql_column_list=[
            "BVD_USERNAME", "BVD_PWD", "BVD_DATABASE", "FRANKFURT_FIELDS"
        ])

    frankfurt_fields = parameters[3].split(',')

    #Get constituent metadata
    all_constituents = storage.get_sql_data(
        sql_connection_string=args.connection_string,
        sql_table_name="PARAM_METADATA_CONSTITUENTS",
        sql_column_list=[
            "CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "BVDID"
        ])

    print(all_constituents)

    username = getpass.getuser()

    to_insert = {
        "CONSTITUENT_ID": "",
        "CONSTITUENT_NAME": "",
        "ISIN": "",
        "BVDID": "",
        "INDUSTRY": "",
        "COUNTRY": "",
        "SYMBOL": "",
        "ACTIVE_STATE": True,
        "LISTED_SINCE": datetime.datetime.today(),
        "MARKET_SECTOR": "",
        "MARKET_SUBSECTOR": "",
        "TRADING_MODEL": "",
        "REUTERS_INSTRUMENT_CODE": "",
        "SHARE_TYPE": "",
        "MINIMUM_TRADE_UNIT": 0.0,
        "WEBSITE": "",
        "LAST_UPDATE_BY": username
    }

    for constituent_id, _, url_key, bvdid in all_constituents:
        print(constituent_id)
        fr = website_data(url_key, frankfurt_fields)
        bvd = get_bvd_data(parameters[0], parameters[1], parameters[2], bvdid)
        data.update(fr)
        data.update(bvd)

        to_insert["CONSTITUENT_ID"] = data["SD_TICKER"] + data['BVDID']
        to_insert["CONSTITUENT_NAME"] = data['NAME']
        to_insert["ISIN"] = data['SD_ISIN']
        to_insert["BVDID"] = data['BVDID']
        to_insert["INDUSTRY"] = data['MAJOR_SECTOR']
        to_insert["COUNTRY"] = data['COUNTRY']
        to_insert["SYMBOL"] = data['SD_TICKER']
        if isinstance(data['IPO_DATE'], str):
            to_insert["LISTED_SINCE"] = datetime.datetime.strptime(
                data['IPO_DATE'], "%Y/%m/%d")
        to_insert["MARKET_SECTOR"] = data["Sector"]
        to_insert["MARKET_SUBSECTOR"] = data["Subsector"]
        to_insert["TRADING_MODEL"] = data['Trading Model']
        to_insert["REUTERS_INSTRUMENT_CODE"] = data['Reuters Instrument Code']
        to_insert["SHARE_TYPE"] = data['TYPE_SHARE']
        to_insert["MINIMUM_TRADE_UNIT"] = data['Minimum tradeable Unit']
        to_insert["WEBSITE"] = data["WEBSITE"]

        storage.insert_to_sql(sql_connection_string=args.connection_string,
                              sql_table_name="MASTER_CONSTITUENTS",
                              data=to_insert)
def get_tweets(args):
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]

    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities", "id_str",
        "retweet_count", "favorited", "user", "lang", "created_at", "place",
        "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            #Get max id of all tweets to extract tweets with id highe than that
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],
                                        constituent_id,language)
            try:
                sinceId = int(
                    storage.get_bigquery_data(
                        q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))
            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                #Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(
                        constituent_id)

                    #Removing bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other

                    #unmodified
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(
                        t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    #Add additional fields
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(
                        str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(
                        tagged_text)
                    clean_tweet["relevance"] = -1

                    #mongo
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(
                        t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    #modified
                    tagged_tweet = dict((k, clean_tweet[k])
                                        for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(
                        clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                #send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(
                            parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], tweets_modified)
                except Exception as e:
                    print(e)
                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                    pass
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for in {}".format(tweetCount,
                                                     constituent_name,
                                                     language))

            if parameters["LOGGING"]:
                doc = [{
                    "date":
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  datetime.now().date().timetuple()),
                    "constituent_name":
                    constituent_name,
                    "constituent_id":
                    constituent_id,
                    "downloaded_tweets":
                    tweetCount,
                    "language":
                    language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"
示例#7
0
def update_from_bigquery_split(args):
    #load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"

    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    #constituents = [("BMWDE8170003036", )]

    start_time = time.time()

    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))

        try:

            query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                    "id_str,retweet_count,favorited,user,lang,created_at,place," \
                    "constituent_name,constituent_id,search_term, relevance " \
                    "FROM `pecten_dataset.tweets_unmodified` " \
                    "WHERE constituent_id = '{}' ".format(constituent_id)

            tweets = storage.get_bigquery_data(query)

            operations = []
            records = 0

            for tweet in tweets:
                row = {}
                row["text"] = tweet["text"]
                row["id"] = tweet["id"]
                row["favorite_count"] = tweet["favorite_count"]
                row["source"] = tweet["source"]
                row["retweeted"] = tweet["retweeted"]
                row["entities"] = tweet["entities"]
                row["id_str"] = tweet["id_str"]
                row["retweet_count"] = tweet["retweet_count"]
                row["favorited"] = tweet["favorited"]
                row["user"] = tweet["user"]
                row["lang"] = tweet["lang"]
                row["created_at"] = tweet["created_at"]
                row["place"] = tweet["place"]
                row["constituent_name"] = tweet["constituent_name"]
                row["constituent_id"] = tweet["constituent_id"]
                row["search_term"] = tweet["search_term"]
                row["relevance"] = tweet["relevance"]

                # Additional fields
                if isinstance(tweet["created_at"], str):
                    date = tweet["created_at"]
                    ts = time.strptime(date, "%a %b %d %H:%M:%S %z %Y")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    row['date'] = ts

                # sentiment score
                row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                # TO DO
                tagged_text = tagger.get_spacy_entities(tweet["text"])
                row["entity_tags"] = get_spacey_tags(tagged_text)

                operations.append(row)
                # print(row)

                if len(operations) == 1000:
                    result = None
                    result = storage.insert_bigquery_data(
                        'pecten_dataset', 'tweets', operations)
                    records += 1000
                    print("Performed bulk write of {} records".format(records))
                    if not result:
                        print("Records not inserted")

                    operations = []

            if len(operations) > 0:
                result = None
                result = storage.insert_bigquery_data('pecten_dataset',
                                                      'tweets', operations)
                records += 1000
                if not result:
                    print("Records not inserted")

        except Exception as e:
            print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
示例#8
0
def get_daily_orbis_news(args):
    if __name__ != "__main__":
        from utils.Storage import Storage
        from utils.Storage import MongoEncoder
        from utils.SOAPUtils import SOAPUtils
        from utils import twitter_analytics_helpers as tah
        from utils.TaggingUtils import TaggingUtils as TU
        from utils.Storage import Storage
        from utils.Storage import MongoEncoder
        from utils.SOAPUtils import SOAPUtils
        from utils import logging_utils as logging_utils

    # get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"

    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = [
        "LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE",
        "LOGGING_TABLE"
    ]
    where = lambda x: x["SOURCE"] == 'Orbis'

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, bvdid in constituents:
        #get last news date for the constituent
        query = """
            SELECT max(news_id) as max_id FROM `{}.{}`
            WHERE constituent_id = '{}' and news_origin = "Orbis"
            """.format(common_parameters["BQ_DATASET"],
                       parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query,
                                               iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        start = 0
        max_count = 20
        print("Constituent: {},{}".format(constituent_name, bvdid))

        stop = False

        try:
            token = soap.get_token(parameters["BVD_USERNAME"],
                                   parameters["BVD_PASSWORD"], 'orbis')
            selection_token, selection_count = soap.find_by_bvd_id(
                token, bvdid, 'orbis')
        except Exception as e:
            print(e)
            return

        while not stop:
            try:
                query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                        "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                        "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                        "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                        "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                        "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                        "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                        "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                        "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                        "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                        "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(
                    start, max_count)

                get_data_result = soap.get_data(token,
                                                selection_token,
                                                selection_count,
                                                query,
                                                'orbis',
                                                timeout=None)
            except Exception as e:
                print(str(e))
                if token:
                    soap.close_connection(token, 'orbis')
                    token = None
                break

            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text

            TESTDATA = StringIO(csv_result)
            try:
                df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"])
            except Exception as e:
                print(e)
                break

            if df.shape[0] == 0:
                print("No records in df")
                break

            initial_shape = df.shape[0]

            df.astype({"news_id": int}, copy=False, errors='ignore')
            df = df.loc[df["news_id"] > max_id]

            if df.shape[0] == 0:
                print("No new data")
                break

            if df.shape[0] != initial_shape:
                stop = True

            # Make news_title column a string
            df.astype({"news_title": str}, copy=False, errors='ignore')

            # Remove duplicate columns
            df.drop_duplicates(["news_title"], inplace=True)

            # Get sentiment score
            df["score"] = df.apply(lambda row: tah.get_nltk_sentiment(
                str(row["news_article_txt"])),
                                   axis=1)

            # get sentiment word
            df["sentiment"] = df.apply(
                lambda row: tah.get_sentiment_word(row["score"]), axis=1)

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            # add URL
            df["url"] = None

            # add show
            df["show"] = True

            # get entity tags
            entity_tags = []
            for text in df["news_title"]:
                tags = tah.get_spacey_tags(tagger.get_spacy_entities(
                    str(text)))
                entity_tags.append(tags)

            fields = [
                "news_date", "news_title", "news_article_txt",
                "news_companies", "news_source", "news_publication",
                "news_topics", "news_country", "news_region", "news_language",
                "news_id", "score", "sentiment", "constituent_id",
                "constituent_name", "constituent", "url", "show"
            ]

            # Get needed fields
            df_bigquery = df[fields]
            bigquery_data = json.loads(
                df_bigquery.to_json(orient="records", date_format="iso"))

            # set entity_tag field
            i = 0
            for i in range(0, len(bigquery_data)):
                bigquery_data[i]["entity_tags"] = entity_tags[i]

                # set news_date
                if "news_date" in bigquery_data[i] and bigquery_data[i][
                        "news_date"]:
                    date = bigquery_data[i]["news_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    bigquery_data[i]["news_date"] = ts
                else:
                    bigquery_data[i]["news_date"] = None

                if bigquery_data[i]["news_title"]:
                    bigquery_data[i]["news_title"] = str(
                        bigquery_data[i]["news_title"])

                if bigquery_data[i]["news_article_txt"]:
                    bigquery_data[i]["news_article_txt"] = str(
                        bigquery_data[i]["news_article_txt"])

                if bigquery_data[i]["news_companies"]:
                    try:
                        bigquery_data[i]["news_companies"] = [
                            i.strip() for i in bigquery_data[i]
                            ["news_companies"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_companies"] = []
                else:
                    bigquery_data[i]["news_companies"] = []

                if bigquery_data[i]["news_topics"]:
                    try:
                        if bigquery_data[i]["news_topics"].isdigit():
                            bigquery_data[i]["news_topics"] = []
                        elif bigquery_data[i]["news_topics"] == 'None':
                            bigquery_data[i]["news_topics"] = []
                        else:
                            bigquery_data[i]["news_topics"] = [
                                i.strip() for i in bigquery_data[i]
                                ["news_topics"].split(";")
                            ]
                    except Exception as e:
                        bigquery_data[i]["news_topics"] = []
                else:
                    bigquery_data[i]["news_topics"] = []

                if bigquery_data[i]["news_country"]:
                    try:
                        bigquery_data[i]["news_country"] = [
                            i.strip() for i in bigquery_data[i]
                            ["news_country"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_country"] = []
                else:
                    bigquery_data[i]["news_country"] = []

                if bigquery_data[i]["news_region"]:
                    try:
                        bigquery_data[i]["news_region"] = [
                            i.strip()
                            for i in bigquery_data[i]["news_region"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_region"] = []
                else:
                    bigquery_data[i]["news_region"] = []

                if bigquery_data[i]["news_language"]:
                    bigquery_data[i]["news_language"] = str(
                        bigquery_data[i]["news_language"])

                if bigquery_data[i]["news_source"]:
                    bigquery_data[i]["news_source"] = str(
                        bigquery_data[i]["news_source"])

                if bigquery_data[i]["news_publication"]:
                    bigquery_data[i]["news_publication"] = str(
                        bigquery_data[i]["news_publication"])

                if bigquery_data[i]["news_id"]:
                    try:
                        bigquery_data[i]["news_id"] = int(
                            bigquery_data[i]["news_id"])
                    except Exception as e:
                        bigquery_data[i]["news_id"] = str(
                            bigquery_data[i]["news_id"])

                if bigquery_data[i]["sentiment"]:
                    bigquery_data[i]["sentiment"] = str(
                        bigquery_data[i]["sentiment"])

                if bigquery_data[i]["constituent_id"]:
                    bigquery_data[i]["constituent_id"] = str(
                        bigquery_data[i]["constituent_id"])

                if bigquery_data[i]["constituent_name"]:
                    bigquery_data[i]["constituent_name"] = str(
                        bigquery_data[i]["constituent_name"])

                if bigquery_data[i]["constituent"]:
                    bigquery_data[i]["constituent"] = str(
                        bigquery_data[i]["constituent"])

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             bigquery_data)
            except Exception as e:
                print(e)

            start = start + 20

            print("Records saved: {}".format(df.shape[0]))

            log = [{
                "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                "constituent_name": constituent_name,
                "constituent_id": constituent_id,
                "downloaded_news": df.shape[0],
                "source": "Orbis"
            }]

            if parameters["LOGGING"] and bigquery_data:
                logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

        if token:
            soap.close_connection(token, 'orbis')
示例#9
0
def get_historical_orbis_news(user, pwd, database, google_key_path,
                              param_connection_string):
    #get parameters
    soap = SOAPUtils()
    storage = Storage(google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"

    constituents = storage.get_sql_data(
        sql_connection_string=param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)[2:]

    to_skip = [
        "BAYERISCHE MOTOREN WERKE AG", "DEUTSCHE BANK AG", "SIEMENS AG",
        "SAP SE", "VOLKSWAGEN AG"
    ]

    for constituent_id, constituent_name, bvdid in constituents:
        limit = get_number_of_news_items(constituent_name)
        if constituent_name in to_skip:
            print("Skipping")
            continue

        records = 0
        start = 0
        max_count = 20
        filename = "bq_news_{}.json".format(constituent_id)
        print("Constituent: {},{}".format(constituent_name, bvdid))
        failed = 0

        if constituent_name == "BASF SE":
            start = 4280

        try:
            token = soap.get_token(user, pwd, database)
            selection_token, selection_count = soap.find_by_bvd_id(
                token, bvdid, database)
        except Exception as e:
            print(e)
            return

        with open(filename, "a") as f:
            while limit > records:
                try:
                    query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                            "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                            "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                            "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                            "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                            "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                            "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                            "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                            "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                            "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                            "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(
                        start, max_count)

                    get_data_result = soap.get_data(token,
                                                    selection_token,
                                                    selection_count,
                                                    query,
                                                    database,
                                                    timeout=None)
                except Exception as e:
                    print(str(e))
                    continue
                finally:
                    pass
                    #if token:
                    #    soap.close_connection(token, database)

                result = ET.fromstring(get_data_result)
                csv_result = result[0][0][0].text

                TESTDATA = StringIO(csv_result)
                try:
                    df = pd.read_csv(TESTDATA,
                                     sep=",",
                                     parse_dates=["news_date"])
                except Exception as e:
                    print(e)
                    continue

                if df.shape[0] == 0:
                    print("No records in df")
                    continue

                #Make news_title column a string
                df.astype({"news_title": str}, copy=False, errors='ignore')

                # Remove duplicate columns
                df.drop_duplicates(["news_title"], inplace=True)

                # Get sentiment score
                df["score"] = df.apply(lambda row: tah.get_nltk_sentiment(
                    str(row["news_article_txt"])),
                                       axis=1)
                #df["score"] = df.apply(lambda row: sia.polarity_scores(str(row["news_article_txt"]))['compound'] , axis=1)

                # get sentiment word
                df["sentiment"] = df.apply(
                    lambda row: tah.get_sentiment_word(row["score"]), axis=1)

                # add constituent name, id and old name
                df["constituent_id"] = constituent_id
                df["constituent_name"] = constituent_name
                old_constituent_name = tah.get_old_constituent_name(
                    constituent_id)
                df["constituent"] = old_constituent_name

                # add URL
                df["url"] = None

                # add show
                df["show"] = True

                # get entity tags
                entity_tags = []
                for text in df["news_title"]:
                    tags = tah.get_spacey_tags(
                        tagger.get_spacy_entities(str(text)))
                    entity_tags.append(tags)

                fields = [
                    "news_date", "news_title", "news_article_txt",
                    "news_companies", "news_source", "news_publication",
                    "news_topics", "news_country", "news_region",
                    "news_language", "news_id", "score", "sentiment",
                    "constituent_id", "constituent_name", "constituent", "url",
                    "show"
                ]
                '''
                # Save to MongoDB
                filter = ["NEWS_DATE_NewsDim", "NEWS_TITLE_NewsDim", "NEWS_ARTICLE_TXT_NewsDim",
                          "NEWS_SOURCE_NewsDim", "NEWS_PUBLICATION_NewsDim", "categorised_tag", "score", "sentiment",
                          "constituent_id", "constituent_name", "constituent","url", "show"]

                df_mongo = df[fields]
                df_mongo.columns = filter

                mongo_data = json.loads(df_mongo.to_json(orient="records", date_format="iso"))

                # set entity_tag field
                i = 0
                for i in range(0, len(mongo_data)):
                    mongo_data[i]["entity_tags"] = entity_tags[i]
                    date = mongo_data[i]["NEWS_DATE_NewsDim"]
                    #ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    mongo_data[i]["NEWS_DATE_NewsDim"] = ts

                #storage.save_to_mongodb(mongo_data, "dax_gcp", "all_news")
                '''

                #Get needed fields
                df_bigquery = df[fields]
                bigquery_data = json.loads(
                    df_bigquery.to_json(orient="records", date_format="iso"))

                # set entity_tag field
                i = 0
                for i in range(0, len(bigquery_data)):
                    bigquery_data[i]["entity_tags"] = entity_tags[i]

                    #set news_date
                    if "news_date" in bigquery_data[i] and bigquery_data[i][
                            "news_date"]:
                        date = bigquery_data[i]["news_date"]
                        ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                        ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                        bigquery_data[i]["news_date"] = ts
                    else:
                        bigquery_data[i]["news_date"] = None

                    if bigquery_data[i]["news_title"]:
                        bigquery_data[i]["news_title"] = str(
                            bigquery_data[i]["news_title"])

                    if bigquery_data[i]["news_article_txt"]:
                        bigquery_data[i]["news_article_txt"] = str(
                            bigquery_data[i]["news_article_txt"])

                    if bigquery_data[i]["news_companies"]:
                        try:
                            bigquery_data[i]["news_companies"] = [
                                i.strip() for i in bigquery_data[i]
                                ["news_companies"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_companies"] = []
                    else:
                        bigquery_data[i]["news_companies"] = []

                    if bigquery_data[i]["news_topics"]:
                        try:
                            if bigquery_data[i]["news_topics"].isdigit():
                                bigquery_data[i]["news_topics"] = []
                            elif bigquery_data[i]["news_topics"] == 'None':
                                bigquery_data[i]["news_topics"] = []
                            else:
                                bigquery_data[i]["news_topics"] = [
                                    i.strip() for i in bigquery_data[i]
                                    ["news_topics"].split(";")
                                ]
                        except Exception as e:
                            bigquery_data[i]["news_topics"] = []
                    else:
                        bigquery_data[i]["news_topics"] = []

                    if bigquery_data[i]["news_country"]:
                        try:
                            bigquery_data[i]["news_country"] = [
                                i.strip() for i in bigquery_data[i]
                                ["news_country"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_country"] = []
                    else:
                        bigquery_data[i]["news_country"] = []

                    if bigquery_data[i]["news_region"]:
                        try:
                            bigquery_data[i]["news_region"] = [
                                i.strip() for i in bigquery_data[i]
                                ["news_region"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_region"] = []
                    else:
                        bigquery_data[i]["news_region"] = []

                    if bigquery_data[i]["news_language"]:
                        bigquery_data[i]["news_language"] = str(
                            bigquery_data[i]["news_language"])

                    if bigquery_data[i]["news_source"]:
                        bigquery_data[i]["news_source"] = str(
                            bigquery_data[i]["news_source"])

                    if bigquery_data[i]["news_publication"]:
                        bigquery_data[i]["news_publication"] = str(
                            bigquery_data[i]["news_publication"])

                    if bigquery_data[i]["news_id"]:
                        try:
                            bigquery_data[i]["news_id"] = int(
                                bigquery_data[i]["news_id"])
                        except Exception as e:
                            bigquery_data[i]["news_id"] = str(
                                bigquery_data[i]["news_id"])

                    if bigquery_data[i]["sentiment"]:
                        bigquery_data[i]["sentiment"] = str(
                            bigquery_data[i]["sentiment"])

                    if bigquery_data[i]["constituent_id"]:
                        bigquery_data[i]["constituent_id"] = str(
                            bigquery_data[i]["constituent_id"])

                    if bigquery_data[i]["constituent_name"]:
                        bigquery_data[i]["constituent_name"] = str(
                            bigquery_data[i]["constituent_name"])

                    if bigquery_data[i]["constituent"]:
                        bigquery_data[i]["constituent"] = str(
                            bigquery_data[i]["constituent"])

                    f.write(
                        json.dumps(bigquery_data[i], cls=MongoEncoder) + '\n')

                # storage.insert_bigquery_data("pecten_dataset", "news", bigquery_data)

                start = start + 20
                #end = start + 10
                records += 20
                print("Records saved: {}".format(records))

        if token:
            soap.close_connection(token, database)
示例#10
0
def get_rss_feed(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage

    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'RSS'

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    for constituent_id, constituent_name, symbol in all_constituents:
        # get last news date for the constituent
        query = """
                    SELECT max(news_date) as max_date FROM `{}.{}`
                    WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS"
                    """.format(common_parameters["BQ_DATASET"],
                               parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query,
                                                      iterator_flag=False)
            max_date = result[0]["max_date"]
        except Exception as e:
            max_date = None
            continue

        print("Getting RSS Feed for {}".format(constituent_name))
        d = feedparser.parse(
            'http://finance.yahoo.com/rss/headline?s={}'.format(symbol +
                                                                ".DE"))

        to_insert = []

        for post in d.entries:
            #print(post)
            date = datetime(post.published_parsed[0], post.published_parsed[1],
                            post.published_parsed[2], post.published_parsed[3],
                            post.published_parsed[4])
            if max_date:
                max_date = str(max_date)[:19]
                max_date = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S')
                if date < max_date:
                    continue

            doc = {
                "url": post.link,
                "news_date": date.strftime('%Y-%m-%d %H:%M:%S'),
                "news_language": post.summary_detail["language"],
                "news_title": post.title,
                "news_article_txt": post.summary,
                "news_origin": "Yahoo Finance RSS",
                "show": True
            }

            # Get sentiment score
            doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"]))

            # get sentiment word
            doc["sentiment"] = tah.get_sentiment_word(doc["score"])

            # add constituent name, id and old name
            doc["constituent_id"] = constituent_id
            doc["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            doc["constituent"] = old_constituent_name

            to_insert.append(doc)

        try:
            print("Inserting into BQ")
            storage_client.insert_bigquery_data(
                common_parameters["BQ_DATASET"],
                parameters["DESTINATION_TABLE"], to_insert)
        except Exception as e:
            print(e)

        log = [{
            "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
            "constituent_name": constituent_name,
            "constituent_id": constituent_id,
            "downloaded_news": len(to_insert),
            "source": "Yahoo Finance RSS"
        }]

        if parameters["LOGGING"] and to_insert:
            logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                  parameters["LOGGING_TABLE"], storage_client)
示例#11
0
def get_twitter_analytics_top_orgs_bq(args, from_date, to_date):
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table,
                                parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    collection = db["tweets"]
    twitter_analytics_top_orgs = db["twitter_analytics_top_orgs"]

    for constituent_id, constituent_name in all_constituents:
        print(constituent_name)
        pipeline = [{
            "$match": {
                "constituent_id": constituent_id,
                "date": {
                    "$gte": from_date,
                    "$lte": to_date
                }
            }
        }, {
            "$project": {
                "constituent_id": "$constituent_id",
                "constituent_name": "$constituent_name",
                "org_tags": "$entity_tags.ORG",
                "constituent": "$constituent"
            }
        }, {
            "$unwind": "$org_tags"
        }, {
            "$group": {
                "_id": {
                    "constituent_id": "$constituent_id",
                    "org_tags": "$org_tags",
                    "constituent_name": "$constituent_name"
                },
                "count": {
                    "$sum": 1
                }
            }
        }, {
            "$sort": SON([("count", -1)])
        }, {
            "$project": {
                "_id": 0,
                "constituent_id": "$_id.constituent_id",
                "constituent_name": "$_id.constituent_name",
                "org_tags": "$_id.org_tags",
                "count": "$count",
                "from_date": from_date,
                "to_date": to_date,
                "date": datetime.now()
            }
        }]

        results = list(collection.aggregate(pipeline))
        to_return = []

        for r in results:
            if r["constituent_name"].lower() in r["org_tags"].lower() or \
               r["org_tags"].lower() in r["constituent_name"].lower() or \
               " AG".lower() in r["org_tags"].lower() or \
               len(r["org_tags"].split(" ")) > 3 or \
               r["count"] < 10 or \
               r["org_tags"].lower() == "Trends".lower() or \
               r["org_tags"].lower() == "Share".lower():
                continue
            else:
                to_return.append(r)

        if to_return:
            twitter_analytics_top_orgs.insert_many(to_return)
        else:
            if results:
                twitter_analytics_top_orgs.insert_many(results)
示例#12
0
def get_twitter_analytics_latest_price_tweets(args, from_date, to_date):
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table,
                                parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    collection = db["tweets"]
    twitter_analytics_latest_price_tweets = db[
        "twitter_analytics_latest_price_tweets"]

    for constituent_id, constituent_name, symbol in all_constituents:
        symbol = '$' + symbol
        print(symbol)
        to_return = []
        final = []

        results = list(
            collection.find(
                {
                    "date": {
                        "$gte": from_date,
                        "$lte": to_date
                    },
                    "constituent_id": constituent_id,
                    "entities.symbols": {
                        "$exists": True,
                        "$ne": []
                    }
                }, {
                    "text": 1,
                    "constituent_id": 1,
                    "constituent_name": 1,
                    "constituent": 1,
                    "entity_tags.MONEY": 1,
                    "sentiment_score": 1,
                    "date": 1,
                    "_id": 0
                }).sort([("date", -1)]))

        if results:
            print("A")
            for item in results:
                if symbol in item["text"].split(" "):
                    if item["entity_tags"]["MONEY"]:
                        text = item["text"].replace("$", "")
                        text = text.replace("€", "")
                        text = text.replace("EUR", " ")
                        tokens = text.split(" ")
                        contains_number = False
                        for t in tokens:
                            try:
                                float(t)
                                contains_number = True
                            except Exception as e:
                                pass

                        if contains_number:
                            clean_tokens = [
                                t for t in item["text"].split(" ")
                                if "http" not in t
                            ]
                            item["text"] = " ".join(clean_tokens)
                            item["tweet_date"] = item["date"]
                            item["date"] = datetime.now()
                            item["from_date"] = from_date
                            item["to_date"] = to_date
                            to_return.append(item)

            if to_return:
                df = pd.DataFrame(to_return)
                df.drop_duplicates("text", inplace=True)

                final = df.to_json(orient="records", date_format="iso")
                final = json.loads(final)

                for f in final:
                    f["from_date"] = datetime.strptime(
                        f["from_date"], "%Y-%m-%dT%H:%M:%S.%fZ")
                    f["to_date"] = datetime.strptime(f["to_date"],
                                                     "%Y-%m-%dT%H:%M:%S.%fZ")
            else:
                print("A2")
                results = list(
                    collection.find(
                        {
                            "date": {
                                "$gte": from_date,
                                "$lte": to_date
                            },
                            "constituent_id": constituent_id,
                            "user.followers_count": {
                                "$gte": 200
                            }
                        }, {
                            "text": 1,
                            "constituent_id": 1,
                            "constituent_name": 1,
                            "constituent": 1,
                            "entity_tags.MONEY": 1,
                            "sentiment_score": 1,
                            "date": 1,
                            "_id": 0
                        }).limit(5).sort([("date", -1)]))

                if results:
                    df = pd.DataFrame(results)
                    df.drop_duplicates("text", inplace=True)

                    final = df.to_json(orient="records", date_format="iso")
                    final = json.loads(final)

                    for f in final:
                        f["from_date"] = from_date
                        f["to_date"] = to_date
                else:
                    print("B2")
                    final = [{
                        'constituent': constituent_name,
                        'constituent_id': constituent_id,
                        'constituent_name': constituent_name,
                        'date': datetime.now(),
                        'entity_tags': {
                            'MONEY': ['']
                        },
                        'from_date': from_date,
                        'sentiment_score': 0.0,
                        'text': 'No new tweets.',
                        'to_date': to_date,
                        'tweet_date': None
                    }]

        else:
            print("B")
            results = list(
                collection.find(
                    {
                        "date": {
                            "$gte": from_date,
                            "$lte": to_date
                        },
                        "constituent_id": constituent_id,
                        "user.followers_count": {
                            "$gte": 200
                        }
                    }, {
                        "text": 1,
                        "constituent_id": 1,
                        "constituent_name": 1,
                        "constituent": 1,
                        "entity_tags.MONEY": 1,
                        "sentiment_score": 1,
                        "date": 1,
                        "_id": 0
                    }).limit(5).sort([("date", -1)]))

            if results:
                df = pd.DataFrame(results)
                df.drop_duplicates("text", inplace=True)

                final = df.to_json(orient="records", date_format="iso")
                final = json.loads(final)

                for f in final:
                    f["from_date"] = from_date
                    f["to_date"] = to_date

            else:
                print("C")
                results = list(
                    collection.find({
                        "constituent_id": constituent_id
                    }, {
                        "text": 1,
                        "constituent_id": 1,
                        "constituent_name": 1,
                        "constituent": 1,
                        "entity_tags.MONEY": 1,
                        "sentiment_score": 1,
                        "date": 1,
                        "_id": 0
                    }).limit(5).sort([("date", -1)]))

                if results:
                    df = pd.DataFrame(results)
                    df.drop_duplicates("text", inplace=True)

                    final = df.to_json(orient="records", date_format="iso")
                    final = json.loads(final)

                    for f in final:
                        f["from_date"] = from_date
                        f["to_date"] = to_date

                else:
                    print("D")
                    final = [{
                        'constituent': constituent_name,
                        'constituent_id': constituent_id,
                        'constituent_name': constituent_name,
                        'date': datetime.now(),
                        'entity_tags': {
                            'MONEY': ['']
                        },
                        'from_date': from_date,
                        'sentiment_score': 0.0,
                        'text': 'No new tweets.',
                        'to_date': to_date,
                        'tweet_date': None
                    }]

        if final:
            twitter_analytics_latest_price_tweets.insert_many(final)
        time.sleep(1)
示例#13
0
def extract_ticker_data(args):
    if __name__ != "__main__":
        from utils import logging_utils as logging_utils
        from utils import twitter_analytics_helpers as tah
        from utils.Storage import Storage

    #ua = UserAgent()
    base_url = "http://charts.finanzen.net/ChartData.ashx?request="

    #Get parameters
    param_table = "PARAM_TICKER_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    #Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # Get constituents
    storage = Storage(args.google_key_path)

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]
    table = "PARAM_TICKER_KEYS"

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    i = 0
    for constituent_id, constituent_name, url_key in all_constituents:
        if i == 10 or i == 29:
            time.sleep(900)

        # get last deal
        query = """
                SELECT max(datetime) as max_date FROM `{}.{}`
                WHERE constituent_id = '{}';
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query,
                                               iterator_flag=False)
        except Exception as e:
            print(e)
            return

        max_date = result[0]["max_date"]

        print("Getting ticker data for {}".format(constituent_name))
        url = base_url + url_key + '6000'

        list_of_ticks = []

        try:
            response = requests.get(url)
            if response.status_code == requests.codes.ok:
                data = response.text.splitlines()

                for item in data[1:-2]:
                    fields = item.split(';')

                    if max_date and max_date.date() == datetime.strptime(
                            fields[0], "%Y-%m-%d-%H-%M-%S-%f").date():
                        print("Last date in ticker_data is same date as today")
                        break

                    document = {
                        'constituent_name':
                        constituent_name,
                        'constituent_id':
                        constituent_id,
                        'date':
                        fields[0][0:10],
                        'time':
                        fields[0][11:19].replace('-', ':'),
                        'datetime':
                        datetime.strptime(fields[0],
                                          "%Y-%m-%d-%H-%M-%S-%f").strftime(
                                              '%Y-%m-%d %H:%M:%S'),
                        'open':
                        float(fields[1].replace(',', '')),
                        'high':
                        float(fields[2].replace(',', '')),
                        'low':
                        float(fields[3].replace(',', '')),
                        'close':
                        float(fields[4].replace(',', '')),
                        'volume':
                        float(fields[5].replace(',', ''))
                    }

                    list_of_ticks.append(document)

                if list_of_ticks:
                    try:
                        print("Inserting into BQ")
                        storage.insert_bigquery_data(
                            common_parameters["BQ_DATASET"],
                            parameters["DESTINATION_TABLE"], list_of_ticks)
                        # pprint(list_of_ticks)
                    except Exception as e:
                        print(e)

                    if parameters["LOGGING"]:
                        doc = [{
                            "date":
                            datetime.now().date().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                            "constituent_name":
                            constituent_name,
                            "constituent_id":
                            constituent_id,
                            "downloaded_ticks":
                            len(list_of_ticks)
                        }]
                        logging_utils.logging(doc,
                                              common_parameters["BQ_DATASET"],
                                              parameters["LOGGING_TABLE"],
                                              storage)

                i += 1

            else:
                print("Error making request, code {}. Retrying...".format(
                    response.status_code))
                time.sleep(20)
                response = requests.get(url)
                if response.status_code == requests.codes.ok:
                    data = response.text.splitlines()

                    for item in data[1:-2]:
                        fields = item.split(';')

                        document = {
                            'constituent_name':
                            constituent_name,
                            'constituent_id':
                            constituent_id,
                            'date':
                            fields[0][0:10],
                            'time':
                            fields[0][11:19].replace('-', ':'),
                            'datetime':
                            datetime.strptime(fields[0],
                                              "%Y-%m-%d-%H-%M-%S-%f").strftime(
                                                  '%Y-%m-%d %H:%M:%S'),
                            'open':
                            float(fields[1].replace(',', '')),
                            'high':
                            float(fields[2].replace(',', '')),
                            'low':
                            float(fields[3].replace(',', '')),
                            'close':
                            float(fields[4].replace(',', '')),
                            'volume':
                            float(fields[5].replace(',', ''))
                        }

                        list_of_ticks.append(document)

                    if list_of_ticks:
                        try:
                            print("Inserting into BQ")
                            storage.insert_bigquery_data(
                                common_parameters["BQ_DATASET"],
                                parameters["DESTINATION_TABLE"], list_of_ticks)
                        except Exception as e:
                            print(e)

                        if parameters["LOGGING"]:
                            doc = [{
                                "date":
                                datetime.now().date().strftime(
                                    '%Y-%m-%d %H:%M:%S'),
                                "constituent_name":
                                constituent_name,
                                "constituent_id":
                                constituent_id,
                                "downloaded_ticks":
                                len(list_of_ticks)
                            }]
                            logging_utils.logging(
                                doc, common_parameters["BQ_DATASET"],
                                parameters["LOGGING_TABLE"], storage)

                    i += 1

                else:
                    return "Error making request, code {}. Skipping.".format(
                        response.status_code)

        except Exception as ex:
            return str(ex)
        finally:
            time.sleep(5)
示例#14
0
def update_from_bigquery(args):
    #load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"

    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    start_time = time.time()

    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))
        while True:

            try:
                # get max id inserted from target table
                q = "SELECT max(id) as max_id " \
                    "FROM `pecten_dataset.tweets` " \
                    "WHERE constituent_id = '{}'".format(constituent_id)

                max_id = storage.get_bigquery_data(
                    q, iterator_flag=False)[0]["max_id"]

                if not max_id:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE constituent_id = '{}' " \
                            "ORDER BY id ASC".format(constituent_id)

                else:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE id > {} AND constituent_id = '{}' " \
                            "ORDER BY id ASC".format(max_id, constituent_id)

                tweets = storage.get_bigquery_data(query)

                if tweet.total_rows == 0:
                    print("Finished for {}".format(constituent_id))
                    break

                operations = []
                records = 0

                for tweet in tweets:
                    row = {}
                    row["text"] = tweet["text"]
                    row["id"] = tweet["id"]
                    row["favorite_count"] = tweet["favorite_count"]
                    row["source"] = tweet["source"]
                    row["retweeted"] = tweet["retweeted"]
                    row["entities"] = tweet["entities"]
                    row["id_str"] = tweet["id_str"]
                    row["retweet_count"] = tweet["retweet_count"]
                    row["favorited"] = tweet["favorited"]
                    row["user"] = tweet["user"]
                    row["lang"] = tweet["lang"]
                    row["created_at"] = tweet["created_at"]
                    row["place"] = tweet["place"]
                    row["constituent_name"] = tweet["constituent_name"]
                    row["constituent_id"] = tweet["constituent_id"]
                    row["search_term"] = tweet["search_term"]
                    row["relevance"] = tweet["relevance"]

                    # Additional fields
                    if isinstance(tweet["created_at"], str):
                        row['date'] = convert_timestamp(tweet["created_at"])

                    # sentiment score
                    row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                    # TO DO
                    tagged_text = tagger.get_spacy_entities(tweet["text"])
                    row["entity_tags"] = get_spacey_tags(tagged_text)

                    operations.append(row)
                    # print(row)

                    if len(operations) == 1000:
                        result = storage.insert_bigquery_data(
                            'pecten_dataset', 'tweets', operations)
                        records += 1000
                        print("Performed bulk write of {} records".format(
                            records))
                        if not result:
                            print("Records not inserted")

                        operations = []

                if len(operations) > 0:
                    result = storage.insert_bigquery_data(
                        'pecten_dataset', 'tweets', operations)
                    records += 1000
                    if not result:
                        print("Records not inserted")

            except Exception as e:
                print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
示例#15
0
def get_stocktwits(args):
    from utils import logging_utils as logging_utils
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU

    # get constituents
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]
    table = "PARAM_STOCKTWITS_KEYS"

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_STOCKTWITS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, url_key in all_constituents:
        #Get the last id
        query = """
                SELECT max(id) as max_id FROM `{}.{}`
                WHERE constituent_id = '{}' and source = 'StockTwits'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query,
                                               iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        print("Gettins stocktwits for {}:{}".format(constituent_name, url_key))
        response = requests.get(
            "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format(
                url_key))
        to_insert = []

        if response.status_code == 200:
            for item in response.json()["messages"]:
                if max_id:
                    if int(item['id']) < max_id:
                        continue
                doc = tah.create_tweet_skelleton()
                doc['text'] = item['body']
                doc['created_at'] = item['created_at']
                doc['date'] = item['created_at']
                doc['id'] = item['id']
                doc['id_str'] = str(item['id'])

                if "symbols" in item:
                    for symbol in item['symbols']:
                        doc['entities']['symbols'].append({
                            'indices': [],
                            'text':
                            symbol['symbol']
                        })

                if 'entities' in item:
                    if 'sentiment' in item['entities'] and item['entities'][
                            'sentiment']:
                        if 'basic' in item['entities']['sentiment']:
                            doc['user']['description'] = item['entities'][
                                'sentiment']['basic']

                doc['source'] = 'StockTwits'
                doc['constituent_name'] = constituent_name
                doc['constituent_id'] = constituent_id
                doc['search_term'] = ''
                doc["constituent"] = tah.get_old_constituent_name(
                    constituent_id)
                doc['relevance'] = 1
                doc["sentiment_score"] = tah.get_nltk_sentiment(
                    str(doc["text"]))
                tagged_text = tagger.get_spacy_entities(str(doc["text"]))
                doc["entity_tags"] = tah.get_spacey_tags(tagged_text)

                to_insert.append(doc)

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             to_insert)
            except Exception as e:
                print(e)

            if parameters["LOGGING"]:
                doc = [{
                    "date":
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  datetime.now().date().timetuple()),
                    "constituent_name":
                    constituent_name,
                    "constituent_id":
                    constituent_id,
                    "downloaded_tweets":
                    len(to_insert),
                    "language":
                    'StockTwits'
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

        else:
            print(response.text)
示例#16
0
def get_bloomberg_news(args):
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU
    from utils import logging_utils as logging_utils

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    #Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Bloomberg'

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    #get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_BLOOMBERG_KEYS",
        sql_column_list=[
            "CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "PAGES"
        ])

    # Define random user agent object
    #ua = UserAgent()

    # Base url for search queries
    base = 'https://bloomberg.com/search?query={}&sort=time:desc&page={}'

    # Iterate over companies
    for constituent_id, constituent_name, url_key, pages in all_constituents:
        # Get date of latest news article for this constituent for Bloomberg
        query = """
        SELECT max(news_date) as last_date FROM `{}.{}`
        WHERE constituent_id = '{}' AND news_origin = 'Bloomberg'
        """.format(common_parameters["BQ_DATASET"],
                   parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query,
                                                      iterator_flag=False)
            last_date_bq = result[0]["last_date"]
        except Exception as e:
            last_date_bq = None

        print('Scraping for {} up to {}.'.format(constituent_name,
                                                 str(last_date_bq)))
        # iterate through specified page range and save articles to database
        in_target = True
        i = 1
        while in_target:
            print('Scraping for page', i)
            # format url with company name and current page number and create bs4 object from the response
            url = base.format(url_key.replace('.', '_').replace(' ', '_'), i)
            i += 1
            main_soup = BeautifulSoup(requests.get(url).content, 'html.parser')

            # check if there are any results on the page and finish the loop if none
            try:
                if 'No results for' in main_soup.find(
                        'div', {
                            'class': 'content-no-results'
                        }).text:
                    print('No more articles...')
                    break
            except AttributeError:
                pass

            # iterate through article elelements on the page
            results = main_soup.findAll('div', {'class': 'search-result'})
            times = [
                x.find('time', {
                    'class': 'published-at'
                }).text.strip() for x in results
            ]
            times = [datetime.strptime(x, '%b %d, %Y') for x in times]
            print("Looping times")
            for time in times:
                if last_date_bq:
                    if time < datetime.strptime(
                            last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                        print('Target date reached')
                        in_target = False

            to_insert = []
            print("Looping articles...")
            for art in results:
                art_type = art.find('article')['class']
                # check if the result is an article
                if 'type-article' in art_type:
                    url = art.find('h1').find('a')['href']
                    d = get_article(url, None)

                    if d is None:
                        continue

                    if last_date_bq:
                        if datetime.strptime(d["news_date"], "%Y-%m-%d %H:%M:%S") < \
                                datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                            continue

                    #set extra fields:
                    #score
                    if d["news_article_txt"]:
                        d['score'] = tah.get_nltk_sentiment(
                            str(d["news_article_txt"]))

                    # sentiment
                    d['sentiment'] = tah.get_sentiment_word(d["score"])

                    # constituent fields
                    d["constituent_id"] = constituent_id
                    d["constituent_name"] = constituent_name
                    d["constituent"] = tah.get_old_constituent_name(
                        constituent_id)

                    #url
                    d["url"] = url

                    #show
                    d["show"] = True

                    # entity_tags
                    d["entity_tags"] = tah.get_spacey_tags(
                        tagger.get_spacy_entities(str(d["news_title"])))

                    to_insert.append(d)

            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{
                        "date":
                        datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                        "constituent_name":
                        constituent_name,
                        "constituent_id":
                        constituent_id,
                        "downloaded_news":
                        len(to_insert),
                        "source":
                        "Bloomberg"
                    }]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"],
                                          storage_client)
            else:
                break
示例#17
0
def get_daily_zephyr_ma_deals(args):
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage
    from utils.Storage import MongoEncoder
    from utils.SOAPUtils import SOAPUtils

    zephyr_query = """
        DEFINE P1 AS [Parameters.Currency=SESSION;],
        P2 AS [Parameters.RepeatingDimension=NrOfTargets],
        P3 AS [Parameters.RepeatingDimension=NrOfBidders],
        P5 AS [Parameters.RepeatingDimension=NrOfVendors];
        SELECT LINE RECORD_ID as record_id,
        LINE DEAL_OVERVIEW.DEAL_HEADLINE AS deal_headline,
        LINE DEAL_OVERVIEW.TGNAME USING P2 AS target,
        LINE DEAL_OVERVIEW.BDNAME USING P3 AS acquiror,
        LINE DEAL_OVERVIEW.VDNAME USING P5 AS vendor,
        LINE DEAL_OVERVIEW.DEALTYPE AS deal_type,
        LINE DEAL_OVERVIEW.DEAL_STATUS AS deal_status,
        LINE DEAL_STRUCTURE_AND_DATES.COMPLETION_DATE AS completion_date,
        LINE DEAL_STRUCTURE_AND_DATES.RUMOUR_DATE AS rumour_date,
        LINE DEAL_STRUCTURE_AND_DATES.ANNOUNCE_DATE AS announced_date,
        LINE DEAL_STRUCTURE_AND_DATES.EXPECTED_COMPLETION_DATE AS expected_completion_date,
        LINE DEAL_STRUCTURE_AND_DATES.ASSUMED_COMPLETION_DATE AS assumed_completion_date,
        LINE DEAL_STRUCTURE_AND_DATES.POSTPONED_DATE AS postponed_date,
        LINE DEAL_STRUCTURE_AND_DATES.WITHDRAWN_DATE AS withdrawn_date,
        LINE DEAL_OVERVIEW.ALLDLVALUE USING P1 AS deal_value
        FROM RemoteAccess.U ORDERBY 1 DESCENDING
        """

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ['BVD_USERNAME', 'BVD_PASSWORD', "LOGGING","DESTINATION_TABLE","LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Zephyr'

    parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where)

    # Get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "STRATEGY"]
    table = "PARAM_NEWS_ZEPHYR_STRATEGIES"

    constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string,
                                        sql_table_name=table,
                                        sql_column_list=columns)

    fields = ["record_id", "deal_headline", "target", "acquiror", "vendor", "deal_type", "deal_status",
              "completion_date", "deal_value",
              "rumour_date", "announced_date", "expected_completion_date", "assumed_completion_date", "postponed_date",
              "withdrawn_date"]

    for constituent_id, constituent_name, strategy in constituents:
        # get last deal
        query = """
                SELECT max(record_id) as max_id FROM `{}.{}`
                WHERE constituent_id = '{}';
        """.format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
            print("Getting M&A deals for {}".format(constituent_name))

            token = soap.get_token(parameters['BVD_USERNAME'], parameters['BVD_PASSWORD'], 'zephyr')
            selection_token, selection_count = soap.find_with_strategy(token, strategy, "zephyr")
            get_data_result = soap.get_data(token, selection_token, selection_count, zephyr_query, 'zephyr',
                                            timeout=None,
                                            number_of_records=100)

            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text

            TESTDATA = StringIO(csv_result)
            df = pd.read_csv(TESTDATA, sep=",", parse_dates=["completion_date", "rumour_date", "announced_date",
                                                         "expected_completion_date", "assumed_completion_date",
                                                         "postponed_date", "withdrawn_date"])

            df.astype({"record_id": int}, copy=False, errors='ignore')
            print("Retrieved {} items".format(df.shape[0]))

            df = df.loc[df["record_id"] > max_id]
            print("New records {} items".format(df.shape[0]))

            if df.shape[0] == 0:
                if token:
                    soap.close_connection(token, 'zephyr')
                continue

            df = df[fields]

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            data = json.loads(df.to_json(orient="records", date_format="iso"))
            print(len(data))

            for item in data:
                if "completion_date" in item and item["completion_date"]:
                    date = item["completion_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["completion_date"] = ts

                if "rumour_date" in item and item["rumour_date"]:
                    date = item["rumour_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["rumour_date"] = ts

                if "announced_date" in item and item["announced_date"]:
                    date = item["announced_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["announced_date"] = ts

                if "expected_completion_dat" in item and item["expected_completion_dat"]:
                    date = item["expected_completion_dat"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["expected_completion_date"] = ts

                if "assumed_completion_date" in item and item["assumed_completion_date"]:
                    date = item["assumed_completion_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["assumed_completion_datee"] = ts

                if "postponed_date" in item and item["postponed_date"]:
                    date = item["postponed_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["postponed_date"] = ts

                if "withdrawn_date" in item and item["withdrawn_date"]:
                    date = item["withdrawn_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    item["withdrawn_date"] = ts

            if data:
                print("Inserting records to BQ")
                try:
                    #open("ma_deals.json", 'w').write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data))
                    storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], data)
                except Exception as e:
                    print(e)
                    soap.close_connection(token, 'zephyr')

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_deals": len(data),
                            "source": "Zephyr"}]
                    logging_utils.logging(doc,common_parameters["BQ_DATASET"],parameters["LOGGING_TABLE"],storage)

        except Exception as e:
            print(e)
            if token:
                soap.close_connection(token, 'zephyr')
            continue

        if token:
            soap.close_connection(token, 'zephyr')
示例#18
0
def get_news_analytics_topic_articles(from_date, to_date):
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table,
                                parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    all_news_landing = db["all_news_landing"]
    news_analytics_topic_sentiment = db["news_analytics_topic_sentiment"]
    news_analytics_topic_articles = db["news_analytics_topic_articles"]

    for constituent_id, constituent_name in all_constituents:
        print(constituent_name)
        # get the topics for that constituent from news_analytics_topic_sentiment
        topics = list(
            news_analytics_topic_sentiment.find(
                {"constituent_id": constituent_id}, {
                    "categorised_tag": 1,
                    "_id": 0
                }))
        if len(topics) > 5:
            topics = topics[:5]
        #pprint(topics)

        # Get the latest (5) articles per topic
        for t in topics:
            pipeline = [{
                "$match": {
                    "constituent_id": constituent_id,
                    "NEWS_DATE_NewsDim": {
                        "$gte": from_date,
                        "$lte": to_date
                    },
                    "show": True,
                    "categorised_tag": {
                        "$nin": ["NA"]
                    }
                }
            }, {
                "$unwind": "$categorised_tag"
            }, {
                "$match": {
                    "categorised_tag": t['categorised_tag']
                }
            }, {
                "$project": {
                    "_id": 0,
                    "from_date": from_date,
                    "to_date": to_date,
                    "date": datetime.now(),
                    "sentiment": 1,
                    "score": 1,
                    "NEWS_ARTICLE_TXT_NewsDim": 1,
                    "categorised_tag": 1,
                    "NEWS_DATE_NewsDim": 1,
                    "constituent_name": 1,
                    "NEWS_TITLE_NewsDim": 1,
                    "NEWS_SOURCE_NewsDim": 1,
                    "constituent_id": 1,
                    "constituent": 1
                }
            }, {
                "$sort": SON([("NEWS_DATE_NewsDim", -1)])
            }, {
                "$limit": 5
            }]

            result = list(all_news_landing.aggregate(pipeline))
            to_return = [
                a for a in result if detect(a["NEWS_TITLE_NewsDim"]) == 'en'
                and a["categorised_tag"] and a["categorised_tag"] != 'None'
            ]

            # save result in a table
            if to_return:
                news_analytics_topic_articles.insert_many(to_return)

        time.sleep(3)
示例#19
0
def get_reuters_news(args, driver):
    from utils import logging_utils as logging_utils
    from utils.TaggingUtils import TaggingUtils as TU
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING","DESTINATION_TABLE","LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Reuters'

    parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(sql_connection_string=args.param_connection_string,
                                                   sql_table_name="PARAM_NEWS_REUTERS_KEYS",
                                                   sql_column_list=["CONSTITUENT_ID",
                                                                    "CONSTITUENT_NAME",
                                                                    "URL_KEY"])

    for constituent_id, constituent_name, url_key in all_constituents:
        to_insert = []
        # Get date of latest news article for this constituent for Reuters
        query = """
                SELECT max(news_date) as last_date FROM `{}.{}`
                WHERE constituent_id = '{}' AND news_origin = 'Reuters'
                """.format(common_parameters["BQ_DATASET"],
        parameters["DESTINATION_TABLE"],constituent_id)
        try:
            try:
                result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
                last_date_bq = result[0]["last_date"]
            except Exception as e:
                last_date_bq = None

            company_name = url_key
            print('Scraping articles for {}'.format(company_name))
            query_url = 'https://reuters.com/search/news?blob={}&sortBy=date&dateRange=all'.format(company_name)
            # Find last article with date specified
            print(query_url)
            driver.get(query_url)
            sleep(2)

            skip = False
            if last_date_bq:
                try:
                    last_date = \
                    driver.find_elements_by_xpath('//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[3]/div/div/h5')[
                        0].get_attribute('innerHTML')
                except:
                    last_date = BeautifulSoup(driver.page_source).findAll('h5', {'class': 'search-result-timestamp'})[
                        0].text.strip()

                last_date = ' '.join(last_date.split(' ')[:-1])
                month = [last_date.split(' ')[0][:3]]
                last_date = ' '.join(month + last_date.split(' ')[1:]).strip().replace(',', '')
                last_date = datetime.strptime(last_date, '%b %d %Y %I:%M%p')

                if last_date <= datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                    print("No newer articles")
                    skip = True

            if skip:
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            search = soup.find('div', {'class': 'search-result-list news-search'}).findAll('div',
                                                                                           {'class': 'search-result-indiv'})
            print("Getting articles")
            for article in search:
                title_elem = article.find('h3', {'class': 'search-result-title'})
                title = title_elem.text.strip()
                url = "https://reuters.com" + title_elem.find('a')['href']

                date_published = article.find('h5', {'class': 'search-result-timestamp'}).text
                date_published = ' '.join(date_published.split(' ')[:-1])
                month = [date_published.split(' ')[0][:3]]
                date_published = ' '.join(month + date_published.split(' ')[1:]).strip().replace(',', '')
                date_published_dtype = datetime.strptime(date_published, '%b %d %Y %I:%M%p')
                date_published = date_published_dtype.strftime("%Y-%m-%d %H:%M:%S")

                if last_date_bq:
                    if date_published_dtype < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                        print("Skipping article")
                        continue

                try:
                    soup = BeautifulSoup(
                        requests.get(url, headers={'User-Agent': random.choice(AGENTS)}).content,
                        'html.parser')
                    article_body = soup.find('div', {'class': 'ArticleBody_body_2ECha'}).text.strip()
                    source = soup.find('p', {'class': 'Attribution_content_27_rw'}).text.strip()
                except AttributeError:
                    source = None
                    continue

                d = {'news_title': title, 'news_date': date_published, 'news_source': source, 'news_origin':"Reuters", "news_article_txt": article_body}
                # set extra fields:
                # score
                if d["news_article_txt"]:
                    d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))

                # sentiment
                d['sentiment'] = tah.get_sentiment_word(d["score"])

                # constituent fields
                d["constituent_id"] = constituent_id
                d["constituent_name"] = constituent_name
                d["constituent"] = tah.get_old_constituent_name(constituent_id)

                # url
                d["url"] = url

                # show
                d["show"] = True

                # entity_tags
                d["entity_tags"] = tah.get_spacey_tags(tagger.get_spacy_entities(str(d["news_title"])))

                to_insert.append(d)
                print('Article scraped...')
                #break
                # Save data to json

            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                        parameters["DESTINATION_TABLE"], to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_news": len(to_insert),
                            "source": "Reuters"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage_client)

        except Exception as e:
            print(e)
            continue