Example #1
def run_news_scraper():
    try:

        with multiprocessing.Manager() as manager:

            # creating new processes
            p1 = multiprocessing.Process(target=mint.start_process)
            p2 = multiprocessing.Process(
                target=business_standard.start_process)
            p3 = multiprocessing.Process(target=money_control.start_process)
            p4 = multiprocessing.Process(target=economic_times.start_process)
            p5 = multiprocessing.Process(target=reuters.start_process)
            p6 = multiprocessing.Process(target=infoline.start_process)
            p7 = multiprocessing.Process(
                target=financial_express.start_process)
            p8 = multiprocessing.Process(target=bloombergquint.start_process)
            p9 = multiprocessing.Process(target=businesstoday.start_process)

            p1.start()
            p2.start()
            p3.start()
            p4.start()
            p5.start()
            p6.start()
            p7.start()
            p8.start()
            p9.start()

    except Exception as error:
        database_log.error_log("read_website_configuration", error)
Example #2
def read_news_data():

    try:

        query = """ CREATE TEMPORARY TABLE temp_news_headlines
                    (
                          id INT,
                          header TEXT NULL,
                          sub_header TEXT NULL
                    );

                    INSERT INTO temp_news_headlines(id,header,sub_header)
                    SELECT ndump.id, ndump.header, ndump.sub_header FROM news_feeds_dump ndump
                    WHERE ndump.id NOT IN (SELECT DISTINCT news_id FROM news_feeds_sentiment)
                    AND (ndump.sub_header IS NOT NULL AND ndump.sub_header !='')
                    ORDER BY id;

                    DELETE FROM temp_news_headlines temp1 USING temp_news_headlines temp2
                    WHERE temp1.id < temp2.id AND temp1.sub_header = temp2.sub_header;

                    DELETE FROM temp_news_headlines temp1 USING temp_news_headlines temp2
                    WHERE temp1.id < temp2.id AND temp1.header = temp2.header;

                    SELECT ndump.id, ndump.header, ndump.sub_header FROM temp_news_headlines ndump
                    WHERE ndump.header NOT IN (SELECT DISTINCT header FROM news_feeds_dump nfdump join news_feeds_sentiment nsentiment on nfdump.id=nsentiment.news_id)
                    ORDER BY id; """

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute(query)

            news_data = cursor.fetchall()
            return news_data

    except Exception as error:
        database_log.error_log("read_news_data", error)
Example #3
def load_data_v4():

    try:

        train_dataset_path = '../TrainingData/TwitterData/tweets.csv'
        test_dataset_path = '../TrainingData/TwitterData/tweetstest.csv'

        tweets_train = pd.read_csv(train_dataset_path, encoding='latin-1')
        tweets_test = pd.read_csv(test_dataset_path, encoding='latin-1')

        tweets_train = clean_dataset_v3(tweets_train)
        tweets_test = clean_dataset_v3(tweets_test)

        # balance the training set: take 30,000 positive and 30,000 negative tweets (60,000 total)
        df_tweets_train_pos = tweets_train.loc[tweets_train['label'] == 1].head(30000)
        df_tweets_train_neg = tweets_train.loc[tweets_train['label'] == 0].head(30000)

        # print(" train -{} ; {}".format(df_tweets_train_pos.shape,df_tweets_train_neg.shape))

        tweets_train = pd.concat([df_tweets_train_pos, df_tweets_train_neg])

        return tweets_train, tweets_test

    except Exception as error:
        database_log.error_log("data_load : load_data_v3", error)
Example #4
def load_data_v6():

    try:

        tweets_train, tweets_test = load_data_v4()

        return tweets_train

    except Exception as error:
        database_log.error_log("data_load : load_data_v6", error)
Example #5
def read_source_link(lookup_value):

    try:

        web_config_list = read_website_configuration(lookup_value)
        df_web_config = pd.DataFrame(web_config_list, columns=['website', 'website_category', 'website_link'])
        return df_web_config

    except Exception as error:
        database_log.error_log("read_source_link", error)
Example #6
def load_data_v1():

    try:

        train_dataset_path = '../TrainingData/TwitterData/Sentiment_Analysis_Dataset.csv'
        tweets_train = pd.read_csv(train_dataset_path)

        return tweets_train

    except Exception as error:
        database_log.error_log("data_load : load_data_v1", error)
Example #7
def read_website_configuration(lookup_value):

    try:

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute("SELECT website,website_category,website_link FROM website_configuration "
                           "WHERE is_active=true and website = %s", (lookup_value,))
            web_config_list = cursor.fetchall()
            return web_config_list

    except Exception as error:
        database_log.error_log("read_website_configuration", error)
Example #8
def bulk_insert_indices_data(records):

    try:

        sql_insert_query = """ INSERT INTO indices_data (index_id,high,low,open,close,adj_close,entry_date)
                                                VALUES (%s,%s,%s,%s,%s,%s,%s) """

        with CursorFromConnectionFromPool() as cursor:
            cursor.executemany(sql_insert_query, records)

    except Exception as error:
        database_log.error_log("bulk_insert_twitter_feeds", error)
Example #9
def load_data():

    try:

        train_dataset_path = '../TrainingData/TwitterData/tweets_train.csv'
        test_dataset_path = '../TrainingData/TwitterData/tweets_test.csv'

        tweets_train = pd.read_csv(train_dataset_path)
        tweets_test = pd.read_csv(test_dataset_path)

        return tweets_train, tweets_test

    except Exception as error:
        database_log.error_log("data_load : load_data", error)
Example #10
def load_data_v5():

    try:

        twitter_training_data_path = '../TrainingData/TwitterData/twitter_training_data.csv'
        twitter_training_df = pd.read_csv(twitter_training_data_path)

        tweets_train, tweets_test = load_data_v4()
        train_data_v1 = pd.concat([twitter_training_df, tweets_train])
        # return twitter_training_df

        return train_data_v1

    except Exception as error:
        database_log.error_log("data_load : load_data_v5", error)
Example #11
def read_indices():

    try:

        query = """ SELECT id, index_symbol, yahoo_symbol, start_date FROM indices WHERE is_active = true
                    ORDER BY id """

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute(query)

            indices_list = cursor.fetchall()
            return indices_list

    except Exception as error:
        database_log.error_log("read_indices", error)
Example #12
def read_twitter_data():

    try:

        query = """ SELECT tdump.id, tdump.tweet_id, tdump.screen_id, tdump.tweet_message, tdump.tweet_date FROM twitter_data_dump tdump
                    WHERE tdump.id NOT IN (SELECT DISTINCT tweet_id FROM twitter_sentiment)
                    AND tweet_message != 'NaN' AND tweet_message != '' """

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute(query)

            twitter_data = cursor.fetchall()
            return twitter_data

    except Exception as error:
        database_log.error_log("read_twitter_data", error)
Example #13
def read_twitter_account():

    try:

        query = """ SELECT ta.id, ta.screen_id, CASE WHEN max(tweet_id) IS NULL THEN '1'
                    ELSE max(tweet_id) END tweet_id FROM twitter_account ta LEFT JOIN
                    twitter_data_dump td ON ta.screen_id = td.screen_id WHERE ta.is_active = true
                    GROUP BY ta.id ORDER BY ta.id """

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute(query)
            twitter_account_list = cursor.fetchall()
            return twitter_account_list

    except Exception as error:
        database_log.error_log("read_twitter_account", error)
Example #14
def load_data_v3():

    try:

        train_dataset_path = '../TrainingData/TwitterData/tweets.csv'
        test_dataset_path = '../TrainingData/TwitterData/tweetstest.csv'

        tweets_train = pd.read_csv(train_dataset_path, encoding='latin-1')
        tweets_test = pd.read_csv(test_dataset_path, encoding='latin-1')

        tweets_train = clean_dataset_v3(tweets_train)
        tweets_test = clean_dataset_v3(tweets_test)

        return tweets_train, tweets_test

    except Exception as error:
        database_log.error_log("data_load : load_data_v3", error)
Example #15
def read_daily_indices():

    try:

        query = """ SELECT indices.id, indices.yahoo_symbol,indices.time_zone,CASE WHEN MAX(indices_data.entry_date) is null
                    THEN indices.start_date ELSE MAX(indices_data.entry_date) + INTERVAL '1 day' end AS last_date
                    FROM indices LEFT JOIN indices_data ON indices.id = indices_data.index_id
                    WHERE indices.is_active = true GROUP BY indices.id ORDER BY id """

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute(query)

            daily_indices_list = cursor.fetchall()
            return daily_indices_list

    except Exception as error:
        database_log.error_log("read_daily_indices", error)
Example #16
def start_process():

    try:
        print("***********Initiate Process - mint**************")
        database_log.process_log("India - mint : start_process", "Initiate Process")

        while True:

            scheduled_sleeping_seconds = app_config.india_market_scheduled_task_sleeping
            loader.start_load_process(lookup_value)  # lookup_value: module-level website key, presumably "mint"
            database_log.process_log("India - mint : start_process", "Re-Run Process")

            print("Last run was successful for India - mint, next run in {} seconds.".format(scheduled_sleeping_seconds))
            time.sleep(scheduled_sleeping_seconds)

    except Exception as error:
        database_log.error_log("India - mint : start_process", error)
Example #17
def init():

    # Create API object
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    try:

        api.verify_credentials()
        print("Authentication OK")

    except Exception as error:
        database_log.error_log("twitter_scheduler - init - Error creating API",
                               error)
        # auth_status = False

    return api
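
The auth handler passed to tweepy.API above is created outside this snippet. A minimal sketch for the tweepy 3.x API implied by wait_on_rate_limit_notify (the credential strings are placeholders):

import tweepy

# placeholder credentials; the real project presumably loads these from configuration
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")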
Example #18
def clean_dataset_v3(dataset):

    try:

        dataset.columns = ["label", "ItemID", "Date", "Blank", "SentimentSource", "tweet"]
        dataset.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
        dataset = dataset[dataset.label.notnull()]
        dataset['label'] = dataset['label'].map({4: 1, 0: 0})  # convert label 4 to 1
        dataset = dataset[dataset['tweet'].notnull()]
        dataset = dataset[dataset['label'].notnull()]  # remove NaN rows
        dataset.reset_index(drop=True, inplace=True)
        # print ('dataset loaded with shape', dataset.shape  )

        return dataset

    except Exception as error:
        database_log.error_log("data_load : clean_dataset_v3", error)
Example #19
def read_configuration(lookup_value):

    try:

        with CursorFromConnectionFromPool() as cursor:
            cursor.execute("SELECT website,scheduled_task_sleeping,market_time_zone,market_start_time,market_end_time,"
                           "market_hours_delay,market_off_hours_delay,market_weekend_hours_delay,market_location "
                           "FROM configuration WHERE is_active=true and website = %s", (lookup_value,))
            config_list = cursor.fetchall()

            df_web_config = pd.DataFrame(config_list,
                                         columns=['website', 'scheduled_task_sleeping', 'market_time_zone',
                                                  'market_start_time', 'market_end_time', 'market_hours_delay',
                                                  'market_off_hours_delay', 'market_weekend_hours_delay',
                                                  'market_location'])
            return df_web_config

    except Exception as error:
        database_log.error_log("read_source_link", error)
Example #20
def start_process():

    try:

        print("***********Initiate Process - business_today**************")
        database_log.process_log("India - business_today : start_process",
                                 "Initiate Process")

        while True:
            # Checks whether a scheduled task - is pending to run or not
            scheduled_sleeping_seconds = app_config.india_market_scheduled_task_sleeping
            loader.start_load_process(lookup_value)  # lookup_value: module-level website key, presumably "business_today"
            database_log.process_log("India - business_today : start_process",
                                     "Re-Run Process")

            print(
                "Last run was successful for India - business_today, next run in {} seconds."
                .format(scheduled_sleeping_seconds))
            time.sleep(scheduled_sleeping_seconds)

    except Exception as error:
        database_log.error_log("India - business_today : start_process", error)
Example #21
def read_information(df_web_config):

    try:

        # Creating an empty data frame with column names only
        df_news_data = pd.DataFrame(columns=['website', 'website_category', 'website_link', 'news_link',
                                             'header', 'sub_header', 'timestamp'])

        for row in df_web_config.itertuples(index=False):
            url_link = row.website_link

            # open with GET method
            resp = requests.get(url_link, headers={'User-Agent': 'Mozilla/5.0'})

            # http_response 200 means OK status
            if resp.status_code == 200:
                # parser
                soup = BeautifulSoup(resp.text, 'html.parser')

                website = 'india_reuters'
                categories = row.website_category
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''

                for level_1 in soup.findAll("div", {"class": "news-headline-list"}):
                    for level_2 in level_1.findAll("div", {"class": "story-content"}):
                        for level_3 in level_2.findAll("a"):
                            news_link = level_3['href']
                            header = level_3.text.strip()

                        for level_3 in level_2.findAll("p"):
                            sub_header = level_3.text

                        for level_3 in level_2.findAll("time", {"class": "article-time"}):
                            timestamp = level_3.text.strip()

                        if len(header) > 0:
                            text_lang = detect(header)

                            if text_lang == "en":
                                df_news_data = df_news_data.append({'website': website, 'website_link': url_link,
                                                 'website_category': categories,'news_link': news_link, 'header': header,
                                                 'sub_header': sub_header, 'timestamp': timestamp},ignore_index=True)

            else:
                database_log.error_log("India - Loader - india_reuters : read_information", resp.status_code)

        if not df_news_data.empty:
            df_news_feed_list = [tuple(r) for r in df_news_data[['website', 'website_link', 'website_category',
                                                                 'news_link', 'header', 'sub_header',
                                                                 'timestamp']].values.tolist()]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log("India - Loader - india_reuters : read_information", "no record found")

    except Exception as error:
        database_log.error_log("India - Loader - india_reuters : read_information", error)
Example #22
def bulk_insert_twitter_sentiment(twitter_data_sentiment_df):

    try:

        # convert data frame into a list
        twitter_data_list = [tuple(r) for r in twitter_data_sentiment_df[['id', 'tweet_id', 'screen_id',
                                                                'tweet_message', 'tweet_date']].values.tolist()]

        twitter_data_sentiment_list = [tuple(r) for r in twitter_data_sentiment_df[['id','nltk_classify','nltk_confidence',
                            'count_vectorizer_classify','count_vectorizer_confidence',
                            'tfidf_vectorizer_classify','tfidf_vectorizer_confidence','word2vec_classify']].values.tolist()]

        # sql_twitter_data_query = """ INSERT INTO twitter_data (id,tweet_id,screen_id,tweet_message,tweet_date)
        #                                     VALUES (%s,%s,%s,%s,%s) """

        sql_twitter_sentiment_query = """ INSERT INTO twitter_sentiment (tweet_id,nltk_classify,nltk_confidence,
                                          spacy_count_vectorizer_classify,spacy_count_vectorizer_confidence,
                                          spacy_tfidf_vectorizer_classify,spacy_tfidf_vectorizer_confidence,word2vec_classify)
                                          VALUES (%s,%s,%s,%s,%s,%s,%s,%s) """


        # with CursorFromConnectionFromPool() as cursor:
        #    cursor.executemany(sql_twitter_data_query, twitter_data_list)

        with CursorFromConnectionFromPool() as cursor:
            cursor.executemany(sql_twitter_sentiment_query, twitter_data_sentiment_list)

        # delete processed records
        # max_tweet_id = twitter_data_sentiment_df['id'].max()
        # max_tweet_id = int(max_tweet_id)
        # sql_delete_query = "DELETE FROM twitter_preprocessing_data WHERE id <= %s"
        #
        # with CursorFromConnectionFromPool() as cursor:
        #     cursor.execute(sql_delete_query, (max_tweet_id,))

    except Exception as error:
        database_log.error_log("bulk_insert_twitter_sentiment", error)
Example #23
def read_information(df_web_config):

    try:

        # Creating an empty data frame with column names only
        df_news_data = pd.DataFrame(columns=[
            'website', 'website_category', 'website_link', 'news_link',
            'header', 'sub_header', 'timestamp'
        ])

        for index, row in df_web_config.iterrows():
            url_link = row["website_link"]

            # open with GET method
            resp = requests.get(url_link,
                                headers={'User-Agent': 'Mozilla/5.0'})

            # http_response 200 means OK status
            if resp.status_code == 200:
                # parser
                soup = BeautifulSoup(resp.text, 'html.parser')

                website = 'mint'
                categories = row["website_category"]
                news_link = ''
                header = ''
                sub_header = ''
                timestamp = ''

                for level_1 in soup.findAll("div", {"class": "headlineSec"}):
                    for level_2 in level_1.find_all('h2',
                                                    {"class": "headline"}):
                        for level_3 in level_2.find_all('a', href=True):
                            news_link = level_3['href']
                            header = level_3.text

                    for level_2 in level_1.find_all('span',
                                                    {"class": "fl date"}):
                        for level_3 in level_2.find_all('span'):
                            timestamp = level_3.text

                    df_news_data = df_news_data.append(
                        {
                            'website': website,
                            'website_link': url_link,
                            'website_category': categories,
                            'news_link': news_link,
                            'header': header,
                            'sub_header': sub_header,
                            'timestamp': timestamp
                        },
                        ignore_index=True)
            else:
                database_log.error_log(
                    "India - Loader - mint : read_information",
                    resp.status_code)

        if not df_news_data.empty:
            df_news_feed_list = [
                tuple(r) for r in df_news_data[[
                    'website', 'website_link', 'website_category', 'news_link',
                    'header', 'sub_header', 'timestamp'
                ]].values.tolist()
            ]
            sql_execute.bulk_insert_news_feeds(df_news_feed_list)
        else:
            database_log.error_log("India - Loader - mint : read_information",
                                   "no record found")

    except Exception as error:
        database_log.error_log("India - Loader - mint : read_information",
                               error)
Example #24
def read_user_timeline(api):

    try:

        # get twitter account
        df_twitter_account = read_twitter_account()

        for row in df_twitter_account.itertuples(index=False):
            # print(row["id"], row["screen_id"], row['tweet_id'])

            tweets = api.user_timeline(screen_name=row.screen_id,
                                       include_rts=False,
                                       since_id=int(row.tweet_id))

            tweet_list = []
            for tweet in tweets:
                tweet_id = tweet.id
                tweet_message = tweet.text
                tweet_source = tweet.source
                retweet_count = tweet.retweet_count
                likes_count = tweet.favorite_count
                tweet_date = tweet.created_at

                try:
                    text_lang = detect(tweet_message)

                    # check if text is in english
                    if text_lang == "en":
                        tweet_list.append({
                            'tweet_id': tweet_id,
                            'screen_id': row.screen_id,
                            'tweet_message': tweet_message,
                            'tweet_source': tweet_source,
                            'retweet_count': retweet_count,
                            'likes_count': likes_count,
                            'tweet_date': tweet_date
                        })

                except Exception as error:
                    database_log.error_log(
                        "twitter_scheduler - read_user_timeline - language error",
                        error)

                    tweet_list.append({
                        'tweet_id': tweet_id,
                        'screen_id': row.screen_id,
                        'tweet_message': tweet_message,
                        'tweet_source': tweet_source,
                        'retweet_count': retweet_count,
                        'likes_count': likes_count,
                        'tweet_date': tweet_date
                    })

            if len(tweet_list) > 0:
                tweet_data_frame = pd.DataFrame(tweet_list,
                                                columns=[
                                                    'tweet_id', 'screen_id',
                                                    'tweet_message',
                                                    'tweet_source',
                                                    'retweet_count',
                                                    'likes_count', 'tweet_date'
                                                ])

                clean_twitter_data_frame = pre_processing.clean_twitter_data(
                    tweet_data_frame)

            if len(tweet_list) > 0 and not clean_twitter_data_frame.empty:
                tweet_data_list = [
                    tuple(r) for r in clean_twitter_data_frame[[
                        'tweet_id', 'screen_id', 'tweet_message',
                        'tweet_source', 'retweet_count', 'likes_count',
                        'tweet_date'
                    ]].values.tolist()
                ]
                sql_execute.bulk_insert_twitter_feeds(tweet_data_list)
            else:
                database_log.error_log(
                    "twitter_scheduler - read_user_timeline",
                    "no record found")
                print("No Record Found.")

    except Exception as error:
        database_log.error_log("twitter_scheduler - read_user_timeline", error)
        print("twitter_scheduler - read_user_timeline - {}".format(error))
Example #25


if __name__ == '__main__':

    try:

        lookup_value = "twitter"  # search website name
        df_config = sql_database_execute.read_configuration(
            lookup_value)  # read config file setting

        row_id = df_config.index[0]
        scheduled_task_sleeping = df_config["scheduled_task_sleeping"][row_id]

        # initialize twitter api
        api = init()

        while True:
            scheduled_task_sleeping = app_config.twitter_data_scheduled_task_sleeping
            read_user_timeline(api)
            print(
                "Last run was successful for Twitter API, next run in {} seconds."
                .format(scheduled_task_sleeping))
            time.sleep(scheduled_task_sleeping)

    except Exception as error:
        database_log.error_log("twitter_scheduler - main", error)
Example #26
def predict_models_sentiment():

    # read tweet from database
    twitter_data_list = sql_execute.read_twitter_data()
    twitter_df = pd.DataFrame(
        twitter_data_list,
        columns=['id', 'tweet_id', 'screen_id', 'tweet_message', 'tweet_date'])

    # ---------twitter nltk load-------------
    word_features = twitter_nltk_classifier.load_save_dataset(
        'word_features.pickle')
    ensemble_nltk_clf = twitter_nltk_classifier.get_ensemble_models()

    # --------twitter word2vec load-------------
    model, w2vmodel, tfidf = twitter_word2vec_classifier_model.load_prediction_model_parameters()

    sentiment_data_list = []

    for row in twitter_df.itertuples(index=False):

        tweet = row.tweet_message

        try:
            # ---------nltk-------------
            classify, nltk_confidence = twitter_nltk_classifier.sentiment_analyzer(
                tweet, ensemble_nltk_clf)

            nltk_classify = 2
            if classify == "Positive":
                nltk_classify = 1
            elif classify == "Negative":
                nltk_classify = 0
            else:
                nltk_classify = 2

            # --------twitter word2vec-------------
            word2vec_classify = twitter_word2vec_classifier_model.predict(
                model, w2vmodel, tfidf, tweet)
            # word2vec_classify = 0

            # predict using CountVectorizer
            classify_cVector, confidence_cVector = twitter_spacy_countVectorizer_model.sentiment_analyzer(
                tweet)
            # classify_cVector = 2
            # confidence_cVector =1

            # predict using TfidfVectorizer
            classify_tfidf, confidence_tfidf = twitter_spacy_tfidfVectorizer_model.sentiment_analyzer(
                tweet)
            # classify_tfidf =2
            # confidence_tfidf=1

            sentiment_data_list.append({
                'id': row.id,
                'tweet_id': row.tweet_id,
                'screen_id': row.screen_id,
                'tweet_message': row.tweet_message,
                'tweet_date': row.tweet_date,
                'nltk_classify': nltk_classify,
                'nltk_confidence': nltk_confidence,
                'word2vec_classify': word2vec_classify,
                'count_vectorizer_classify': int(classify_cVector),
                'count_vectorizer_confidence': confidence_cVector,
                'tfidf_vectorizer_classify': int(classify_tfidf),
                'tfidf_vectorizer_confidence': confidence_tfidf
            })

        except Exception as error:
            database_log.error_log(
                "run_twitter_sentiment_analyzer - predict_models_sentiment",
                error)
            print(
                "run_twitter_sentiment_analyzer - predict_models_sentiment - {}"
                .format(error))

    tweet_sentiment_data = pd.DataFrame(columns=[
        'id', 'tweet_id', 'screen_id', 'tweet_message', 'tweet_date',
        'nltk_classify', 'nltk_confidence', 'word2vec_classify',
        'count_vectorizer_classify', 'count_vectorizer_confidence',
        'tfidf_vectorizer_classify', 'tfidf_vectorizer_confidence'
    ])
    if len(sentiment_data_list) > 0:
        tweet_sentiment_data = tweet_sentiment_data.append(sentiment_data_list)

    return tweet_sentiment_data
Example #27
            p4 = multiprocessing.Process(target=economic_times.start_process)
            p5 = multiprocessing.Process(target=reuters.start_process)
            p6 = multiprocessing.Process(target=infoline.start_process)
            p7 = multiprocessing.Process(
                target=financial_express.start_process)
            p8 = multiprocessing.Process(target=bloombergquint.start_process)
            p9 = multiprocessing.Process(target=businesstoday.start_process)

            p1.start()
            p2.start()
            p3.start()
            p4.start()
            p5.start()
            p6.start()
            p7.start()
            p8.start()
            p9.start()

    except Exception as error:
        database_log.error_log("read_website_configuration", error)


if __name__ == '__main__':

    try:

        run_news_scraper()

    except Exception as error:
        database_log.error_log("read_website_configuration", error)
Example #28
def predict_models_header_sentiment(text_data_list):

    text_data_df = pd.DataFrame(text_data_list,
                                columns=['id', 'header', 'sub_header'])

    # delete record with empty or "NaN" value
    text_data_df.drop(text_data_df[text_data_df['header'] == "NaN"].index,
                      inplace=True)
    text_data_df.drop(text_data_df[text_data_df['header'] == ""].index,
                      inplace=True)

    # added sentiment analyzer columns to store sentiment value
    text_data_df["sentiment_for"] = ""
    text_data_df["nltk_classify"] = ""
    text_data_df["nltk_confidence"] = ""
    # text_data_df["word2vec_classify"] = -1
    text_data_df["count_vectorizer_classify"] = ""
    text_data_df["count_vectorizer_confidence"] = ""
    text_data_df["tfidf_vectorizer_classify"] = ""
    text_data_df["tfidf_vectorizer_confidence"] = ""

    # ---------news nltk load-------------
    word_features = nltk_classifier.load_save_dataset('word_features.pickle')
    ensemble_clf = nltk_classifier.get_ensemble_models()

    # --------news word2vec load-------------
    # model, w2vmodel, tfidf = word2vec_classifier_model.load_prediction_model_parameters()

    sentiment_data_list = []

    for row in text_data_df.itertuples(index=False):
        text = row.header

        if len(text) > 0 and text != "NaN":

            try:
                # ---------nltk-------------
                classify, nltk_confidence = nltk_classifier.sentiment_analyzer(
                    text, ensemble_clf, word_features)

                nltk_classify = 2
                if classify == "pos":
                    nltk_classify = 1
                elif classify == "neg":
                    nltk_classify = 0
                else:
                    nltk_classify = 2

                # --------news word2vec-------------
                # word2vec_classify = word2vec_classifier_model.predict(model, w2vmodel, tfidf, text)
                #
                # text_data_df['word2vec_classify'] = word2vec_classify

                # --------spacy------------------------
                classify_cVector, confidence_cVector = news_spacy_countVectorizer_model.sentiment_analyzer(
                    text)

                # predict using TfidfVectorizer
                classify_tfidf, confidence_tfidf = news_spacy_tfidfVectorizer_model.sentiment_analyzer(
                    text)

                sentiment_data_list.append({
                    'id': row.id,
                    'header': row.header,
                    'sub_header': row.sub_header,
                    'sentiment_for': "header",
                    'nltk_classify': nltk_classify,
                    'nltk_confidence': nltk_confidence,
                    'count_vectorizer_classify': int(classify_cVector),
                    'count_vectorizer_confidence': confidence_cVector,
                    'tfidf_vectorizer_classify': int(classify_tfidf),
                    'tfidf_vectorizer_confidence': confidence_tfidf
                })

            except Exception as error:
                print(error)
                database_log.error_log(
                    "run_news_sentiment_analyzer - predict_models_header_sentiment",
                    error)

    news_sentiment_data = pd.DataFrame(columns=[
        'id', 'header', 'sub_header', 'sentiment_for', 'nltk_classify',
        'nltk_confidence', 'count_vectorizer_classify',
        'count_vectorizer_confidence', 'tfidf_vectorizer_classify',
        'tfidf_vectorizer_confidence'
    ])

    if len(sentiment_data_list) > 0:
        news_sentiment_data = news_sentiment_data.append(sentiment_data_list)

    return news_sentiment_data