def get_max_db_date():
    """
    Retrieves the maximum date present in the raw data table.

    Runs the 'get_max_db_date' query from the sql dictionary. If the
    table is empty (query returns NULL), falls back to January 1st of
    the current year.

    :return: date object
    """
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_max_db_date']
    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        cursor = conn.cursor()
        logger.debug(' Read query = " %s "', sql)
        cursor.execute(sql)
        date = cursor.fetchone()
        # .date() keeps the fallback consistent with the sibling helpers
        # (get_last_updated_date / get_min_data_date) and with the value
        # the query itself returns; the original returned a datetime here.
        return date[0] if date[0] is not None else datetime(
            datetime.now().year, 1, 1).date()
    except Exception:
        # Log message fixed: it previously said 'get_last_date'.
        logger.error("Exception in get_max_db_date : %s ",
                     traceback.format_exc())
        sys.exit()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def get_new_data_date_range(date1, date2):
    """
    Fetches new data from database for given date range.

    :param date1: range start — date/datetime or 'YYYY-MM-DD' string
    :param date2: range end — date/datetime or 'YYYY-MM-DD' string
    :return: list of tuples (text, level_1, level_3, attuid, date, sid)
             for non-empty texts detected as English or of unknown language
    """
    logger.info(' Retrieving new data from db ')
    conn = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_new_data_date_range']
    # last_date = get_last_updated_date()
    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        # Accept both date objects and pre-formatted strings.
        date1 = date1.strftime('%Y-%m-%d') if not isinstance(date1,
                                                             str) else date1
        date2 = date2.strftime('%Y-%m-%d') if not isinstance(date2,
                                                             str) else date2
        # NOTE(review): dates are spliced into the query via str.format —
        # fine for internally generated dates, but not injection-safe if
        # ever fed user input.
        sql = sql.format(date1, date2)
        logger.debug(' Read query = " %s "', sql)
        data_frame = pd.read_sql(sql, conn)
        if len(data_frame.index) == 0:
            logger.debug('\n Data frame empty for %s to %s' % (date1, date2))
    except Exception:
        # pg.Error is a subclass of Exception, so the original
        # (pg.Error, Exception) tuple was redundant.
        logger.error('Exception in get_new_data_date_range in Dao.py : %s',
                     traceback.format_exc())
        sys.exit()
    else:
        data = []
        for row in data_frame.itertuples():
            text = row.text.strip()
            # Keep only non-empty texts whose language is English or
            # cannot be determined.
            if len(text) > 0 and guessLanguage(text) in ['en', 'UNKNOWN']:
                level_1 = "null" if row.level_1 is None else row.level_1
                level_3 = "null" if row.level_3 is None else row.level_3
                date = row.date.strftime("%Y-%m-%d")
                data.append(
                    (text, level_1, level_3, row.attuid, date, row.sid))
        logger.info('Success in get_new_data_date_range()')
        return data
    finally:
        if conn:
            conn.close()
def insert_train_data_into_db_from_file():
    """
    Checks if training data table is empty. If yes then reads labelled
    texts from files and inserts into the database with corresponding
    label. Otherwise does nothing.

    Uses pandas to read the csv, psycopg2 to connect to db. The query
    used is specified in config.properties file and is named
    'insert_training_data'.
    """
    if os.path.isfile('train_data/texts.csv'):
        print(' Reading train set from texts.csv file in train_data dir ')
        df = pd.read_csv('train_data/texts.csv', header=None)
        # BUG FIX: the original zipped two DataFrames (df.iloc[:, 0:1]),
        # which iterates over COLUMN LABELS, producing a single bogus
        # tuple instead of (text, label) rows. Selecting single columns
        # (Series) makes zip iterate over the actual values.
        texts = df.iloc[:, 0]
        labels = df.iloc[:, 1]
        data = list(zip(texts, labels))
        conn = None
        cursor = None
        db = MyUtils.get_db_details()
        insert_sql = MyUtils.get_sql_dict()['insert_training_data']
        try:
            conn = pg.connect(database=db['name'],
                              user=db['user'],
                              password=db['password'],
                              host=db['host'],
                              port=db['port'])
            cursor = conn.cursor()
            # One '%s' placeholder group per record for a multi-row insert.
            record_template = ','.join(['%s'] * len(data))
            insert_query = insert_sql.format(record_template)
            logger.debug(' Insert sql = " %s "', insert_query)
            logger.info(' Executing query for inserting train data into db ')
            cursor.execute(insert_query, data)
            conn.commit()
            print(' commit done ')
        except Exception:
            logger.error(
                ' Exception in insert_train_data_into_db_from_file: %s ',
                traceback.format_exc())
            sys.exit()
        else:
            print('Successfully inserted train data')
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
def insert_predicted_into_db(data, period_type):
    """
    Inserts predicted data into the database with corresponding label
    with list of texts and issue level_1 and level_3 values as well.

    Uses psycopg2 to connect to db. The insert statement is chosen from
    the sql dictionary according to *period_type* (monthly / weekly /
    daily pos-neg tables).

    :param data: list of record tuples to insert
    :param period_type: one of the CONSTANTS period names
    """
    logger.debug(' Inserting predicted data into database ')
    conn = None
    cursor = None
    db = MyUtils.get_db_details()
    # Pick the insert statement for the requested period.
    period = period_type.lower()
    if period == CONSTANTS.MONTHLY:
        insert_sql = MyUtils.get_sql_dict()['insert_monthly_pos_neg']
    elif period == CONSTANTS.WEEKLY:
        insert_sql = MyUtils.get_sql_dict()['insert_weekly_pos_neg']
    else:
        insert_sql = MyUtils.get_sql_dict()['insert_daily_pos_neg']
    try:
        conn = pg.connect(database=db['name'],
                          user=db['user'],
                          password=db['password'],
                          host=db['host'],
                          port=db['port'])
        cursor = conn.cursor()
        # One '%s' placeholder group per record for a multi-row insert.
        placeholders = ','.join(['%s'] * len(data))
        insert_query = insert_sql.format(placeholders)
        logger.debug(' Insert sql = " %s "', insert_query)
        logger.debug(' Executing query for inserting data into db ')
        cursor.execute(insert_query, data)
        conn.commit()
        logger.info(' commit done ')
    except (pg.Error, Exception):
        logger.error(' Exception in insert_predicted_into_db: %s ',
                     traceback.format_exc())
        sys.exit()
    else:
        logger.info('Successfully inserted predicted data')
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def insert_into_monthly_sentiment(data):
    """
    Inserts data into the database: text, label, issue1, issue3,
    agent_id.

    Uses psycopg2 to connect to db. The query used is specified in
    config.properties file and is named 'insert_monthly_sentiment'.

    :param data: list of record tuples to insert
    :raises: re-raises the original exception after logging it
    """
    logger.debug(' Inserting insert_monthly_sentiment data into database ')
    conn = None
    cursor = None
    db = MyUtils.get_db_details()
    insert_sql = MyUtils.get_sql_dict()['insert_monthly_sentiment']
    try:
        conn = pg.connect(database=db['name'],
                          user=db['user'],
                          password=db['password'],
                          host=db['host'],
                          port=db['port'])
        cursor = conn.cursor()
        # One '%s' placeholder group per record for a multi-row insert.
        record_template = ','.join(['%s'] * len(data))
        logger.debug(' Insert sql = " %s "', insert_sql)
        insert_query = insert_sql.format(record_template)
        logger.debug(' Executing query for inserting data into db ')
        cursor.execute(insert_query, data)
        conn.commit()
        logger.info(' commit done ')
    except Exception:
        # BUG FIX: the original call had no '%s' placeholder, so the
        # traceback argument was never rendered into the log message.
        logger.error(' Exception in insert_into_monthly_sentiment : %s ',
                     traceback.format_exc())
        # Bare raise preserves the original exception and traceback;
        # the original 'raise Exception' threw a bare, context-free one.
        raise
    else:
        logger.info('Successfully inserted predicted data')
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def get_new_data(date):
    """
    Fetches new data from database on which prediction needs to be
    performed.

    Exits the process (sys.exit) if no rows exist after *date* or the
    query fails.

    :param date: only rows after this date are fetched
    :return: list of tuples (text, nt_id) fetched from database
    """
    print(' Retrieving new data from db ')
    conn = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_new_data']
    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        sql = sql.format(date.strftime('%Y-%m-%d'))
        logger.debug(' Read query = " %s "', sql)
        data_frame = pd.read_sql(sql, conn)
        if len(data_frame.index) == 0:
            print('Data frame empty. No data after date : ', date)
            # Deliberately funnels the empty case into the handler below,
            # which logs and exits.
            raise Exception
    except Exception:
        # pg.Error is a subclass of Exception, so the original
        # (pg.Error, Exception) tuple was redundant.
        logger.error(
            'Exception in get_new_data in Classification_Helper.py: %s ',
            traceback.format_exc())
        sys.exit()
    else:
        data = []
        for row in data_frame.itertuples():
            data.append((row.text, row.nt_id))
        print('Success in get_new_data()')
        return data
    finally:
        if conn:
            conn.close()
def get_last_updated_date(type):
    """
    Retrieves last entry date for predicted or raw data in database
    depending upon type.

    :param type: one of 'PREVIOUS' or 'NEW' and is used to decide whether
                 to return last(max) predicted data date or max new data
                 date
    :return: date object; falls back to January 1st of the current year
             when the table is empty
    """
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict(
    )['get_last_result_date'] if type == CONSTANTS.PREVIOUS else MyUtils.get_sql_dict(
    )['get_max_new_data_date'] if type == CONSTANTS.NEW else None
    try:
        # Guard added: an unknown 'type' left sql as None, and the
        # original fed None straight into cursor.execute. Raising here
        # keeps the same outcome (log + sys.exit) with a clear message.
        if sql is None:
            raise ValueError(
                "Unknown type %r passed to get_last_updated_date()" % type)
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        cursor = conn.cursor()
        logger.debug(' Read query = " %s "', sql)
        cursor.execute(sql)
        date = cursor.fetchone()
        return date[0] if date[0] is not None else datetime(
            year=datetime.now().year, month=1, day=1).date()
    except Exception:
        logger.error("Exception in get_last_date : %s ",
                     traceback.format_exc())
        sys.exit()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def delete_previous_data(tables, period):
    """
    Deletes previous data from database for table in 'tables' list.

    :param tables: list of table-name constants (monthly sentiment /
                   word cloud / words-verbatims)
    :param period: CONSTANTS.ALL to wipe the whole table, or
                   CONSTANTS.PREVIOUS to delete only last month's rows
    :return: None
    :raises: re-raises any failure after logging it
    """
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    try:
        # LEAK FIX: the original called pg.connect() inside the loop,
        # opening a fresh connection per table and leaking every one
        # except the last (only the final conn was closed in finally).
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        cursor = conn.cursor()
        for table in tables:
            # Resolve the delete statement for this table/period combo.
            sql = None
            if period.lower() == CONSTANTS.ALL:
                if table.lower() == CONSTANTS.MONTHLY_SENTIMENT:
                    sql = MyUtils.get_sql_dict()[
                        'delete_monthly_sentiment_data']
                elif table.lower() == CONSTANTS.MONTHLY_WORD_CLOUD:
                    sql = MyUtils.get_sql_dict()['delete_monthly_word_cloud']
                elif table.lower() == CONSTANTS.MONTHLY_WORDS_VERBATIMS:
                    sql = MyUtils.get_sql_dict()[
                        'delete_monthly_words_verbatims']
            elif period.lower() == CONSTANTS.PREVIOUS:
                if table.lower() == CONSTANTS.MONTHLY_SENTIMENT:
                    sql = MyUtils.get_sql_dict()[
                        'delete_monthly_sentiment_data_last_month']
                elif table.lower() == CONSTANTS.MONTHLY_WORD_CLOUD:
                    sql = MyUtils.get_sql_dict()[
                        'delete_monthly_word_cloud_last_month']
                elif table.lower() == CONSTANTS.MONTHLY_WORDS_VERBATIMS:
                    sql = MyUtils.get_sql_dict()[
                        'delete_monthly_words_verbatims_last_month']
            logger.debug(' Read query = " %s "', sql)
            if sql is None:
                raise Exception(
                    "sql can not be 'None' type."
                    " No or wrong table name(s) passed to delete_previous_data() in MyUtils.py"
                )
            cursor.execute(sql)
            # Commit per table, as the original did.
            conn.commit()
    except Exception:
        logger.error("Exception in delete_previous_data : %s ",
                     traceback.format_exc())
        # Bare raise preserves the original exception and traceback;
        # 'raise Exception' discarded the cause.
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def get_train_data_from_db():
    """
    Retrieves train data from database based on the sql query provided
    in the config file where sql name is 'get_training_data'.

    :return: data(texts) and label(targets)
    """
    print(' Retrieving train data from db ')
    connection = None
    details = MyUtils.get_db_details()
    query = MyUtils.get_sql_dict()['get_training_data']
    try:
        connection = pg.connect(database=details['name'],
                                user=details['user'],
                                password=details['password'],
                                host=details['host'],
                                port=details['port'])
        logger.debug(' Read query = " %s "', query)
        frame = pd.read_sql(query, connection)
        texts, targets = frame.text, frame.label
    except (pg.Error, Exception):
        logger.error(
            'Exception in get_train_data in Classification_Helper.py: %s ',
            traceback.format_exc())
        sys.exit()
    else:
        print('Success in get_train_data()')
        return texts, targets
    finally:
        if connection:
            connection.close()
def get_min_data_date():
    """
    Gives minimum date from raw data.

    Falls back to January 1st of the current year when the table is
    empty (query returns NULL).

    :return: date object
    :raises: re-raises the original exception after logging it
    """
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_min_db_date']
    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])
        cursor = conn.cursor()
        logger.debug(' Read query = " %s "', sql)
        cursor.execute(sql)
        date = cursor.fetchone()
        return date[0] if date[0] is not None else datetime(
            datetime.now().year, 1, 1).date()
    except Exception:
        # BUG FIX: the original call had no '%s' placeholder, so the
        # traceback argument was never rendered into the log message.
        logger.error("Exception in get_min_data_date : %s ",
                     traceback.format_exc())
        # Bare raise preserves the original exception and traceback;
        # 'raise Exception' discarded the cause.
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def classify_new_data(x_new):
    """
    Loads the pickled classifier and tfidf vectorizer then uses those
    for transforming and classifying new data.

    :param x_new: New data that needs to be classified
    :return: prediction on new data
    """
    paths = MyUtils.get_file_path_dict()
    with open(paths[CONSTANTS.TRAINED_MODEL], 'rb') as model_file:
        model = pickle.load(model_file)
    with open(paths[CONSTANTS.TFIDF_VECTORIZER], 'rb') as vect_file:
        vectorizer = pickle.load(vect_file)
    return model.predict(vectorizer.transform(x_new))
def train_model(X_data, y_data):
    """
    Trains the model by using the parameter values. Default classifier
    is MultinomialNB.

    Also stores the trained model and fitted tfidf vectorizer on disk
    under the TRAINED_MODEL and TFIDF_VECTORIZER paths from the
    file-path dictionary.

    :param X_data: training texts
    :param y_data: training targets
    :return: Fitted model and fitted tfidf_vectorizer
    """
    paths = MyUtils.get_file_path_dict()
    vectorizer = get_tfidf_vectorizer()
    model = get_classifier()
    # Fit the vectorizer on the training texts and train on the result.
    model.fit(vectorizer.fit_transform(X_data), y_data)
    # Persist both artifacts for later classification runs.
    with open(paths[CONSTANTS.TRAINED_MODEL], 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(paths[CONSTANTS.TFIDF_VECTORIZER], 'wb') as vect_file:
        pickle.dump(vectorizer, vect_file)
    return model, vectorizer
def text_mining_pipeline(args, log_dir):
    # Entry point of the text-mining pipeline. Dispatches on args[1]:
    # 'train' re-trains the model from file; 'all' rebuilds every
    # monthly table from the earliest raw-data month; 'update' re-runs
    # only the months since the last recorded result.
    #
    # :param args: command line argument list (sys.argv); args[1] is the mode
    # :param log_dir: directory where the rotating log file is written
    LOGGING_LEVEL = logging.DEBUG
    LOG_FILE_NAME = os.path.join(log_dir, 'TEXT_MINING_LOG.out')
    logger = logging.getLogger(__name__)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s ')
    handler = RotatingFileHandler(LOG_FILE_NAME,
                                  maxBytes=10 * 1024,
                                  backupCount=5)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Route the Dao and MyUtils module loggers into the same file.
    dao_logger, utils_logger = Dao.logger, MyUtils.logger
    logger.setLevel(LOGGING_LEVEL)
    dao_logger.setLevel(LOGGING_LEVEL)
    utils_logger.setLevel(LOGGING_LEVEL)
    dao_logger.addHandler(handler)
    utils_logger.addHandler(handler)
    # max_date_month is only referenced by commented-out code below.
    start, max_date_month = None, None
    print('\n \n Check LOG file for status \n \n ')
    if len(args) == 1:
        # No mode argument supplied at all.
        logger.error(
            "\n No command line argument passed ! \n"
            " \n YOU are supposed to pass one script parameter \n"
            " \n Possible values -> \n 1. 'train' 2. 'all' 3. 'update' \n \n "
        )
        sys.exit(1)
    elif args[1].lower() not in [
            CONSTANTS.TRAIN, CONSTANTS.ALL, CONSTANTS.UPDATE
    ]:
        logger.error(
            "Wrong parameter passed \n \n Possible values -> \n 1. 'train' 2. 'all' 3. 'update' \n \n "
        )
        sys.exit(1)
    elif args[1].lower() == CONSTANTS.TRAIN:
        ''' Currently reading data to train from file.
        But Already have function to read from database.'''
        logger.info('\n \n Running Pipeline for')
        logger.info(CONSTANTS.TRAIN.upper())
        X, y = MyUtils.get_train_data_from_file()
        Classification_Helper.train_model(
            MyUtils.process_data(data=X, type=None), y)
        logger.info('Training for model done')
        # Training part of Pipeline ends here
    else:
        try:
            if args[1].lower() == CONSTANTS.ALL:
                '''ALL option deletes all data from monthly table then runs pipeline from month of minimum date to current month - 1 '''
                logger.info('\n \n Running Pipeline for')
                logger.info(CONSTANTS.ALL.upper())
                Dao.delete_previous_data(tables=[
                    CONSTANTS.MONTHLY_SENTIMENT, CONSTANTS.MONTHLY_WORD_CLOUD,
                    CONSTANTS.MONTHLY_WORDS_VERBATIMS
                ],
                                         period=CONSTANTS.ALL)
                min_date = Dao.get_min_data_date()
                # Begin at the first day of the earliest raw-data month.
                start = min_date.replace(day=1)
            elif args[1].lower() == CONSTANTS.UPDATE:
                ''' UPDATE runs pipeline from next month of last entry date till current month - 1 '''
                logger.info('\n \n Running Pipeline for')
                logger.info(CONSTANTS.UPDATE.upper())
                start = Dao.get_last_updated_date(type=CONSTANTS.PREVIOUS)
                # start = start.replace(day=1) + relativedelta(months=1)
                # NOTE(review): the month of the last entry is re-processed
                # (deleted below and rebuilt) rather than skipped, as the
                # commented line above would have done.
                start = start.replace(day=1)
            # if datetime.now().month == 1:
            #     max_date_month = 12
            #     max_date_year = datetime.now().year - 1
            # else:
            #     max_date_month = datetime.now().month - 1
            #     max_date_year = datetime.now().year
            #
            # max_date = datetime(year=max_date_year, month=max_date_month, day=1).date()
            # Upper bound: last day of the current month.
            max_date = datetime.now().date().replace(
                day=MyUtils.get_last_date_of_month(datetime.now().date()).day)
            if start <= max_date:
                # start = start - relativedelta(months=1)
                logger.info('Deleting, if present, and running for %s' %
                            start.strftime("%m-%Y"))
                # Remove any partial results for the month being re-run.
                Dao.delete_previous_data(tables=[
                    CONSTANTS.MONTHLY_SENTIMENT, CONSTANTS.MONTHLY_WORD_CLOUD,
                    CONSTANTS.MONTHLY_WORDS_VERBATIMS
                ],
                                         period=CONSTANTS.PREVIOUS)
                if datetime.now().day <= 10 and start.month == max_date.month:
                    # Early in the month the previous month's data may still
                    # be settling, so rebuild that month as well.
                    logger.info(
                        'Date in first 10 days... So deleting for one more previous month'
                    )
                    Dao.delete_previous_data(tables=[
                        CONSTANTS.MONTHLY_SENTIMENT,
                        CONSTANTS.MONTHLY_WORD_CLOUD,
                        CONSTANTS.MONTHLY_WORDS_VERBATIMS
                    ],
                                             period=CONSTANTS.PREVIOUS)
                    start = start - relativedelta(months=1)
                # Process one calendar month per iteration up to max_date.
                while start <= max_date:
                    end = MyUtils.get_last_date_of_month(start)
                    new_data = Dao.get_new_data_date_range(start, end)
                    if len(new_data) > 0:
                        texts, issue_1, issue_3, agent_ids, dates, sids = zip(
                            *new_data)
                        to_be_inserted = []
                        last_date = MyUtils.get_last_date_of_month(date=start)
                        month_year = MyUtils.get_formatted_month_year_name(
                            date=last_date)
                        # Clean texts, then predict sentiment labels.
                        cleaned_data = MyUtils.process_data(data=texts,
                                                            type=None)
                        predicted = Classification_Helper.classify_new_data(
                            x_new=cleaned_data)
                        for verbatim, sentiment, lvl1, lvl3, agent_id, survey_date, sid in zip(
                                texts, predicted, issue_1, issue_3, agent_ids,
                                dates, sids):
                            to_be_inserted.append(
                                (last_date, verbatim, sentiment, lvl1, lvl3,
                                 agent_id, month_year, survey_date, sid))
                        # send list of tuples to insert
                        Dao.insert_into_monthly_sentiment(data=to_be_inserted)
                        # Word-cloud tables are built per sentiment class.
                        for sentiment_type in [
                                CONSTANTS.POSITIVE, CONSTANTS.NEGATIVE
                        ]:
                            one_sentiment_data = MyUtils.separate_single_sentiment_data(
                                data=zip(texts, predicted),
                                sentiment=sentiment_type)
                            cleaned_data_no_adj_adv = MyUtils.process_data(
                                data=one_sentiment_data,
                                type=CONSTANTS.NO_ADJ_ADV)
                            words_list, word_cloud_dict = MyUtils.get_word_cloud_data(
                                data=cleaned_data_no_adj_adv,
                                sentiment=sentiment_type)
                            verbatims_for_word_data = MyUtils.get_verbatims_for_word(
                                data=one_sentiment_data,
                                words=words_list,
                                sentiment=sentiment_type,
                                last_date=last_date,
                                month_year_name=month_year,
                                agent_id=CONSTANTS.ANALYST)
                            word_cloud_data = MyUtils.prepare_word_cloud_data(
                                last_date=last_date,
                                verbatims_for_words=verbatims_for_word_data,
                                word_cloud_dict=word_cloud_dict,
                                sentiment=sentiment_type,
                                nt_id=CONSTANTS.ANALYST,
                                month_year=month_year)
                            Dao.insert_word_cloud_data(data=word_cloud_data)
                            Dao.insert_monthly_words_verbatims(
                                words_verbatims_data=verbatims_for_word_data)
                    start = start + relativedelta(months=1)
        except Exception:
            logger.error('\n Exception occurred for %s \n ' %
                         traceback.format_exc())
            logger.error('Exception occurred for %s \n \n ' % sys.argv[1])
            sys.exit(1)
        else:
            logger.info('" %s " option processes done \n\n' %
                        sys.argv[1].upper())