Example #1
def get_max_db_date():
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_max_db_date']

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        cursor = conn.cursor()

        logger.debug(' Read query = " %s "', sql)

        cursor.execute(sql)
        date = cursor.fetchone()

        return date[0] if date and date[0] is not None else datetime(
            datetime.now().year, 1, 1).date()

    except Exception:
        logger.error("Exception in get_last_date : %s ",
                     traceback.format_exc())
        sys.exit()

    finally:
        if cursor: cursor.close()
        if conn: conn.close()
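
A note on the cleanup pattern above: the same read-with-fallback can be written with context managers so the cursor and connection are released even on error paths. A minimal sketch, assuming psycopg2 is imported as pg (2.5+ for the cursor context manager) and that db_details and sql are obtained as above; get_max_db_date_sketch is a hypothetical name:

from contextlib import closing
from datetime import datetime

def get_max_db_date_sketch(db_details, sql):
    # closing() guarantees conn.close(); psycopg2's own connection context
    # manager only commits/rolls back the enclosed transaction.
    with closing(pg.connect(database=db_details['name'],
                            user=db_details['user'],
                            password=db_details['password'],
                            host=db_details['host'],
                            port=db_details['port'])) as conn:
        with conn.cursor() as cursor:  # cursor is closed on exit
            cursor.execute(sql)
            row = cursor.fetchone()
    # Fall back to Jan 1 of the current year when the table has no dates.
    return row[0] if row and row[0] is not None else datetime(
        datetime.now().year, 1, 1).date()
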
Example #2
def get_new_data_date_range(date1, date2):
    """
    Fetches new data from database for given date range
    :return: list of tuples of data fetched from database
    """

    logger.info(' Retrieving new data from db ')

    conn = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_new_data_date_range']
    # last_date = get_last_updated_date()

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        if not isinstance(date1, str):
            date1 = date1.strftime('%Y-%m-%d')
        if not isinstance(date2, str):
            date2 = date2.strftime('%Y-%m-%d')

        sql = sql.format(date1, date2)

        logger.debug(' Read query = " %s "', sql)

        data_frame = pd.read_sql(sql, conn)

        if len(data_frame.index) == 0:
            logger.debug('Data frame empty for %s to %s', date1, date2)

    except (pg.Error, Exception):
        logger.error('Exception in get_new_data_date_range in Dao.py : %s',
                     traceback.format_exc())
        sys.exit()

    else:
        data = []
        for row in data_frame.itertuples():
            text = row.text.strip()
            if len(text) > 0 and guessLanguage(text) in ['en', 'UNKNOWN']:
                level_1 = "null" if row.level_1 is None else row.level_1
                level_3 = "null" if row.level_3 is None else row.level_3
                date = row.date.strftime("%Y-%m-%d")
                data.append(
                    (text, level_1, level_3, row.attuid, date, row.sid))

        logger.info('Success in get_new_data_date_range()')
        return data

    finally:
        if conn: conn.close()
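
Note that sql.format(date1, date2) splices the dates directly into the query string. If the configured query can be changed, a parameterized form lets the driver do the quoting; a minimal sketch, assuming the query in config.properties is rewritten with %s placeholders (e.g. ... WHERE date BETWEEN %s AND %s):

# Hypothetical variant: psycopg2 binds the dates itself, so no string
# formatting of the SQL is needed.
data_frame = pd.read_sql(sql, conn, params=(date1, date2))
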
Example #3
def insert_train_data_into_db_from_file():
    """
    Checks whether a labelled-texts CSV file exists under the train_data dir. If it does, reads the texts
     and inserts them into the database with their corresponding labels. Otherwise does nothing.
     Uses a pandas data frame to read and hold the csv data and psycopg2 to connect to the db.
     The query used is specified in config.properties file and is named 'insert_training_data'
    """

    if os.path.isfile('train_data/texts.csv'):
        print(' Reading train set from texts.csv file in train_data dir ')

        df = pd.read_csv('train_data/texts.csv', header=None)

        # Column 0 holds the text, column 1 the label; zip pairs them row by row.
        texts = df.iloc[:, 0]
        labels = df.iloc[:, 1]
        data = list(zip(texts, labels))

        conn = None
        cursor = None

        db = MyUtils.get_db_details()
        insert_sql = MyUtils.get_sql_dict()['insert_training_data']

        try:
            conn = pg.connect(database=db['name'],
                              user=db['user'],
                              password=db['password'],
                              host=db['host'],
                              port=db['port'])

            cursor = conn.cursor()

            record_template = ','.join(['%s'] * len(data))
            insert_query = insert_sql.format(record_template)
            logger.debug(' Insert sql = " %s "', insert_query)

            logger.info(' Executing query for inserting train data into db ')
            cursor.execute(insert_query, data)

            conn.commit()
            print(' commit done ')

        except (pg.Error, Exception):
            logger.error(
                ' Exception in insert_train_data_into_db_from_file: %s ',
                traceback.format_exc())
            sys.exit()

        else:
            print('Successfully inserted train data')

        finally:
            if cursor: cursor.close()
            if conn: conn.close()
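
The record_template trick above builds one multi-row INSERT: with two rows, ','.join(['%s'] * len(data)) yields '%s,%s', and psycopg2 adapts each tuple in data into a parenthesized record. psycopg2.extras.execute_values performs the same expansion without manual templating; a sketch under the assumption that insert_sql contains a single '{}' slot after VALUES:

from psycopg2 import extras

# Equivalent bulk insert: execute_values expands the lone VALUES %s
# placeholder into one record per tuple in `data`.
bulk_sql = insert_sql.format('%s')  # e.g. 'INSERT INTO t (text, label) VALUES %s'
extras.execute_values(cursor, bulk_sql, data)
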
Example #4
def insert_predicted_into_db(data, period_type):
    """
    Inserts predicted data (texts with their predicted labels, plus issue level_1 and level_3 values)
     into the database.
    Uses psycopg2 to connect to db.
     The query used is specified in config.properties file and is one of 'insert_monthly_pos_neg',
     'insert_weekly_pos_neg' or 'insert_daily_pos_neg', chosen by period_type.
    """

    logger.debug(' Inserting predicted data into database ')

    conn = None
    cursor = None

    db = MyUtils.get_db_details()
    sql_names = {
        CONSTANTS.MONTHLY: 'insert_monthly_pos_neg',
        CONSTANTS.WEEKLY: 'insert_weekly_pos_neg',
    }
    insert_sql = MyUtils.get_sql_dict()[sql_names.get(
        period_type.lower(), 'insert_daily_pos_neg')]

    try:
        conn = pg.connect(database=db['name'],
                          user=db['user'],
                          password=db['password'],
                          host=db['host'],
                          port=db['port'])

        cursor = conn.cursor()

        record_template = ','.join(['%s'] * len(data))
        insert_query = insert_sql.format(record_template)
        logger.debug(' Insert sql = " %s "', insert_query)

        logger.debug(' Executing query for inserting data into db ')
        cursor.execute(insert_query, data)

        conn.commit()
        logger.info(' commit done ')

    except (pg.Error, Exception):
        logger.error(' Exception in insert_predicted_into_db: %s ',
                     traceback.format_exc())
        sys.exit()

    else:
        logger.info('Successfully inserted predicted data')

    finally:
        if cursor: cursor.close()
        if conn: conn.close()
Example #5
def insert_into_monthly_sentiment(data):
    """
    Inserts rows of (text, label, issue1, issue3, agent_id) data into the database.
    Uses psycopg2 to connect to db.
    The query used is specified in config.properties file and is named 'insert_monthly_sentiment'
    """

    logger.debug(' Inserting insert_monthly_sentiment data into database ')

    conn = None
    cursor = None

    db = MyUtils.get_db_details()
    insert_sql = MyUtils.get_sql_dict()['insert_monthly_sentiment']

    try:
        conn = pg.connect(database=db['name'],
                          user=db['user'],
                          password=db['password'],
                          host=db['host'],
                          port=db['port'])

        cursor = conn.cursor()

        record_template = ','.join(['%s'] * len(data))
        logger.debug(' Insert sql = " %s "', insert_sql)
        insert_query = insert_sql.format(record_template)

        logger.debug(' Executing query for inserting data into db ')
        cursor.execute(insert_query, data)

        conn.commit()
        logger.info(' commit done ')

    except (pg.Error, Exception):
        logger.error(' Exception in insert_into_monthly_sentiment: %s ',
                     traceback.format_exc())
        raise

    else:
        logger.info('Successfully inserted monthly sentiment data')

    finally:
        if cursor: cursor.close()
        if conn: conn.close()
Example #6
def get_new_data(date):
    """
    Fetches new data from database on which prediction needs to be performed
    :return: list of tuples of data fetched from database
    """

    print(' Retrieving new data from db ')

    conn = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_new_data']

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        sql = sql.format(date.strftime('%Y-%m-%d'))
        logger.debug(' Read query = " %s "', sql)

        data_frame = pd.read_sql(sql, conn)

        if len(data_frame.index) == 0:
            print('Data frame empty. No data after date : ', date)
            raise Exception('No data after date: %s' % date)

    except (pg.Error, Exception):
        logger.error(
            'Exception in get_new_data in Classification_Helper.py: %s ',
            traceback.format_exc())
        sys.exit()
    else:
        data = []
        for row in data_frame.itertuples():
            data.append((row.text, row.nt_id))

        print('Success in get_new_data()')
        return data
    finally:
        if conn: conn.close()
Example #7
def get_last_updated_date(type):
    """
    Retrieves last entry date for predicted or raw data in database depending upon type.

    :param type: one of 'PREVIOUS' or 'NEW'; decides whether to return the last (max) predicted-data date or
     the max new-data date

    :return: date object
    """

    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql_name = {
        CONSTANTS.PREVIOUS: 'get_last_result_date',
        CONSTANTS.NEW: 'get_max_new_data_date',
    }.get(type)
    sql = MyUtils.get_sql_dict()[sql_name] if sql_name else None

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        cursor = conn.cursor()

        logger.debug(' Read query = " %s "', sql)

        cursor.execute(sql)
        date = cursor.fetchone()

        return date[0] if date[0] is not None else datetime(
            year=datetime.now().year, month=1, day=1).date()

    except Exception:
        logger.error("Exception in get_last_date : %s ",
                     traceback.format_exc())
        sys.exit()

    finally:
        if cursor: cursor.close()
        if conn: conn.close()
Example #8
def delete_previous_data(tables, period):
    """
    Deletes previous data from the database for each table in the 'tables' list
    :return: None
    """
    sql = None
    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        cursor = conn.cursor()

        # Base names of the delete queries for each known table.
        sql_names = {
            CONSTANTS.MONTHLY_SENTIMENT: 'delete_monthly_sentiment_data',
            CONSTANTS.MONTHLY_WORD_CLOUD: 'delete_monthly_word_cloud',
            CONSTANTS.MONTHLY_WORDS_VERBATIMS: 'delete_monthly_words_verbatims',
        }

        for table in tables:
            base_name = sql_names.get(table.lower())

            if base_name is None or period.lower() not in (CONSTANTS.ALL,
                                                           CONSTANTS.PREVIOUS):
                sql = None
            elif period.lower() == CONSTANTS.ALL:
                sql = MyUtils.get_sql_dict()[base_name]
            else:
                # CONSTANTS.PREVIOUS deletes only the last month's rows
                sql = MyUtils.get_sql_dict()[base_name + '_last_month']

            logger.debug(' Read query = " %s "', sql)

            if sql is None:
                raise Exception(
                    "sql can not be 'None' type."
                    " No or wrong table name(s) passed to delete_previous_data() in MyUtils.py"
                )

            cursor.execute(sql)

            conn.commit()

    except Exception:
        logger.error("Exception in delete_previous_data : %s ",
                     traceback.format_exc())
        raise
    finally:
        if cursor: cursor.close()
        if conn: conn.close()
Example #9
def get_train_data_from_db():
    """
    Retrieves train data from database based on the sql query
     provided in the config file where the sql name is 'get_training_data'
    :return: data(texts) and label(targets)
    """

    print(' Retrieving train data from db ')

    conn = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_training_data']

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        logger.debug(' Read query = " %s "', sql)

        data_frame = pd.read_sql(sql, conn)
        data = data_frame.text
        label = data_frame.label

    except (pg.Error, Exception):
        logger.error(
            'Exception in get_train_data_from_db in Classification_Helper.py: %s ',
            traceback.format_exc())
        sys.exit()

    else:
        print('Success in get_train_data_from_db()')
        return data, label

    finally:
        if conn: conn.close()
Example #10
def get_min_data_date():
    """
    Returns the minimum date from the raw data
    :return: date object
    """

    conn = None
    cursor = None
    db_details = MyUtils.get_db_details()
    sql = MyUtils.get_sql_dict()['get_min_db_date']

    try:
        conn = pg.connect(database=db_details['name'],
                          user=db_details['user'],
                          password=db_details['password'],
                          host=db_details['host'],
                          port=db_details['port'])

        cursor = conn.cursor()

        logger.debug(' Read query = " %s "', sql)

        cursor.execute(sql)
        date = cursor.fetchone()

        return date[0] if date[0] is not None else datetime(
            datetime.now().year, 1, 1).date()

    except Exception:
        logger.error("Exception in get_min_data_date : ",
                     traceback.format_exc())
        raise Exception

    finally:
        if cursor: cursor.close()
        if conn: conn.close()
Example #11
def classify_new_data(x_new):
    """
    Loads the pickled classifier and tfidf vectorizer then uses those for transforming and classifying new data
    :param x_new: New data that needs to be classified
    :return: prediction on new data
    """

    file_path_dict = MyUtils.get_file_path_dict()

    with open(file_path_dict[CONSTANTS.TRAINED_MODEL], 'rb') as file:
        classifier = pickle.load(file)
    with open(file_path_dict[CONSTANTS.TFIDF_VECTORIZER], 'rb') as file:
        tfidf_vect = pickle.load(file)

    predicted = classifier.predict(tfidf_vect.transform(x_new))
    return predicted
Example #12
def train_model(X_data, y_data):
    """
    Trains the model by using the parameter values. Default classifier is MultinomialNB
    Also stores the trained model and fitted tfidf vectorizer on disk with file names 'trained_model' and 'tfidf_vect' under utility dir
    :return: Fitted model and fitted tfidf_vectorizer
    """

    file_path_dict = MyUtils.get_file_path_dict()

    tfidf_vect = get_tfidf_vectorizer()
    classifier = get_classifier()
    X_tfidfed = tfidf_vect.fit_transform(X_data)
    classifier.fit(X_tfidfed, y_data)

    with open(file_path_dict[CONSTANTS.TRAINED_MODEL], 'wb') as file:
        pickle.dump(classifier, file)
    with open(file_path_dict[CONSTANTS.TFIDF_VECTORIZER], 'wb') as file:
        pickle.dump(tfidf_vect, file)

    return classifier, tfidf_vect
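
A quick round trip through the two helpers above; a sketch with toy data, assuming MyUtils.get_file_path_dict() resolves writable paths for the pickles:

# Hypothetical smoke test: train on a tiny corpus, then classify new text.
texts = ['great service, thanks', 'terrible wait time', 'very helpful agent']
labels = ['positive', 'negative', 'positive']

classifier, tfidf_vect = train_model(texts, labels)         # pickles both to disk
predictions = classify_new_data(['the agent was helpful'])  # reloads the pickles
print(predictions)
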
Example #13
def text_mining_pipeline(args, log_dir):
    LOGGING_LEVEL = logging.DEBUG

    LOG_FILE_NAME = os.path.join(log_dir, 'TEXT_MINING_LOG.out')
    logger = logging.getLogger(__name__)

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s ')
    handler = RotatingFileHandler(LOG_FILE_NAME,
                                  maxBytes=10 * 1024,  # rotate after ~10 KB per file
                                  backupCount=5)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    dao_logger, utils_logger = Dao.logger, MyUtils.logger

    logger.setLevel(LOGGING_LEVEL)
    dao_logger.setLevel(LOGGING_LEVEL)
    utils_logger.setLevel(LOGGING_LEVEL)

    dao_logger.addHandler(handler)
    utils_logger.addHandler(handler)

    start, max_date_month = None, None

    print('\n \n Check LOG file for status \n \n ')

    if len(args) == 1:
        logger.error(
            "\n No command line argument passed ! \n"
            " \n YOU are supposed to pass one script parameter \n"
            " \n Possible values -> \n 1. 'train'  2. 'all'  3. 'update' \n \n "
        )
        sys.exit(1)

    elif args[1].lower() not in [
            CONSTANTS.TRAIN, CONSTANTS.ALL, CONSTANTS.UPDATE
    ]:
        logger.error(
            "Wrong parameter passed \n  \n Possible values -> \n 1. 'train'  2. 'all'  3. 'update' \n \n "
        )
        sys.exit(1)

    elif args[1].lower() == CONSTANTS.TRAIN:
        ''' Currently reads training data from file, but a function to read it from the database already exists. '''

        logger.info('\n \n Running Pipeline for')
        logger.info(CONSTANTS.TRAIN.upper())

        X, y = MyUtils.get_train_data_from_file()
        Classification_Helper.train_model(
            MyUtils.process_data(data=X, type=None), y)
        logger.info('Training for model done')
        # Training part of Pipeline ends here

    else:
        try:
            if args[1].lower() == CONSTANTS.ALL:
                '''ALL option deletes all data from the monthly tables then runs the pipeline from the
                month of the minimum data date through the current month '''

                logger.info('\n \n Running Pipeline for')
                logger.info(CONSTANTS.ALL.upper())

                Dao.delete_previous_data(tables=[
                    CONSTANTS.MONTHLY_SENTIMENT, CONSTANTS.MONTHLY_WORD_CLOUD,
                    CONSTANTS.MONTHLY_WORDS_VERBATIMS
                ],
                                         period=CONSTANTS.ALL)

                min_date = Dao.get_min_data_date()
                start = min_date.replace(day=1)

            elif args[1].lower() == CONSTANTS.UPDATE:
                ''' UPDATE re-runs the pipeline from the month of the last entry date through the current month '''

                logger.info('\n \n Running Pipeline for')
                logger.info(CONSTANTS.UPDATE.upper())

                start = Dao.get_last_updated_date(type=CONSTANTS.PREVIOUS)
                # start = start.replace(day=1) + relativedelta(months=1)
                start = start.replace(day=1)

            # if datetime.now().month == 1:
            #     max_date_month = 12
            #     max_date_year = datetime.now().year - 1
            # else:
            #     max_date_month = datetime.now().month - 1
            #     max_date_year = datetime.now().year
            #
            # max_date = datetime(year=max_date_year, month=max_date_month, day=1).date()
            # max_date = the last day of the current month
            max_date = datetime.now().date().replace(
                day=MyUtils.get_last_date_of_month(datetime.now().date()).day)

            if start <= max_date:
                # start = start - relativedelta(months=1)
                logger.info('Deleting, if present, and running for %s' %
                            start.strftime("%m-%Y"))

                Dao.delete_previous_data(tables=[
                    CONSTANTS.MONTHLY_SENTIMENT, CONSTANTS.MONTHLY_WORD_CLOUD,
                    CONSTANTS.MONTHLY_WORDS_VERBATIMS
                ],
                                         period=CONSTANTS.PREVIOUS)

                if datetime.now().day <= 10 and start.month == max_date.month:
                    logger.info(
                        'Date in first 10 days... So deleting for one more previous month'
                    )

                    Dao.delete_previous_data(tables=[
                        CONSTANTS.MONTHLY_SENTIMENT,
                        CONSTANTS.MONTHLY_WORD_CLOUD,
                        CONSTANTS.MONTHLY_WORDS_VERBATIMS
                    ],
                                             period=CONSTANTS.PREVIOUS)
                    start = start - relativedelta(months=1)

            while start <= max_date:
                end = MyUtils.get_last_date_of_month(start)
                new_data = Dao.get_new_data_date_range(start, end)

                if len(new_data) > 0:
                    texts, issue_1, issue_3, agent_ids, dates, sids = zip(
                        *new_data)

                    to_be_inserted = []
                    last_date = MyUtils.get_last_date_of_month(date=start)
                    month_year = MyUtils.get_formatted_month_year_name(
                        date=last_date)
                    cleaned_data = MyUtils.process_data(data=texts, type=None)
                    predicted = Classification_Helper.classify_new_data(
                        x_new=cleaned_data)

                    for verbatim, sentiment, lvl1, lvl3, agent_id, survey_date, sid in zip(
                            texts, predicted, issue_1, issue_3, agent_ids,
                            dates, sids):
                        to_be_inserted.append(
                            (last_date, verbatim, sentiment, lvl1, lvl3,
                             agent_id, month_year, survey_date, sid))

                    # send list of tuples to insert
                    Dao.insert_into_monthly_sentiment(data=to_be_inserted)

                    for sentiment_type in [
                            CONSTANTS.POSITIVE, CONSTANTS.NEGATIVE
                    ]:
                        one_sentiment_data = MyUtils.separate_single_sentiment_data(
                            data=zip(texts, predicted),
                            sentiment=sentiment_type)

                        cleaned_data_no_adj_adv = MyUtils.process_data(
                            data=one_sentiment_data, type=CONSTANTS.NO_ADJ_ADV)

                        words_list, word_cloud_dict = MyUtils.get_word_cloud_data(
                            data=cleaned_data_no_adj_adv,
                            sentiment=sentiment_type)

                        verbatims_for_word_data = MyUtils.get_verbatims_for_word(
                            data=one_sentiment_data,
                            words=words_list,
                            sentiment=sentiment_type,
                            last_date=last_date,
                            month_year_name=month_year,
                            agent_id=CONSTANTS.ANALYST)

                        word_cloud_data = MyUtils.prepare_word_cloud_data(
                            last_date=last_date,
                            verbatims_for_words=verbatims_for_word_data,
                            word_cloud_dict=word_cloud_dict,
                            sentiment=sentiment_type,
                            nt_id=CONSTANTS.ANALYST,
                            month_year=month_year)

                        Dao.insert_word_cloud_data(data=word_cloud_data)

                        Dao.insert_monthly_words_verbatims(
                            words_verbatims_data=verbatims_for_word_data)

                start = start + relativedelta(months=1)

        except Exception:
            logger.error('\n Exception occurred for %s \n ' %
                         traceback.format_exc())
            logger.error('Exception occurred for %s \n \n ' % sys.argv[1])
            sys.exit(1)
        else:
            logger.info('" %s " option processes done \n\n' %
                        sys.argv[1].upper())
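
text_mining_pipeline takes its mode from args[1], so a thin entry point that forwards sys.argv is enough to run it; a sketch with a hypothetical 'logs' directory that is assumed to exist:

if __name__ == '__main__':
    # Mode comes from the first script argument: 'train', 'all' or 'update'.
    text_mining_pipeline(sys.argv, log_dir='logs')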