# Example #1
    def lex_sent_batch(self,
                       df=None,
                       lexicons=None,
                       dataset_name='',
                       lower=True,
                       agg_type='sum',
                       discretize_sent=True):
        """
        Count sentiment based on lexicons for the provided dataset.

        Parameters
        ----------
        df : pandas.DataFrame
            It must be passed as a Pandas Data Frame with 'Document'
            and 'Sentiment' columns.

        lexicons : dict
            Dictionary with sentiment lexicons, mapping lexicon name to
            lexicon. Treated as empty when None.

        dataset_name : str
            Just dataset name - used for saving predictions.

        lower : bool
            Kept for backward compatibility with callers; not used in
            this method.

        agg_type : str
            Type of the aggregation function for counting the sentiment
            orientation. 'sum' by default. Other: 'avg', 'max', 'min'.

        discretize_sent : bool
            If you want to have continuous sentiment values (float) set
            this parameter to False. True by default.

        Returns
        -------
        df : pandas.DataFrame
            Dataset in Data Frame structure.

        self.lexicon_predictions : dict
            Dictionary with all predictions over all lexicons and all
            documents.

        self.results : dict
            Dictionary with all results-metrics, times of the execution,
            features, parameters etc. used in the experiment.

        classes : list or None
            List of the classes - true value of the sentiment from the
            dataset, used to evaluate the lexicons. None when
            ``self.output_results`` is falsy.

        Raises
        ------
        IOError
            If ``df`` is not a pandas DataFrame. (IOError is kept for
            backward compatibility with existing callers.)
        """
        start = datetime.now()  # starting time, stored in 'flow-time'
        log.info('Start {start_time}'.format(start_time=start))

        if not isinstance(df, pd.DataFrame):
            raise IOError('Wrong type of dataset, should be Data Frame')

        # Guard against the None default so lexicons.keys()/items() below
        # do not raise AttributeError.
        if lexicons is None:
            lexicons = {}

        log.info('Shape of dataset{}, columns: {}'.format(
            df.shape, df.columns))

        docs = df.Document

        log.info(
            'Lexicon based method will be counted with {} processes in parallel.'
            .format(self.n_jobs))
        # FIX: dict.iteritems() is Python-2-only; items() works on both
        # Python 2 and 3.
        preds = Parallel(n_jobs=self.n_jobs)(
            delayed(sentiment_lexicon)(docs, lex_name, lexicon, agg_type,
                                       self.stemming, self.progress_interval,
                                       self.output_results)
            for lex_name, lexicon in lexicons.items())

        log.info('PREDS: {}'.format(preds))
        # Merge the per-lexicon dicts returned by the parallel workers into
        # one mapping {lexicon_name: {doc_id: sentiment}}. (The previous
        # pre-initialization of this dict was dead code and was removed.)
        predictions = {k: v for d in preds for k, v in d.items()}

        if discretize_sent:
            # Map continuous sentiment scores onto discrete classes.
            for lex_name, vals in predictions.items():
                predictions[lex_name] = {
                    doc_id: self.sent_norm(score)
                    for doc_id, score in vals.items()
                }

        evaluation = Evaluation(f_path=self.output_results)
        df_evaluation = evaluation.build_df_lex_results(
            df=df,
            lex_names=list(lexicons.keys()),
            predictions=predictions,
            f_name=dataset_name)
        if self.output_results:
            res, classes = evaluation.evaluate_lexicons(
                df=df_evaluation,
                classifiers_to_evaluate=list(lexicons.keys()))
            self.results.update(res)
        else:
            classes = None

        self.lexicon_predictions.update(predictions)
        log.info('Results: {}'.format(self.results))
        # Record (start, end) wall-clock times of this run.
        self.results['flow-time'] = (start, datetime.now())

        return df, self.lexicon_predictions, self.results, classes