def lex_sent_batch(self, df=None, lexicons=None, dataset_name='', lower=True,
                   agg_type='sum', discretize_sent=True):
    """Count sentiment based on lexicons for the provided dataset.

    Runs one lexicon-counting job per lexicon in parallel (joblib),
    optionally discretizes the scores, builds an evaluation frame and
    (when ``self.output_results`` is set) evaluates every lexicon.

    Parameters
    ----------
    df : pandas.DataFrame
        Must be a pandas DataFrame with 'Document' and 'Sentiment' columns.
    lexicons : dict
        Dictionary with sentiment lexicons, mapping lexicon name -> lexicon.
    dataset_name : str
        Dataset name - used for saving predictions.
    lower : bool
        Kept for API compatibility; not used inside this method.
    agg_type : str
        Aggregation function for counting the sentiment orientation.
        'sum' by default. Other options: 'avg', 'max', 'min'.
    discretize_sent : bool
        If you want continuous sentiment values (float) set this to False.
        True by default (scores are passed through ``self.sent_norm``).

    Returns
    -------
    df : pandas.DataFrame
        The dataset that was passed in.
    self.lexicon_predictions : dict
        Predictions over all lexicons and all documents.
    self.results : dict
        Metrics, execution times, parameters etc. for the experiment.
    classes : list or None
        True sentiment values from the dataset (used for evaluation),
        or None when ``self.output_results`` is falsy.

    Raises
    ------
    IOError
        If ``df`` is not a DataFrame or ``lexicons`` is not a dict.
    """
    start = datetime.now()  # starting time
    log.info('Start {start_time}'.format(start_time=start))

    # Fail fast with clear messages instead of opaque AttributeErrors below.
    if not isinstance(df, pd.DataFrame):
        raise IOError('Wrong type of dataset, should be Data Frame')
    if not isinstance(lexicons, dict):
        raise IOError('Wrong type of lexicons, should be a dict')

    log.info('Shape of dataset{}, columns: {}'.format(df.shape, df.columns))
    docs = df.Document

    log.info(
        'Lexicon based method will be counted with {} processes in parallel.'
        .format(self.n_jobs))
    # One parallel job per lexicon; each returns a one-entry dict
    # {lexicon_name: {doc_index: score}}.
    preds = Parallel(n_jobs=self.n_jobs)(
        delayed(sentiment_lexicon)(docs, lex_name, lexicon, agg_type,
                                   self.stemming, self.progress_interval,
                                   self.output_results)
        for lex_name, lexicon in lexicons.items())
    log.info('PREDS: {}'.format(preds))
    # Merge the per-lexicon prediction dicts returned by the workers.
    predictions = {k: v for d in preds for k, v in d.items()}

    if discretize_sent:
        # Re-assigning existing keys only, so mutating while iterating
        # items() is safe (no resize of the dict).
        for lex_name, vals in predictions.items():
            predictions[lex_name] = {
                k: self.sent_norm(v) for k, v in vals.items()
            }

    evaluation = Evaluation(f_path=self.output_results)
    df_evaluation = evaluation.build_df_lex_results(
        df=df,
        lex_names=lexicons.keys(),
        predictions=predictions,
        f_name=dataset_name)

    if self.output_results:
        res, classes = evaluation.evaluate_lexicons(
            df=df_evaluation,
            classifiers_to_evaluate=lexicons.keys())
        self.results.update(res)
    else:
        classes = None

    self.lexicon_predictions.update(predictions)
    log.info('Results: {}'.format(self.results))
    self.results['flow-time'] = (start, datetime.now())
    return df, self.lexicon_predictions, self.results, classes