import datetime
import math
import os

import FaCommon.Helpers

# NOTE: The project-internal classes used below (BasicDbModel, LexiconSentimentAnalyzer,
# TextWriter, TextProcessing, StockPriceProcessor and the metrics calculators) are
# assumed to be imported from their packages; their module paths are not shown in this file.


class SimpleExporter(object):

    def __init__(self, output_dir, verbose=False):
        self.dbmodel = BasicDbModel()
        self.s_analyzer = LexiconSentimentAnalyzer()
        self.text_writer = TextWriter(os.path.abspath(output_dir))
        self.verbose = verbose

    def analyze_company(self, company_id, from_date, to_date):
        """
        Analyze documents of the given company - just write VADER scores of every document.
        :param company_id:
        :param from_date:
        :param to_date:
        :return:
        """
        print('==Company %d==' % company_id)
        # For every source, create a standalone file.
        for source in ['articles', 'fb_posts', 'fb_comments', 'tweets']:
            self._analyze_documents_by_source(company_id, source, from_date, to_date)

    def analyze_all_companies(self, from_date, to_date):
        companies = self.dbmodel.get_companies()
        # For every source, create a standalone file.
        for source in ['articles', 'fb_posts', 'fb_comments', 'tweets']:
            print('====Processing %s====' % source)
            # Prepare file.
            header_line = ['company_id', 'date', 'sentiment_number', 'sentiment_polarity']
            self.text_writer.write_file([header_line], source, 'csv', ',', 'w')
            # Browse all companies.
            for comp in companies:
                print('==Company %d==' % comp['id'])
                self._analyze_company_source(comp['id'], source, from_date, to_date, source)


    def _analyze_company_source(self, company_id, source_type, from_date, to_date, write_to_filename):
        # Get documents.
        documents = getattr(self.dbmodel, 'get_'+source_type)(company_id, from_date, to_date)
        # Create a final list.
        docs_list = []
        for doc in documents:
            # Process the text: articles use the article pipeline, social sources the Facebook pipeline.
            is_article = (source_type == 'articles')
            if is_article:
                text = TextProcessing.process_article_text(doc['text'])
            else:
                text = TextProcessing.process_facebook_text(doc['text'])
            # Skip empty documents.
            if len(text) == 0:
                continue
            # Get the sentiment value of the text.
            sentiment_number = self.s_analyzer.calculate_vader_sentiment('custom_dict_orig', text, is_article)
            sentiment_polarity = self.s_analyzer.format_sentiment_value(sentiment_number)
            # Save data.
            doc_date = self._get_doc_date(source_type, doc)
            docs_list.append([company_id, doc_date, sentiment_number, sentiment_polarity])
        # Write to file.
        self.text_writer.write_file(docs_list, write_to_filename, 'csv', ',', 'a')
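
    # Example output row (hypothetical values, assuming format_sentiment_value maps
    # positive scores to 'pos'): a document of company 42 published on 2015-03-17
    # with a VADER score of 0.3612 would be written as [42, '2015-03-17', 0.3612, 'pos'].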


    def _analyze_documents_by_source(self, company_id, source_type, from_date, to_date):
        # Get documents
        documents = getattr(self.dbmodel, 'get_'+source_type)(company_id, from_date, to_date)
        # Create a final list
        docs_list = []
        for doc in documents:
            # Process text
            if source_type == 'articles':
                text = TextProcessing.process_article_text(doc['text'])
            else:
                text = TextProcessing.process_facebook_text(doc['text'])
            # Skip empty documents
            if len(text) == 0:
                continue
            # Get sentiment values of the text
            sent_sum, sent_division = self.s_analyzer.calculate_vader_sentiment_values('vader', text)
            # Add this to list
            docs_list.append([sent_sum, sent_division, text])
        # Prepare header
        header = ['sentiment_sum', 'sentiment_division', 'text']
        docs_list.insert(0, header)
        file_name = '%d_%s' % (company_id, source_type)
        self.text_writer.write_file(docs_list, file_name, 'csv', '\t', 'w')


    def _get_doc_date(self, source_type, doc):
        if source_type == 'articles':
            date_obj = doc['published_date']
        elif source_type in ['fb_posts', 'fb_comments']:
            date_obj = self.dbmodel.from_timestamp_to_date(doc['created_timestamp'])
        elif source_type == 'tweets':
            date_obj = doc['created_at']
        else:
            raise ValueError('Unknown source type: %s' % source_type)
        return date_obj.strftime('%Y-%m-%d')
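

# Example usage of SimpleExporter (illustrative sketch, not part of the original
# module). The output directory and the date bounds are assumptions; the class only
# needs a writable directory and date objects, as used above.
#
#   exporter = SimpleExporter('output', verbose=True)
#   exporter.analyze_all_companies(datetime.date(2015, 1, 1), datetime.date(2015, 6, 30))
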
class DocumentsAnalyzer(object):

    def __init__(self, output_dir, verbose=False):
        """
        :param output_dir: Absolute filepath to output directory.
        :param verbose: boolean: Write info to console.
        :return:
        """
        self.dbmodel = BasicDbModel()
        self.s_analyzer = LexiconSentimentAnalyzer()
        self.text_writer = TextWriter(output_dir)   # writing CSV files
        self.verbose = verbose  # verbose output
        self.stock_processor = StockPriceProcessor()    # Object for price movements
        self.source_metrics_calculator = SourceMetricsCalculator(output_dir)
        self.total_metrics_calculator = TotalMetricsCalculator(output_dir)
        self.source_metrics_calculator_2_classes = SourceMetricsCalculator2classes(output_dir)

    ## PUBLIC methods for analyzing documents

    def analyze_all_companies(self, from_date, to_date, file_name, price_type, const_boundaries, used_dict_name='vader', classes_count=3):
        """
        Analyze all documents for all companies.

        :param from_date:
        :param to_date:
        :param file_name:
        :param price_type:
        :param used_dict_name:
        :return:
        """
        # Reset files.
        self.text_writer.write_econometric_file(file_name, [self._get_days_stats_header()], 'w')
        total_m_header = self.total_metrics_calculator.get_total_metrics_header()
        self.text_writer.write_econometric_file(file_name + '_total-metrics', [total_m_header], 'w')
        source_m_header = self.source_metrics_calculator.get_source_metrics_header()
        self.text_writer.write_econometric_file(file_name + '_source-metrics', [source_m_header], 'w')
        # Process companies
        companies = self.dbmodel.get_companies_order_by_total_documents(from_date, to_date)
        for comp in companies:
            print("<<<<<Company %d>>>>>") % comp['id']
            if not self.verbose:
                with FaCommon.Helpers.suppress_stdout():
                    self.analyze_company(comp['id'], from_date, to_date, file_name, price_type, const_boundaries, used_dict_name, False, classes_count)
            else:
                self.analyze_company(comp['id'], from_date, to_date, file_name, price_type, const_boundaries, used_dict_name, False, classes_count)
        print('>>> All results saved.')

    def analyze_company(self, company_id, from_date, to_date, file_name, price_type, const_boundaries, used_dict_name, write_header=False, classes_count=3):
        """
        Analyze documents about company (from_date -> present date).

        :return: list of days, where every row contains information for documents for this day.
        """
        # Prepare variables.
        examined_date = from_date
        last_date = to_date
        total_data = []
        max_sent = float('-inf')

        # Set stock prices for this company ID.
        self.stock_processor.set_stock_prices(company_id, examined_date, price_type)

        # Prepare list for writing to a file.
        # For every day (from "from_date" to "to_date"), query the DB for documents created on the day.
        while examined_date <= last_date:
            print("===%s===") % examined_date
            # For every document type, process all documents and count number of neutral, positive, negative documents.
            yahoo_values = self._process_yahoo(company_id, examined_date, used_dict_name)
            fb_p_values = self._process_fb_posts(company_id, examined_date, used_dict_name)
            fb_c_values = self._process_fb_comments(company_id, examined_date, used_dict_name)
            tw_values = self._process_tweets(company_id, examined_date, used_dict_name)
            # Save acquired data
            day_data = [
                company_id,
                examined_date.strftime('%d.%m.%Y'),
                fb_p_values['neu'], fb_p_values['pos'], fb_p_values['neg'],
                fb_c_values['neu'], fb_c_values['pos'], fb_c_values['neg'],
                yahoo_values['neu'], yahoo_values['pos'], yahoo_values['neg'],
                tw_values['neu'], tw_values['pos'], tw_values['neg'],
            ]
            # Get stock price movement direction for 1,2,3 days from examined date. Also for previous day.
            day_data.append(self.stock_processor.get_price_movement_with_delay(examined_date, -1, const_boundaries))
            day_data.append(self.stock_processor.get_price_movement_with_delay(examined_date, 1, const_boundaries))
            day_data.append(self.stock_processor.get_price_movement_with_delay(examined_date, 2, const_boundaries))
            day_data.append(self.stock_processor.get_price_movement_with_delay(examined_date, 3, const_boundaries))
            # Calculate simple sentiment for all sources.
            fb_post_s = self._calc_source_sentiment(fb_p_values)
            fb_comment_s = self._calc_source_sentiment(fb_c_values)
            yahoo_s = self._calc_source_sentiment(yahoo_values)
            twitter_s = self._calc_source_sentiment(tw_values)
            day_data.extend([fb_post_s, fb_comment_s, yahoo_s, twitter_s])
            # Calculate overall sentiment for the day.
            (max_sent, day_sent) = self._calc_overall_sentiment_for_day(max_sent, fb_p_values, fb_c_values, yahoo_values, tw_values)
            day_data.append(day_sent)
            # Save day data to total data.
            total_data.append(day_data)
            # Increment examined date.
            examined_date = examined_date + datetime.timedelta(days=1)

        # Normalize sentiment values, using the maximum observed sentiment as alpha.
        for day_data in total_data:
            norm_sent = self._normalize_sentiment(day_data[-1], max_sent)
            day_data[-1] = self._format_sentiment(norm_sent)

        # Write results to file.
        if write_header:
            total_data.insert(0, self._get_days_stats_header())
            self.text_writer.write_econometric_file(file_name, total_data, 'w')
            del total_data[0]
        else:
            self.text_writer.write_econometric_file(file_name, total_data, 'a')

        # Calculate metrics by source.
        m_filename = file_name + '_source-metrics'
        if classes_count == 3:
            self.source_metrics_calculator.calculate_metrics_by_source(company_id, total_data, m_filename, price_type, write_header)
        else:
            self.source_metrics_calculator_2_classes.calculate_metrics_by_source(company_id, total_data, m_filename, price_type, write_header)

        # Calculate total metrics.
        m_filename = file_name + '_total-metrics'
        self.total_metrics_calculator.calculate_total_metrics(company_id, total_data, m_filename, price_type, write_header)



    #### PRIVATE methods for processing documents

    def _process_fb_posts(self, company_id, examined_date, used_dict_name='vader'):
        # Select all FB posts for the given company created on the given date.
        posts = self.dbmodel.get_daily_fb_posts(company_id, examined_date)
        counter = {'pos': 0, 'neu': 0, 'neg': 0}
        # Calculate sentiment for every post and count the polarities.
        for post in posts:
            post_text = TextProcessing.process_facebook_text(post['text'])
            if len(post_text) == 0:
                continue    # skip empty posts
            sent_value = self.s_analyzer.calculate_vader_sentiment(used_dict_name, post_text, False)
            polarity = self.s_analyzer.format_sentiment_value(sent_value)
            counter[polarity] += 1
        return counter

    def _process_fb_comments(self, company_id, examined_date, used_dict_name='vader'):
        # Select all FB comments for the given company created on the given date.
        comments = self.dbmodel.get_daily_fb_comments(company_id, examined_date)
        counter = {'pos': 0, 'neu': 0, 'neg': 0}
        # Calculate sentiment for every comment and count the polarities.
        for com in comments:
            com_text = TextProcessing.process_facebook_text(com['text'])
            if len(com_text) == 0:
                continue    # skip empty comments
            sent_value = self.s_analyzer.calculate_vader_sentiment(used_dict_name, com_text, False)
            polarity = self.s_analyzer.format_sentiment_value(sent_value)
            counter[polarity] += 1
        return counter

    def _process_yahoo(self, company_id, examined_date, used_dict_name='vader'):
        # Select all Yahoo Finance articles for the given company published on the given date.
        articles = self.dbmodel.get_daily_articles(company_id, examined_date)
        counter = {'pos': 0, 'neu': 0, 'neg': 0}
        # Calculate sentiment for every article and count the polarities.
        for art in articles:
            art_text = TextProcessing.process_article_text(art['text'])
            if len(art_text) == 0:
                continue    # skip empty articles
            sent_value = self.s_analyzer.calculate_vader_sentiment(used_dict_name, art_text, True)
            polarity = self.s_analyzer.format_sentiment_value(sent_value)
            counter[polarity] += 1
        return counter

    def _process_tweets(self, company_id, examined_date, used_dict_name='vader'):
        # Select all tweets for the given company created on the given date.
        tweets = self.dbmodel.get_daily_tweets(company_id, examined_date)
        counter = {'pos': 0, 'neu': 0, 'neg': 0}
        # Calculate sentiment for every tweet and count the polarities.
        for tw in tweets:
            tw_text = TextProcessing.process_facebook_text(tw['text'])
            if len(tw_text) == 0:
                continue    # skip empty tweets
            sent_value = self.s_analyzer.calculate_vader_sentiment(used_dict_name, tw_text, False)
            polarity = self.s_analyzer.format_sentiment_value(sent_value)
            counter[polarity] += 1
        return counter

    ## PRIVATE methods for determining sentiment of the whole day

    def _calc_source_sentiment(self, s_dict):
        """
        Calculate sentiment for given source dictionary.

        :param s_dict: dictionary (sentiment -> number of documents)
        :return: string (pos, neg, neu)
        """
        max_s = max(s_dict.keys(), key=lambda k: s_dict[k])
        # If neutral value is also the biggest one, choose it.
        if s_dict['neu'] == s_dict[max_s]:
            return 'neu'
        return max_s
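
    # Illustration of the tie-break above (hypothetical counts): for
    # {'pos': 3, 'neu': 3, 'neg': 1} the maximum count is shared with 'neu',
    # so 'neu' is returned; for {'pos': 4, 'neu': 3, 'neg': 1} the result is 'pos'.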


    @staticmethod
    def _calc_overall_sentiment_for_day(max_sent, fb_p_values, fb_c_values, yahoo_values, tw_values):
        # Calculate the numeric sentiment of every source as its (positive - negative) document count.
        fb_p_sent = fb_p_values['pos'] - fb_p_values['neg']
        fb_c_sent = fb_c_values['pos'] - fb_c_values['neg']
        yahoo_sent = yahoo_values['pos'] - yahoo_values['neg']
        tw_sent = tw_values['pos'] - tw_values['neg']
        overall_sent = fb_p_sent + fb_c_sent + yahoo_sent + tw_sent
        # Is the new sentiment larger than the current largest one?
        if overall_sent > max_sent:
            max_sent = overall_sent
        return max_sent, overall_sent
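
    # Worked example (hypothetical counts): if the day has fb_posts {'pos': 2, 'neg': 1},
    # fb_comments {'pos': 1, 'neg': 1}, yahoo {'pos': 3, 'neg': 1} and tweets
    # {'pos': 0, 'neg': 1}, then overall_sent = 1 + 0 + 2 + (-1) = 2.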

    @staticmethod
    def _normalize_sentiment(score, alpha=100):
        """
        Normalize the score to be between -1 and 1 using an alpha that approximates the max expected value.
        """
        try:
            norm_score = score / math.sqrt((score * score) + alpha)
        except (ZeroDivisionError, ValueError):
            # alpha is passed in as the maximum observed daily sentiment, which can be
            # zero or negative; fall back to the raw score when the root is zero or undefined.
            norm_score = score
        return norm_score
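
    # Worked example: score = 5 with alpha = 100 gives 5 / sqrt(25 + 100) ~= 0.447,
    # which _format_sentiment below maps to 'pos'.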

    @staticmethod
    def _format_sentiment(norm_score):
        if -0.1 < norm_score < 0.1:
            return 'neu'
        elif norm_score > 0:
            return 'pos'
        elif norm_score < 0:
            return 'neg'

    @staticmethod
    def _get_days_stats_header():
        header_days = [
            'company_id', 'date',
            'fb_post_neutral', 'fb_post_positive', 'fb_post_negative',
            'fb_comment_neutral', 'fb_comment_positive', 'fb_comment_negative',
            'yahoo_neutral', 'yahoo_positive', 'yahoo_negative',
            'twitter_neutral', 'twitter_positive', 'twitter_negative',
            'stock_dir_-1', 'stock_dir_1', 'stock_dir_2', 'stock_dir_3',
            'sentiment_fb_post', 'sentiment_fb_comment', 'sentiment_yahoo', 'sentiment_twitter',
            'overall_sentiment',
        ]
        return header_days
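

# Example usage of DocumentsAnalyzer (illustrative sketch; the directory, dates,
# price type and boundary values below are assumptions made for demonstration,
# only the call signature comes from analyze_all_companies above).
if __name__ == '__main__':
    analyzer = DocumentsAnalyzer('output', verbose=True)
    analyzer.analyze_all_companies(
        from_date=datetime.date(2015, 1, 1),
        to_date=datetime.date(2015, 6, 30),
        file_name='econometric_data',
        price_type='close',             # assumed price-type identifier
        const_boundaries=(-0.5, 0.5),   # assumed boundaries for the 'constant' movement class
        used_dict_name='vader',
        classes_count=3,
    )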