示例#1
0
    def __init__(self, config_file=DEFAULT_CONFIG_FILE):
        """Load the configuration and initialise counters, helpers and logging.

        :param config_file: value handed to ``ConfigReader``; the resulting
            config supplies ``cache_file_path`` and ``logging_level``.
        :raises KeyError: if the configured ``logging_level`` is not one of
            the names in the lookup table below.
        """
        # Level names accepted in the configuration -> ``logging`` constants.
        levels_by_name = {
            'DEBUG': logging.DEBUG,
            'INFO': logging.INFO,
            'WARN': logging.WARNING,
            'ERROR': logging.ERROR,
            'FATAL': logging.FATAL,
            'CRITICAL': logging.CRITICAL
        }

        self.config = ConfigReader(config_file)
        self.states = {}
        self.geo_locator = Nominatim()
        self.tweet_count = 0
        self.city_cache_appender = CacheAppender(self.config.cache_file_path)

        # Root logging is configured here, before this object's logger is made.
        configured_level = levels_by_name[self.config.logging_level]
        logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s",
                            level=configured_level)

        logger_name = self.__class__.__name__
        self.logger = logging.getLogger(logger_name)
        self.logger.info("Analysing city names using config in %s" % config_file)
示例#2
0
class StatesTweetCount:
    """Aggregate per-city tweet counts into per-state totals and plot them.

    City/count records are read from the partition files named by the
    configuration. Each city is mapped to an entry of ``states_in_india`` or
    ``union_territories`` (via the module-level ``cached_cities`` mapping
    when possible, otherwise via the geocoder), and the resulting totals are
    handed to ``PieChartPlotter``.
    """

    # Level names accepted in the configuration -> ``logging`` constants.
    _LOG_LEVELS = {
        'DEBUG': logging.DEBUG,
        'INFO': logging.INFO,
        'WARN': logging.WARNING,
        'ERROR': logging.ERROR,
        'FATAL': logging.FATAL,
        'CRITICAL': logging.CRITICAL
    }

    def __init__(self, config_file=DEFAULT_CONFIG_FILE):
        """Load the configuration and initialise counters, helpers and logging.

        :param config_file: value handed to ``ConfigReader``.
        :raises KeyError: if the configured ``logging_level`` is not one of
            the names in ``_LOG_LEVELS``.
        """
        self.config = ConfigReader(config_file)
        self.states = {}
        self.geo_locator = Nominatim()
        self.tweet_count = 0
        self.city_cache_appender = CacheAppender(self.config.cache_file_path)

        logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s",
                            level=self._LOG_LEVELS[self.config.logging_level])

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info("Analysing city names using config in %s", config_file)

    def increment_tweet_count(self, state_name, count):
        """Add ``count`` to the overall total and to ``state_name``'s total."""
        self.tweet_count += count
        self.states[state_name] = self.states.get(state_name, 0) + count

    def get_full_address(self, city_name):
        """Return an address string for ``city_name``, or ``None`` if unknown.

        The geocoder is queried first; when it returns nothing, the
        module-level ``unmatched_cities`` table is consulted.
        """
        # Append "India" to CITY_NAME for accurate results
        location = self.geo_locator.geocode("%s India" % city_name,
                                            timeout=self.config.geopy_timeout)
        if location is not None:
            return location.address
        try:
            return unmatched_cities[city_name]
        except KeyError:
            # A single unknown city must not abort the whole run.
            self.logger.error("[GEOPY] No address found for city: %s", city_name)
            return None

    def search_states(self, address):
        """Return the first entry of ``states_in_india`` found in ``address``.

        Returns ``None`` when no entry is a substring of ``address``.
        """
        return next((state for state in states_in_india if state in address),
                    None)

    def search_union_territories(self, address):
        """Return the first entry of ``union_territories`` found in ``address``.

        Returns ``None`` when no entry is a substring of ``address``.
        """
        return next((ut for ut in union_territories if ut in address), None)

    def process_address_tweet(self, address):
        """Map ``address`` to a state or union territory name.

        States are searched before union territories. When ``address`` is
        ``None`` or nothing matches, an error is logged and ``None`` is
        returned.
        """
        if address is not None:
            match = self.search_states(address)
            if match is None:
                match = self.search_union_territories(address)
            if match is not None:
                return match
        self.logger.error("[GEOPY] Address not found: %s", address)
        return None

    def append_to_cache(self, city, state):
        """Record the city/state pair if ``update_cache_file`` is enabled."""
        if self.config.update_cache_file:
            self.city_cache_appender.append_cached_data(city, state)

    def read_tweet_cities(self, file_name):
        """Read all partition files and accumulate tweet counts per state.

        Each non-blank line is expected to look like ``(<city>,<count>)``;
        the text after the last comma is parsed as the integer count.

        NOTE: ``file_name`` is accepted but not used; the partition file
        pattern is always taken from ``self.config.spark_file_name``.

        :raises ValueError: if a non-blank line has no comma or its count is
            not an integer.
        """
        tuples = []
        for part in range(self.config.spark_num_partitions):
            with open(self.config.spark_file_name % part) as spark_file:
                tuples.extend(spark_file.readlines())

        self.logger.info("Processing cities: %s", tuples)
        for city_tuple in tuples:
            record = city_tuple[1:].strip("(|)\n")
            if not record.strip():
                # Blank line: nothing to parse.
                continue
            self.logger.debug("Processing city tuple: %s", city_tuple.rstrip())
            # Split on the LAST comma so city names containing commas survive.
            city_name, tweet_count = record.rsplit(",", 1)
            count = int(tweet_count)

            from_cache = city_name in cached_cities
            if from_cache:
                state = cached_cities[city_name]
            else:
                state = self.process_address_tweet(
                    self.get_full_address(city_name))

            self.logger.debug("Mapped city: %s to state: %s", city_name, state)
            # Unresolved cities (state is None) are tallied under the None key.
            self.increment_tweet_count(state, count)
            if not from_cache:
                self.append_to_cache(city_name, state)

    def run(self):
        """Process the tweet files, log per-state totals and plot the result.

        The cache file is appended in all cases (even on failure) when
        ``update_cache_file`` is enabled.
        """
        try:
            self.read_tweet_cities(self.config.spark_file_name)
            if self.logger.isEnabledFor(logging.INFO):
                total = self.tweet_count
                for state, num_tweets in self.states.items():
                    # Guard against a zero total (e.g. every count was 0).
                    percentage = (num_tweets * 100.0) / total if total else 0.0
                    self.logger.info("[%s] Tweets: %s, Percentage: %s",
                                     state, num_tweets, percentage)

            # Materialise as lists so the plotter never receives dict views.
            PieChartPlotter(list(self.states.keys()),
                            list(self.states.values()),
                            self.config.plotly_username,
                            self.config.plotly_api_key,
                            self.config.plotly_plot_name)
        finally:
            if self.config.update_cache_file:
                self.city_cache_appender.append_cache_file()