def __init__(self):
    """Create the three collaborators this object delegates to.

    Construction order is validator, geolocation finder, then database
    handler. All constructor arguments are names defined outside this method.
    """
    # Validator is built from `pathToPickleFiles` (defined elsewhere in the file).
    self.validator = ValidatorClass(pathToPickleFiles)

    # The geolocation finder takes no arguments.
    self.geo_finder = GeolocationFinder()

    # Database connection settings are passed positionally: URL, port, user, password.
    self.database_handler = DatabaseHandler(
        dbURL,
        dbPort,
        dbUser,
        dbPasswd,
    )
except IOError: print("Error, failed to write to file: " + path + filename) def write_to_file(filename, res): try: f = open(filename, 'w') for l in res: f.write(l['text'] + "\n") f.close() except IOError: print("cannot open file: " + filename) print("Updating test data ... ") dbh = DatabaseHandler() res = dbh.get_tweets_with_sentiment('has flu') print("Positive file updated") write_to_file('positive.txt', res) res = dbh.get_tweets_with_sentiment('no flu') write_to_file('negative.txt', res) print("Negative file updated") # J is adjective, r is adverb, and v is verb allowed_word_types = ['J'] documents = [] all_words = [] try: positive_tweets = open("../classifiers/positive.txt", "r").read()
def setUp(self):
    """Create the handler under test and load the baseline fixture data."""
    # Original author's note: test_dbh sets up a db on localhost.
    handler = DatabaseHandler()
    self.test_dbh = handler

    # Populate the collections through the freshly created handler.
    self.setup_test_dbh_initial_contents()
class DataCollector(StreamListener):
    """Stream listener that validates incoming tweets and stores accepted ones.

    For every raw payload the listener extracts a few fields, validates the
    tweet text, optionally resolves the user's location to coordinates (and
    stores a map point), and writes the record to the database when the user
    language is English.
    """

    # Texts that must never be processed. 'invalid' is the value the original
    # guard compared against; 'invalid text' is the placeholder actually
    # produced by get_data_from_json_data when the payload has no 'text' key.
    _REJECTED_TEXTS = ('invalid', 'invalid text')

    def __init__(self):
        """Create the validator, geolocation finder and database handler."""
        # Initialise the listener base class so state it sets up in its own
        # __init__ exists on this instance as well.
        super(DataCollector, self).__init__()
        self.validator = ValidatorClass(pathToPickleFiles)
        self.geo_finder = GeolocationFinder()
        self.database_handler = DatabaseHandler(dbURL, dbPort, dbUser, dbPasswd)

    def on_data(self, raw_data):
        """
        When Listener detects a tweet with the keywords this method is called to handle the tweet.
        Sequence:
        - Load the json data
        - Validate tweet
        - Store if valid

        TypeError and ValueError raised anywhere in the sequence are logged and
        swallowed so that one bad payload does not propagate to the caller.

        :param raw_data: JSON document as a string
        :return: nothing
        """
        try:
            json_data = json.loads(raw_data)
            user_id, text, language, location, timestamp = \
                self.get_data_from_json_data(json_data)

            # Reject placeholder texts up front, then defer to the validator.
            if text in self._REJECTED_TEXTS:
                return
            if not self.validator.validate_text_from_tweet(text):
                return

            record = {'created': timestamp, 'user_language': language}

            if self._has_usable_location(location):
                address, latitude, longitude = self.geo_finder.get_location(
                    location)
                # Only store a map point when both coordinates were resolved.
                if self._is_real_coordinate(latitude) \
                        and self._is_real_coordinate(longitude):
                    self.add_to_record(address, latitude, longitude, record)
                    self.record_map_point(latitude, longitude, timestamp, text)

            if self.language_is_english(language):
                self.database_handler.write_english_tweet_to_database(record)
        except (TypeError, ValueError):
            logger.logging.exception('Error during on_data method')

    def _has_usable_location(self, location):
        """Return True when the location is present and accepted by the validator.

        The cheap None / 'None' checks run first so the validator is never
        handed a missing location.
        """
        if location is None or location == 'None':
            return False
        return bool(self.validator.validate_location(location))

    @staticmethod
    def _is_real_coordinate(value):
        """Return True unless the value is None or the string 'None'."""
        return value is not None and value != 'None'

    def language_is_english(self, language):
        """
        Checks if the language provided is english
        :param language: language code string
        :return boolean value True/False
        """
        return language in ('en', 'en-gb')

    def add_to_record(self, address, latitude, longitude, record):
        """
        Add location values to record which is a dictionary (mutated in place)
        :param address: value stored under 'address'
        :param latitude: value stored under 'latitude'
        :param longitude: value stored under 'longitude'
        :param record: dictionary
        :return: nothing
        """
        record['address'] = address
        record['latitude'] = latitude
        record['longitude'] = longitude

    def record_map_point(self, latitude, longitude, timestamp, text):
        """
        Creates a record (dictionary) for a map point and calls the database handler to store it
        :param latitude: value stored under 'lat'
        :param longitude: value stored under 'long'
        :param timestamp: string value for timestamp; converted with int() before storing
        :param text: string value for text
        :return: nothing
        """
        map_point_record = {
            'date': int(timestamp),
            'lat': latitude,
            'long': longitude,
            'text': text
        }
        self.database_handler.write_map_point_to_database(map_point_record)

    def get_data_from_json_data(self, json_data):
        """
        Extracts appropriate data from json data; if a KeyError occurs the
        attribute falls back to 'unknown', None or the 'invalid text' placeholder
        :param json_data: decoded tweet payload
        :return: user_id(string), text(string), user_language(string), location(string), timestamp(string)
        """
        try:
            user_id = json_data['user']['id_str']
        except KeyError:
            logger.logging.exception('KeyError while accessing user ID')
            user_id = 'unknown'

        try:
            user_language = json_data['user']['lang']
        except KeyError:
            logger.logging.exception('KeyError while accessing user language')
            user_language = 'unknown'

        try:
            location = json_data['user']['location']
        except KeyError:
            logger.logging.exception('KeyError while accessing user location')
            location = None

        try:
            text = json_data['text'].lower()
        except KeyError:
            # Placeholder text; on_data rejects it via _REJECTED_TEXTS.
            text = 'invalid text'
            logger.logging.exception('KeyError while accessing tweet text')

        # Timestamp is the time the tweet was picked up, not a field of the payload.
        timestamp = self.get_timestamp()
        return user_id, text, user_language, location, timestamp

    def get_timestamp(self):
        """
        Creates a timestamp for the current local date in YYYYMMDD string format
        :return: timestamp(string)
        """
        return datetime.datetime.now().strftime('%Y%m%d')

    def on_error(self, status_code):
        """
        Logs the status code reported by the stream
        :param status_code: status code passed in by the stream
        :return: nothing
        """
        # Use the same logging entry point as the rest of the class.
        logger.logging.error('Twitter Stream returned status code:' + str(status_code))
class DatabaseHandlerTests(unittest.TestCase):
    """Tests for DatabaseHandler that run against the database it connects to.

    Every test starts from the fixture written by setUp: six documents in
    ``map_points`` and six in ``english_tweets``. In each collection three of
    the six documents are dated 20160101.
    """

    def setUp(self):
        """Create the handler under test and rebuild the baseline fixture."""
        # Original author's note: test_dbh sets up a db on localhost.
        self.test_dbh = DatabaseHandler()
        self.setup_test_dbh_initial_contents()

    def setup_test_dbh_initial_contents(self):
        """Reset the two collections the tests read by default."""
        self.setup_map_points_collection()
        self.setup_english_tweets_collection()

    @staticmethod
    def _fixture_months():
        """Return the month digits of the fixture records, in insertion order.

        For each num in 1..3 the two-element tuple (1, num + 1) is visited
        (it is a tuple, not a range), which yields 1, 2, 1, 3, 1, 4: three
        records for month 1 and one record each for months 2, 3 and 4.
        """
        return [month for num in range(1, 4) for month in (1, num + 1)]

    @staticmethod
    def _english_tweet(created, **extra_fields):
        """Build an English tweet document; extra fields are appended last."""
        tweet = {
            'created': created,
            'user_language': 'en',
            'address': 'test address',
            'latitude': '000000',
            'longitude': '000000',
        }
        tweet.update(extra_fields)
        return tweet

    def _reload_tweet_collection(self, collection, language):
        """Empty a tweet collection and insert the six baseline tweets."""
        collection.remove()
        for month in self._fixture_months():
            collection.insert({
                'created': '20160' + str(month) + '01',
                'user_language': language,
                'address': 'test address',
                'latitude': '000000',
                'longitude': '000000',
            })

    def setup_map_points_collection(self):
        """Empty ``map_points`` and insert the six baseline map points."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        map_points = self.test_dbh.db.map_points
        map_points.remove()
        for month in self._fixture_months():
            month_digit = str(month)
            map_points.insert({
                'date': int('20160' + month_digit + '01'),
                'lat': '000000',
                'long': '000000',
                'text': "test_text" + month_digit,
            })

    def setup_english_tweets_collection(self):
        """Empty ``english_tweets`` and insert the six baseline 'en' tweets."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        self._reload_tweet_collection(self.test_dbh.db.english_tweets, 'en')

    def setup_non_english_tweets_collection(self):
        """Empty ``non_english_tweets`` and insert the six baseline 'pt' tweets."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        self._reload_tweet_collection(self.test_dbh.db.non_english_tweets, 'pt')

    def test_write_english_tweet_to_database_writes_record_to_test_db_english_tweet_table(self):
        """Writing a tweet adds one document that can be read back unchanged."""
        # Arrange
        baseline = 6
        tweet = self._english_tweet(20160102)
        # Execute
        self.test_dbh.write_english_tweet_to_database(tweet)
        # Check
        english_tweets = self.test_dbh.db.english_tweets
        self.assertEqual(baseline + 1, english_tweets.find().count())
        self.assertEqual(tweet, english_tweets.find_one({'created': 20160102}))

    def test_write_map_point_writes_record_to_test_db(self):
        """Writing a map point adds one document that can be read back unchanged."""
        # Arrange
        baseline = 6
        point = {
            'date': '20160102',
            'lat': '000000',
            'long': '000000',
            'text': "test_text entered by write_method",
        }
        # Execute
        self.test_dbh.write_map_point_to_database(point)
        # Check
        map_points = self.test_dbh.db.map_points
        self.assertEqual(baseline + 1, map_points.find().count())
        self.assertEqual(point, map_points.find_one({'date': '20160102'}))

    def test_get_map_points_for_five_dates_returns_points_for_five_days(self):
        """The 20160105/20160101 query returns the three points dated 20160101."""
        # Execute
        found = self.test_dbh.get_map_points_for_five_dates('20160105', '20160101')
        # Check
        self.assertEqual(3, found.count())

    def test_get_map_point_data_returns_points_within_defined_area(self):
        """Only the extra point inserted here is returned by the area/date query."""
        # Arrange
        self.test_dbh.db.map_points.insert({
            'date': int('20160601'),
            'lat': 53.3478,
            'long': 6.2597,
            'text': "test_text",
        })
        # Execute
        found = self.test_dbh.get_map_point_data('54', '52', '7', '5', '20160605', '20160531')
        # Check
        self.assertEqual(1, found.count())

    def test_get_uncategorised_tweet_from_english_collection_returns_uncategorised_tweet(self):
        """A tweet stored with sentiment 'unknown' is returned as uncategorised."""
        # Arrange
        self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        fetched = self.test_dbh.get_uncategorised_tweet_from_english_collection()
        # Check
        self.assertEqual('unknown', fetched['sentiment'])

    def test_update_document_sentiment_in_english_collection_updates_document(self):
        """Updating by id reports one modified document and stores the new sentiment."""
        # Arrange
        # insert() returns the id of the newly created record
        record_id = self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        modified_count = self.test_dbh.update_document_sentiment_in_english_collection(
            record_id, 'hasFlu', 'sample_text')
        # Retrieve the record again to check its sentiment
        stored = self.test_dbh.db.english_tweets.find_one({"_id": ObjectId(record_id)})
        # Check
        self.assertEqual(1, modified_count)
        self.assertEqual('hasFlu', stored['sentiment'])

    def test_get_tweets_with_sentiment_returns_appropriate_records(self):
        """Exactly the one tweet stored with sentiment 'unknown' is returned."""
        # Arrange
        self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        found = self.test_dbh.get_tweets_with_sentiment('unknown')
        # Check
        self.assertEqual(1, found.count())

    def test_get_total_count_returns_total_count(self):
        """The total count equals the six baseline records."""
        self.assertEqual(6, self.test_dbh.get_total_count())

    def test_get_today_count_return_todays_count(self):
        """Three baseline records are dated 20160101."""
        self.assertEqual(3, self.test_dbh.get_today_count('20160101'))

    def test_get_yearly_count_returns_year_count(self):
        """All six baseline records fall in 2016."""
        self.assertEqual(6, self.test_dbh.get_yearly_count('2016'))

    def test_get_month_count_returns_month_count(self):
        """Three baseline records fall in 201601."""
        self.assertEqual(3, self.test_dbh.get_month_count('201601'))

    def test_get_count_for_time_period_returns_correct_count_for_time_period(self):
        """A period starting and ending on 20160101 counts three records."""
        counted = self.test_dbh.get_count_for_time_period('20160101', '20160101')
        self.assertEqual(3, counted)

    def test_get_instance_count_for_each_week_of_this_year_returns_dict_containing_correct_counts(self):
        """A single supplied week range produces a one-entry ordered result."""
        # Arrange
        week_ranges = {
            'week0': {'start_date': '20151228', 'end_date': '20160103'},
        }
        expected = collections.OrderedDict([('0', 3)])
        # Execute
        counts = self.test_dbh.get_instance_count_for_each_week_of_this_year(week_ranges)
        # Check
        self.assertEqual(expected, counts)
class DatabaseHandlerTests(unittest.TestCase):
    """Tests for DatabaseHandler that run against the database it connects to.

    Every test starts from the fixture written by setUp: six documents in
    ``map_points`` and six in ``english_tweets``. In each collection three of
    the six documents are dated 20160101.
    """

    def setUp(self):
        """Create the handler under test and rebuild the baseline fixture."""
        # Original author's note: test_dbh sets up a db on localhost.
        self.test_dbh = DatabaseHandler()
        self.setup_test_dbh_initial_contents()

    def setup_test_dbh_initial_contents(self):
        """Reset the two collections the tests read by default."""
        self.setup_map_points_collection()
        self.setup_english_tweets_collection()

    @staticmethod
    def _fixture_months():
        """Return the month digits of the fixture records, in insertion order.

        For each num in 1..3 the two-element tuple (1, num + 1) is visited
        (it is a tuple, not a range), which yields 1, 2, 1, 3, 1, 4: three
        records for month 1 and one record each for months 2, 3 and 4.
        """
        return [month for num in range(1, 4) for month in (1, num + 1)]

    @staticmethod
    def _english_tweet(created, **extra_fields):
        """Build an English tweet document; extra fields are appended last."""
        tweet = {
            'created': created,
            'user_language': 'en',
            'address': 'test address',
            'latitude': '000000',
            'longitude': '000000',
        }
        tweet.update(extra_fields)
        return tweet

    def _reload_tweet_collection(self, collection, language):
        """Empty a tweet collection and insert the six baseline tweets."""
        collection.remove()
        for month in self._fixture_months():
            collection.insert({
                'created': '20160' + str(month) + '01',
                'user_language': language,
                'address': 'test address',
                'latitude': '000000',
                'longitude': '000000',
            })

    def setup_map_points_collection(self):
        """Empty ``map_points`` and insert the six baseline map points."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        map_points = self.test_dbh.db.map_points
        map_points.remove()
        for month in self._fixture_months():
            month_digit = str(month)
            map_points.insert({
                'date': int('20160' + month_digit + '01'),
                'lat': '000000',
                'long': '000000',
                'text': "test_text" + month_digit,
            })

    def setup_english_tweets_collection(self):
        """Empty ``english_tweets`` and insert the six baseline 'en' tweets."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        self._reload_tweet_collection(self.test_dbh.db.english_tweets, 'en')

    def setup_non_english_tweets_collection(self):
        """Empty ``non_english_tweets`` and insert the six baseline 'pt' tweets."""
        # CAUTION: this wipes the collection - check that the DB is not the production DB.
        self._reload_tweet_collection(self.test_dbh.db.non_english_tweets, 'pt')

    def test_write_english_tweet_to_database_writes_record_to_test_db_english_tweet_table(self):
        """Writing a tweet adds one document that can be read back unchanged."""
        # Arrange
        baseline = 6
        tweet = self._english_tweet(20160102)
        # Execute
        self.test_dbh.write_english_tweet_to_database(tweet)
        # Check
        english_tweets = self.test_dbh.db.english_tweets
        self.assertEqual(baseline + 1, english_tweets.find().count())
        self.assertEqual(tweet, english_tweets.find_one({'created': 20160102}))

    def test_write_map_point_writes_record_to_test_db(self):
        """Writing a map point adds one document that can be read back unchanged."""
        # Arrange
        baseline = 6
        point = {
            'date': '20160102',
            'lat': '000000',
            'long': '000000',
            'text': "test_text entered by write_method",
        }
        # Execute
        self.test_dbh.write_map_point_to_database(point)
        # Check
        map_points = self.test_dbh.db.map_points
        self.assertEqual(baseline + 1, map_points.find().count())
        self.assertEqual(point, map_points.find_one({'date': '20160102'}))

    def test_get_map_points_for_five_dates_returns_points_for_five_days(self):
        """The 20160105/20160101 query returns the three points dated 20160101."""
        # Execute
        found = self.test_dbh.get_map_points_for_five_dates('20160105', '20160101')
        # Check
        self.assertEqual(3, found.count())

    def test_get_map_point_data_returns_points_within_defined_area(self):
        """Only the extra point inserted here is returned by the area/date query."""
        # Arrange
        self.test_dbh.db.map_points.insert({
            'date': int('20160601'),
            'lat': 53.3478,
            'long': 6.2597,
            'text': "test_text",
        })
        # Execute
        found = self.test_dbh.get_map_point_data('54', '52', '7', '5', '20160605', '20160531')
        # Check
        self.assertEqual(1, found.count())

    def test_get_uncategorised_tweet_from_english_collection_returns_uncategorised_tweet(self):
        """A tweet stored with sentiment 'unknown' is returned as uncategorised."""
        # Arrange
        self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        fetched = self.test_dbh.get_uncategorised_tweet_from_english_collection()
        # Check
        self.assertEqual('unknown', fetched['sentiment'])

    def test_update_document_sentiment_in_english_collection_updates_document(self):
        """Updating by id reports one modified document and stores the new sentiment."""
        # Arrange
        # insert() returns the id of the newly created record
        record_id = self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        modified_count = self.test_dbh.update_document_sentiment_in_english_collection(
            record_id, 'hasFlu', 'sample_text')
        # Retrieve the record again to check its sentiment
        stored = self.test_dbh.db.english_tweets.find_one({"_id": ObjectId(record_id)})
        # Check
        self.assertEqual(1, modified_count)
        self.assertEqual('hasFlu', stored['sentiment'])

    def test_get_tweets_with_sentiment_returns_appropriate_records(self):
        """Exactly the one tweet stored with sentiment 'unknown' is returned."""
        # Arrange
        self.test_dbh.db.english_tweets.insert(
            self._english_tweet('20160101', sentiment='unknown'))
        # Execute
        found = self.test_dbh.get_tweets_with_sentiment('unknown')
        # Check
        self.assertEqual(1, found.count())

    def test_get_total_count_returns_total_count(self):
        """The total count equals the six baseline records."""
        self.assertEqual(6, self.test_dbh.get_total_count())

    def test_get_today_count_return_todays_count(self):
        """Three baseline records are dated 20160101."""
        self.assertEqual(3, self.test_dbh.get_today_count('20160101'))

    def test_get_yearly_count_returns_year_count(self):
        """All six baseline records fall in 2016."""
        self.assertEqual(6, self.test_dbh.get_yearly_count('2016'))

    def test_get_month_count_returns_month_count(self):
        """Three baseline records fall in 201601."""
        self.assertEqual(3, self.test_dbh.get_month_count('201601'))

    def test_get_count_for_time_period_returns_correct_count_for_time_period(self):
        """A period starting and ending on 20160101 counts three records."""
        counted = self.test_dbh.get_count_for_time_period('20160101', '20160101')
        self.assertEqual(3, counted)

    def test_get_instance_count_for_each_week_of_this_year_returns_dict_containing_correct_counts(self):
        """A single supplied week range produces a one-entry ordered result."""
        # Arrange
        week_ranges = {
            'week0': {'start_date': '20151228', 'end_date': '20160103'},
        }
        expected = collections.OrderedDict([('0', 3)])
        # Execute
        counts = self.test_dbh.get_instance_count_for_each_week_of_this_year(week_ranges)
        # Check
        self.assertEqual(expected, counts)
def __init__(self):
    """Create the three collaborators this object delegates to.

    Construction order is validator, geolocation finder, then database
    handler. All constructor arguments are names defined outside this method.
    """
    # Validator is built from `pathToPickleFiles` (defined elsewhere in the file).
    self.validator = ValidatorClass(pathToPickleFiles)

    # The geolocation finder takes no arguments.
    self.geo_finder = GeolocationFinder()

    # Database connection settings are passed positionally: URL, port, user, password.
    self.database_handler = DatabaseHandler(
        dbURL,
        dbPort,
        dbUser,
        dbPasswd,
    )
class DataCollector(StreamListener):
    """Stream listener that validates incoming tweets and stores accepted ones.

    For every raw payload the listener extracts a few fields, validates the
    tweet text, optionally resolves the user's location to coordinates (and
    stores a map point), and writes the record to the database when the user
    language is English.
    """

    # Texts that must never be processed. 'invalid' is the value the original
    # guard compared against; 'invalid text' is the placeholder actually
    # produced by get_data_from_json_data when the payload has no 'text' key.
    _REJECTED_TEXTS = ('invalid', 'invalid text')

    def __init__(self):
        """Create the validator, geolocation finder and database handler."""
        # Initialise the listener base class so state it sets up in its own
        # __init__ exists on this instance as well.
        super(DataCollector, self).__init__()
        self.validator = ValidatorClass(pathToPickleFiles)
        self.geo_finder = GeolocationFinder()
        self.database_handler = DatabaseHandler(dbURL, dbPort, dbUser, dbPasswd)

    def on_data(self, raw_data):
        """
        When Listener detects a tweet with the keywords this method is called to handle the tweet.
        Sequence:
        - Load the json data
        - Validate tweet
        - Store if valid

        TypeError and ValueError raised anywhere in the sequence are logged and
        swallowed so that one bad payload does not propagate to the caller.

        :param raw_data: JSON document as a string
        :return: nothing
        """
        try:
            json_data = json.loads(raw_data)
            user_id, text, language, location, timestamp = \
                self.get_data_from_json_data(json_data)

            # Reject placeholder texts up front, then defer to the validator.
            if text in self._REJECTED_TEXTS:
                return
            if not self.validator.validate_text_from_tweet(text):
                return

            record = {'created': timestamp, 'user_language': language}

            if self._has_usable_location(location):
                address, latitude, longitude = self.geo_finder.get_location(
                    location)
                # Only store a map point when both coordinates were resolved.
                if self._is_real_coordinate(latitude) \
                        and self._is_real_coordinate(longitude):
                    self.add_to_record(address, latitude, longitude, record)
                    self.record_map_point(latitude, longitude, timestamp, text)

            if self.language_is_english(language):
                self.database_handler.write_english_tweet_to_database(record)
        except (TypeError, ValueError):
            logger.logging.exception('Error during on_data method')

    def _has_usable_location(self, location):
        """Return True when the location is present and accepted by the validator.

        The cheap None / 'None' checks run first so the validator is never
        handed a missing location.
        """
        if location is None or location == 'None':
            return False
        return bool(self.validator.validate_location(location))

    @staticmethod
    def _is_real_coordinate(value):
        """Return True unless the value is None or the string 'None'."""
        return value is not None and value != 'None'

    def language_is_english(self, language):
        """
        Checks if the language provided is english
        :param language: language code string
        :return boolean value True/False
        """
        return language in ('en', 'en-gb')

    def add_to_record(self, address, latitude, longitude, record):
        """
        Add location values to record which is a dictionary (mutated in place)
        :param address: value stored under 'address'
        :param latitude: value stored under 'latitude'
        :param longitude: value stored under 'longitude'
        :param record: dictionary
        :return: nothing
        """
        record['address'] = address
        record['latitude'] = latitude
        record['longitude'] = longitude

    def record_map_point(self, latitude, longitude, timestamp, text):
        """
        Creates a record (dictionary) for a map point and calls the database handler to store it
        :param latitude: value stored under 'lat'
        :param longitude: value stored under 'long'
        :param timestamp: string value for timestamp; converted with int() before storing
        :param text: string value for text
        :return: nothing
        """
        map_point_record = {
            'date': int(timestamp),
            'lat': latitude,
            'long': longitude,
            'text': text
        }
        self.database_handler.write_map_point_to_database(map_point_record)

    def get_data_from_json_data(self, json_data):
        """
        Extracts appropriate data from json data; if a KeyError occurs the
        attribute falls back to 'unknown', None or the 'invalid text' placeholder
        :param json_data: decoded tweet payload
        :return: user_id(string), text(string), user_language(string), location(string), timestamp(string)
        """
        try:
            user_id = json_data['user']['id_str']
        except KeyError:
            logger.logging.exception('KeyError while accessing user ID')
            user_id = 'unknown'

        try:
            user_language = json_data['user']['lang']
        except KeyError:
            logger.logging.exception('KeyError while accessing user language')
            user_language = 'unknown'

        try:
            location = json_data['user']['location']
        except KeyError:
            logger.logging.exception('KeyError while accessing user location')
            location = None

        try:
            text = json_data['text'].lower()
        except KeyError:
            # Placeholder text; on_data rejects it via _REJECTED_TEXTS.
            text = 'invalid text'
            logger.logging.exception('KeyError while accessing tweet text')

        # Timestamp is the time the tweet was picked up, not a field of the payload.
        timestamp = self.get_timestamp()
        return user_id, text, user_language, location, timestamp

    def get_timestamp(self):
        """
        Creates a timestamp for the current local date in YYYYMMDD string format
        :return: timestamp(string)
        """
        return datetime.datetime.now().strftime('%Y%m%d')

    def on_error(self, status_code):
        """
        Logs the status code reported by the stream
        :param status_code: status code passed in by the stream
        :return: nothing
        """
        # Use the same logging entry point as the rest of the class.
        logger.logging.error('Twitter Stream returned status code:' + str(status_code))
# Author: David Dunne, Student Number: C00173649, Created Nov 2015
import os

from flask import Flask, render_template, request, jsonify, make_response
from utilities.database_handler import DatabaseHandler
from utilities import email_sender
from datetime import date, datetime, timedelta

app = Flask(__name__)

# Database connection settings. Each value can be overridden through an
# environment variable so credentials do not have to live in source control;
# the fallbacks are the values that were previously hard-coded, so behaviour
# is unchanged when the variables are not set.
# NOTE(review): the FLUTRACK_DB_* variable names are introduced here - confirm
# they fit the deployment's naming before relying on them.
database_handler = DatabaseHandler(
    os.environ.get('FLUTRACK_DB_HOST', 'ds061335.mongolab.com'),
    int(os.environ.get('FLUTRACK_DB_PORT', 61335)),
    os.environ.get('FLUTRACK_DB_USER', 'flutrackapp'),
    os.environ.get('FLUTRACK_DB_PASSWORD', 'flutrackapp'),
)


@app.route('/', methods=['GET'])
def default_page():
    """
    Renders web interface for flu-TrakR
    :return: rendered html page (template 'home.html')
    """
    return render_template('home.html')


@app.route('/categorise', methods=['GET'])
def categorise():
    """
    Used to render form for labelling sentiment of tweets for purposes of developing a training set
    :return: rendered html page containing web form (template 'dataCategorisor.html')
    """
    return render_template('dataCategorisor.html')
pickle.dump(object, file) file.close() except IOError: print("Error, failed to write to file: " + path + filename) def write_to_file(filename, res): try: f = open(filename, 'w') for l in res: f.write(l['text'] + "\n") f.close() except IOError: print("cannot open file: " + filename) print("Updating test data ... ") dbh = DatabaseHandler() res = dbh.get_tweets_with_sentiment('has flu') print("Positive file updated") write_to_file('positive.txt', res) res = dbh.get_tweets_with_sentiment('no flu') write_to_file('negative.txt', res) print("Negative file updated") # J is adjective, r is adverb, and v is verb allowed_word_types = ['J'] documents = []