def return_joined_table(tablename, metadata):
    """Join the named visits table with places and demographics.

    Returns an aliased selectable named "consolidated"; joined columns
    are exposed with table-name prefixes (e.g. "<tablename>_userid").
    """
    visits = get_table(tablename, metadata)
    places = get_table("places", metadata)
    demographics = get_table("demographics", metadata)
    # Visits belong to a place via the (userid, placeid) pair.
    place_match = and_(places.c.userid == visits.c.userid,
                       places.c.placeid == visits.c.placeid)
    visits_joined = places.join(visits, onclause=place_match).alias("visits_joined")
    # Attach demographics through the visit table's prefixed userid column.
    consolidated = visits_joined.join(
        demographics,
        visits_joined.c[tablename + "_userid"] == demographics.c.userid,
    ).alias("consolidated")
    return consolidated
def return_joined_table(tablename, metadata):
    """Build the places -> visits -> demographics join for `tablename`."""
    visits = get_table(tablename, metadata)
    places = get_table("places", metadata)
    demographics = get_table("demographics", metadata)
    visits_joined = places.join(
        visits,
        onclause=and_(
            places.c.userid == visits.c.userid,
            places.c.placeid == visits.c.placeid,
        ),
    ).alias("visits_joined")
    # The join exposes the visit table's userid under a prefixed key.
    userid_col = visits_joined.c[tablename + "_userid"]
    return visits_joined.join(
        demographics, userid_col == demographics.c.userid
    ).alias("consolidated")
def wlan_frequency_and_variety(place, user):
    """WLAN features for a user's visits to a place.

    Returns:
        (freq, variation): average number of distinct WLAN networks seen
        per visit, and the mean pairwise overlap (intersection/union of
        network sets) over all ordered pairs of visits; variation is 0
        when fewer than two visits produced networks.
    """
    import itertools  # hoisted: the original imported this inside a branch

    wlanrelation = get_table("wlanrelation", metadata)
    visit_times = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times).fetchall()]
    # Robustness fix: the original raised ZeroDivisionError when the user
    # had no visits to this place.
    if len(visit_times) == 0:
        return 0, 0
    # (Removed unused locals `average_freq` and `applications` from the
    # original, apparently left over from a copy-paste.)
    bluetooth_visits = []
    bluetooth_count = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'wlan'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records
                         if r[1] >= start and r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        distinct_bluetooths = [r[0] for r in connection.execute(
            select([wlanrelation.c.id_wnetworks]).where(
                wlanrelation.c.db_key.in_(
                    filtered_keys)).distinct()).fetchall()]
        if len(distinct_bluetooths) == 0:
            continue
        bluetooth_count += len(distinct_bluetooths)
        bluetooth_visits.append(distinct_bluetooths)
    if len(bluetooth_visits) < 2:
        variation = 0
    else:
        count = 0.0
        variation = 0.0
        # Average the overlap ratio over all ordered pairs of visits.
        for a, b in itertools.permutations(bluetooth_visits, 2):
            variation += intersect_len(a, b) / union_len(a, b)
            count += 1
        variation /= count
    return bluetooth_count / len(visit_times), variation
def charging_and_silent_frequency(place, user):
    """Per-visit average fractions of 'sys' records showing the phone
    charging and in silent ring mode while at the given place.

    Returns:
        (charging_freq, silent_freq); (0, 0) when the user has no 'sys'
        records or never visited the place.
    """
    # Renamed from `sys` to avoid shadowing the stdlib module name
    # (this file imports `sys` elsewhere).
    sys_table = get_table("sys", metadata)
    visit_times = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times).fetchall()]
    charging = silent = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'sys'))).fetchall()]
    if len(user_records) == 0:
        return 0, 0
    num_visits = len(visit_times)
    # Robustness fix: the original divided by zero when the user had sys
    # records but no visits to this place.
    if num_visits == 0:
        return 0, 0
    for start, end in visit_times:
        filtered = [r[0] for r in user_records
                    if r[1] >= start and r[1] <= end]
        if len(filtered) == 0:
            continue
        l = len(filtered) * 1.0
        charging += (connection.execute(
            select([func.count(sys_table.c.db_key)]).where(
                and_(sys_table.c.charging != 0,
                     sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / l
        silent += (connection.execute(
            select([func.count(sys_table.c.db_key)]).where(
                and_(sys_table.c.ring == 'silent',
                     sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / l
    return (charging / num_visits, silent / num_visits)
def application_frequency_and_variety(place, user):
    """Application-usage features for a user's visits to a place.

    Returns:
        (avg_freq, variety): average app-usage records per hour of visit
        time, and the number of distinct application uids observed.
        Returns (0, 0) when the user never visited the place (the
        original raised ZeroDivisionError in that case).
    """
    application = get_table("application", metadata)
    visit_times = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times).fetchall()]
    # Robustness fix: avoid dividing by len(visit_times) == 0 below.
    if len(visit_times) == 0:
        return 0, 0
    average_freq = 0.0
    applications = set()
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'app'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records
                         if r[1] >= start and r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        # Usage count per application uid within this visit window.
        # (Renamed from the original's copy-pasted `calendar_entry_for_user`.)
        usage_per_app = select([
            func.count(application.c.uid), application.c.uid
        ]).where(application.c.db_key.in_(filtered_keys)).group_by(
            application.c.uid)
        times_used = 0.0
        for r, uid in connection.execute(usage_per_app).fetchall():
            times_used += r
            applications.add(uid)
        # Normalise by the visit duration in hours.
        average_freq += (times_used / ((end - start) / 3600.0))
    return average_freq / len(visit_times), len(applications)
def extract_visits(places_location, label_id, restrict_to, attribute="label"):
    """Fetch (latitude, longitude) pairs for visits matching `label_id`.

    `attribute` picks the predictor: a place label, a 2-hour time slot
    index, or a demographics column name. Results are limited to place
    ids listed in `restrict_to`.
    """
    if not restrict_to:
        return []
    visits_10min = get_table("visits_10min", metadata)
    joined = places_location.join(
        visits_10min,
        onclause=and_(
            places_location.c.userid == visits_10min.c.userid,
            places_location.c.placeid == visits_10min.c.placeid))
    if attribute == "label":
        query = select([
            joined.c.places_location_latitude,
            joined.c.places_location_longitude,
        ]).where(
            joined.c.places_location_place_label_int == label_id
        ).where(joined.c.places_location_id.in_(restrict_to))
    elif attribute == "time":
        # Keep visits whose span lies inside the 2-hour slot
        # [2*label_id, 2*label_id + 1].
        query = select([
            places_location.c.latitude,
            places_location.c.longitude,
        ]).where(
            and_(
                extract("hour", func.to_timestamp(
                    visits_10min.c.time_start)) >= (2 * label_id),
                extract("hour", func.to_timestamp(
                    visits_10min.c.time_end)) <= (2 * label_id + 1))
        ).where(places_location.c.id.in_(restrict_to)).select_from(joined)
    else:
        demographics = get_table("demographics", metadata)
        with_demo = demographics.join(
            joined,
            joined.c.places_location_userid == demographics.c.userid)
        query = select([
            joined.c.places_location_latitude,
            joined.c.places_location_longitude,
        ]).where(demographics.c[attribute] == label_id).where(
            places_location.c.id.in_(restrict_to)).select_from(with_demo)
    rows = connection.execute(query).fetchall()
    return [(float(r[0]), float(r[1])) for r in rows]
def plot_demographics(): metadata, connection = setup_database() demographics = get_table("demographics", metadata) gender_query = select([demographics.c.gender, func.count(demographics.c.gender)]).group_by(demographics.c.gender) result = connection.execute(gender_query).fetchall() result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [GENDER_MAPPING[r[0]] for r in result] filename = "gender.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Gender", ylabel="Count", title="Gender Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) age_query = select([demographics.c.age_group, func.count(demographics.c.age_group)]).group_by(demographics.c.age_group) result = connection.execute(age_query).fetchall() result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [AGE_MAPPING[r[0]] for r in result] filename = "age.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Age Group", ylabel="Count", title="Age Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) working_query = select([demographics.c.working, func.count(demographics.c.working)]).group_by(demographics.c.working) result = connection.execute(working_query).fetchall() print result result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [WORKING_MAPPING[r[0]] for r in result] filename = "working.png" draw_barplot(vals, x_ticks=[textwrap.fill(text,10) for text in x_ticks], xlabel="Employment Status", ylabel="Count", title="Employment Status Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) bill_query = select([demographics.c.phone_bill, func.count(demographics.c.phone_bill)]).group_by(demographics.c.phone_bill) result = connection.execute(bill_query).fetchall() result = [r for r in result if 
r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [BILL_MAPPING[r[0]] for r in result] filename = "bill.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Bill", ylabel="Count", title="Bill Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) bill_query = select([demographics.c.nb_12, demographics.c.nb_12_18, demographics.c.nb_18_30, demographics.c.nb_30_40, demographics.c.nb_40_50, demographics.c.nb_50_65, demographics.c.nb_65]) result = connection.execute(bill_query).fetchall() result = [sum([a for a in r if a is not None]) for r in result if r is not None] s = set(result) print s vals = [] x_ticks = [] for elem in s: if elem > 13: continue x_ticks.append(elem) vals.append(result.count(elem)) #vals = [r[1] for r in result] #x_ticks = [BILL_MAPPING[r[0]] for r in result] filename = "family.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Number of members in family", ylabel="Count", title="Number of Family Members Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)
def calendar_time_frequency(place, user):
    """Count visits that have a calendar record within 300s of the visit start."""
    calendar = get_table("calendar", metadata)
    starts_query = select([visits_10min.c.time_start]).where(
        and_(visits_10min.c.userid == user,
             visits_10min.c.placeid == place))
    starts = [row[0] for row in connection.execute(starts_query).fetchall()]
    freq = 0.0
    for t in starts:
        # Any calendar record within +/- 5 minutes of the visit start?
        window = select([records.c.db_key]).where(
            and_(records.c.userid == user,
                 records.c.time >= t - 300,
                 records.c.time <= t + 300,
                 records.c.type == 'calendar')).alias("user_records")
        if connection.execute(window).rowcount != 0:
            freq += 1
    return freq
def extract_visits(places_location, label_id, restrict_to, attribute='label'):
    """Return visit coordinates as (lat, lon) floats for the given predictor value."""
    if len(restrict_to) == 0:
        return []
    visits_10min = get_table("visits_10min", metadata)
    visit_join = places_location.join(
        visits_10min,
        onclause=and_(places_location.c.userid == visits_10min.c.userid,
                      places_location.c.placeid == visits_10min.c.placeid))
    if attribute == 'label':
        query = (select([visit_join.c.places_location_latitude,
                         visit_join.c.places_location_longitude])
                 .where(visit_join.c.places_location_place_label_int == label_id)
                 .where(visit_join.c.places_location_id.in_(restrict_to)))
    elif attribute == 'time':
        # Visits falling inside the two-hour slot indexed by label_id.
        start_hour = extract('hour',
                             func.to_timestamp(visits_10min.c.time_start))
        end_hour = extract('hour',
                           func.to_timestamp(visits_10min.c.time_end))
        query = (select([places_location.c.latitude,
                         places_location.c.longitude])
                 .where(and_(start_hour >= (2 * label_id),
                             end_hour <= (2 * label_id + 1)))
                 .where(places_location.c.id.in_(restrict_to))
                 .select_from(visit_join))
    else:
        demographics = get_table('demographics', metadata)
        demo_join = demographics.join(
            visit_join,
            visit_join.c.places_location_userid == demographics.c.userid)
        query = (select([visit_join.c.places_location_latitude,
                         visit_join.c.places_location_longitude])
                 .where(demographics.c[attribute] == label_id)
                 .where(places_location.c.id.in_(restrict_to))
                 .select_from(demo_join))
    return [(float(row[0]), float(row[1]))
            for row in connection.execute(query).fetchall()]
def media_usage_frequency(place, user):
    """Total media-play records per hour of stay, summed over visits."""
    mediaplay = get_table("mediaplay", metadata)
    window_query = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    windows = [(row[0], row[1])
               for row in connection.execute(window_query).fetchall()]
    total = 0.0
    media_records = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'media_play'))).fetchall()]
    if not media_records:
        return 0
    for start, end in windows:
        hits = [key for key, ts in media_records if start <= ts <= end]
        # Normalise the count by the visit duration in hours.
        total += len(hits) / ((end - start) / 3600.0)
    return total
def calendar_time_frequency(place, user):
    """Number of visits whose start is within 5 minutes of a calendar record."""
    calendar = get_table("calendar", metadata)
    q = select([visits_10min.c.time_start]).where(
        and_(visits_10min.c.userid == user,
             visits_10min.c.placeid == place))
    freq = 0.0
    for (start,) in connection.execute(q).fetchall():
        nearby = select([records.c.db_key]).where(
            and_(records.c.userid == user,
                 records.c.time >= start - 300,
                 records.c.time <= start + 300,
                 records.c.type == 'calendar')).alias("user_records")
        if connection.execute(nearby).rowcount != 0:
            freq += 1
    return freq
def charging_and_silent_frequency(place, user):
    """Average per-visit fraction of 'sys' records with charging on / ring silent."""
    sys_table = get_table("sys", metadata)
    windows_q = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    windows = [(w[0], w[1]) for w in connection.execute(windows_q).fetchall()]
    charging = silent = 0.0
    sys_records = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'sys'))).fetchall()]
    if not sys_records:
        return 0, 0
    num_visits = len(windows)

    def count_where(clause, keys):
        # COUNT of sys rows matching `clause`, restricted to `keys`.
        q = select([func.count(sys_table.c.db_key)]).where(
            and_(clause, sys_table.c.db_key.in_(keys)))
        return connection.execute(q).fetchall()[0][0]

    for start, end in windows:
        keys = [row[0] for row in sys_records if start <= row[1] <= end]
        if not keys:
            continue
        denom = len(keys) * 1.0
        charging += count_where(sys_table.c.charging != 0, keys) / denom
        silent += count_where(sys_table.c.ring == 'silent', keys) / denom
    return (charging / num_visits, silent / num_visits)
def application_frequency_and_variety(place, user):
    """Hourly app-usage rate averaged over visits, plus distinct-app count."""
    application = get_table("application", metadata)
    windows_q = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    windows = [(w[0], w[1]) for w in connection.execute(windows_q).fetchall()]
    total_rate = 0.0
    seen_apps = set()
    app_records = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'app'))).fetchall()]
    for start, end in windows:
        keys = [row[0] for row in app_records if start <= row[1] <= end]
        if not keys:
            continue
        # Usage count grouped per application uid for this window.
        per_app_counts = select(
            [func.count(application.c.uid), application.c.uid]).where(
                application.c.db_key.in_(keys)).group_by(application.c.uid)
        used = 0.0
        for n, uid in connection.execute(per_app_counts).fetchall():
            used += n
            seen_apps.add(uid)
        total_rate += used / ((end - start) / 3600.0)
    return total_rate / len(windows), len(seen_apps)
def communication_frequency(place, user):
    """Hourly rates of outgoing calls, missed calls and outgoing texts,
    averaged over the user's visits to the place."""
    calllog = get_table("calllog", metadata)
    windows_q = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    windows = [(w[0], w[1]) for w in connection.execute(windows_q).fetchall()]
    callout = missed_call = textout = 0.0
    log_records = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'call_log'))).fetchall()]
    if not log_records:
        return 0, 0, 0
    num_visits = len(windows)

    def rate(description, direction, keys, hours):
        # Count of matching call-log rows, normalised per hour of stay.
        q = select([func.count(calllog.c.db_key)]).where(
            and_(calllog.c.description == description,
                 calllog.c.direction == direction,
                 calllog.c.db_key.in_(keys)))
        return connection.execute(q).fetchall()[0][0] / hours

    for start, end in windows:
        keys = [row[0] for row in log_records if start <= row[1] <= end]
        if not keys:
            continue
        stay_hours = (end - start) / 3600.0
        callout += rate("Voice call", 'Outgoing', keys, stay_hours)
        missed_call += rate("Voice call", 'Missed call', keys, stay_hours)
        textout += rate("Short message", 'Outgoing', keys, stay_hours)
    return (callout / num_visits, missed_call / num_visits,
            textout / num_visits)
def wlan_frequency_and_variety(place, user):
    """Distinct WLAN networks per visit and pairwise overlap between visits."""
    import itertools

    wlanrelation = get_table("wlanrelation", metadata)
    windows_q = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    windows = [(w[0], w[1]) for w in connection.execute(windows_q).fetchall()]
    average_freq = 0.0    # retained from original; unused
    applications = set()  # retained from original; unused
    network_sets = []
    network_count = 0.0
    wlan_records = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'wlan'))).fetchall()]
    for start, end in windows:
        keys = [row[0] for row in wlan_records if start <= row[1] <= end]
        if not keys:
            continue
        nets = [row[0] for row in connection.execute(
            select([wlanrelation.c.id_wnetworks]).where(
                wlanrelation.c.db_key.in_(keys)).distinct()).fetchall()]
        if not nets:
            continue
        network_count += len(nets)
        network_sets.append(nets)
    if len(network_sets) < 2:
        variation = 0
    else:
        pair_total = 0.0
        variation = 0.0
        # Mean overlap ratio over all ordered pairs of visit network sets.
        for first, second in itertools.permutations(network_sets, 2):
            variation += intersect_len(first, second) / union_len(first, second)
            pair_total += 1
        variation /= pair_total
    return network_count / len(windows), variation
def extract_places(places_location, label_id, restrict_to, attribute='label'):
    """Coordinates of places whose label / demographic value equals label_id."""
    if len(restrict_to) == 0:
        return []
    coords = [places_location.c.latitude, places_location.c.longitude]
    if attribute == 'label':
        query = (select(coords)
                 .where(places_location.c.place_label_int == label_id)
                 .where(places_location.c.id.in_(restrict_to)))
    else:
        # Any other attribute is resolved through the demographics table.
        demographics = get_table('demographics', metadata)
        demo_join = demographics.join(
            places_location,
            places_location.c.userid == demographics.c.userid)
        query = (select(coords)
                 .where(demographics.c[attribute] == label_id)
                 .where(places_location.c.id.in_(restrict_to))
                 .select_from(demo_join))
    return [(float(row[0]), float(row[1]))
            for row in connection.execute(query).fetchall()]
def communication_frequency(place, user):
    """Per-visit hourly frequencies of outgoing voice calls, missed calls
    and outgoing short messages at the given place."""
    calllog = get_table("calllog", metadata)
    visits_q = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    visits = [(v[0], v[1]) for v in connection.execute(visits_q).fetchall()]
    out_calls = missed = out_texts = 0.0
    call_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'call_log'))).fetchall()]
    if len(call_records) == 0:
        return 0, 0, 0
    n = len(visits)
    for begin, finish in visits:
        keys = [r[0] for r in call_records if begin <= r[1] <= finish]
        if len(keys) == 0:
            continue
        hours = (finish - begin) / 3600.0
        out_calls += connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Voice call",
                     calllog.c.direction == 'Outgoing',
                     calllog.c.db_key.in_(keys)))).fetchall()[0][0] / hours
        missed += connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Voice call",
                     calllog.c.direction == 'Missed call',
                     calllog.c.db_key.in_(keys)))).fetchall()[0][0] / hours
        out_texts += connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Short message",
                     calllog.c.direction == 'Outgoing',
                     calllog.c.db_key.in_(keys)))).fetchall()[0][0] / hours
    return (out_calls / n, missed / n, out_texts / n)
def media_usage_frequency(place, user):
    """Sum over visits of media-play record count per hour of stay."""
    mediaplay = get_table("mediaplay", metadata)
    span_query = select(
        [visits_10min.c.time_start, visits_10min.c.time_end]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))
    spans = [(s[0], s[1]) for s in connection.execute(span_query).fetchall()]
    rate_sum = 0.0
    plays = [(p[0], p[1]) for p in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'media_play'))).fetchall()]
    if len(plays) == 0:
        return 0
    for begin, finish in spans:
        in_window = [p[0] for p in plays if begin <= p[1] <= finish]
        rate_sum += len(in_window) / ((finish - begin) / 3600.0)
    return rate_sum
def extract_places(places_location, label_id, restrict_to, attribute="label"):
    """Return [(lat, lon), ...] for places matching `label_id` on `attribute`."""
    if not restrict_to:
        return []
    if attribute == "label":
        query = select([
            places_location.c.latitude, places_location.c.longitude
        ]).where(places_location.c.place_label_int == label_id).where(
            places_location.c.id.in_(restrict_to))
    else:
        demographics = get_table("demographics", metadata)
        joined = demographics.join(
            places_location,
            places_location.c.userid == demographics.c.userid)
        query = select([
            places_location.c.latitude, places_location.c.longitude
        ]).where(demographics.c[attribute] == label_id).where(
            places_location.c.id.in_(restrict_to)).select_from(joined)
    rows = connection.execute(query).fetchall()
    result = []
    for row in rows:
        result.append((float(row[0]), float(row[1])))
    return result
import sys import csv import datetime import math from spams.db.utils import setup_database, get_table from sqlalchemy.sql import select from sqlalchemy import and_, func from sklearn.neighbors import DistanceMetric metadata, connection = setup_database() places_location = get_table("places_location", metadata) visits_10min = get_table("visits_10min", metadata) records = get_table("records", metadata) def relative_frequency(place, user): count_place = connection.execute( select([visits_10min.c.userid, visits_10min.c.placeid]).where( and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))).rowcount count_all_places = connection.execute( select([visits_10min.c.userid ]).where(visits_10min.c.userid == user)).rowcount return (count_place * 1.0) / (count_all_places * 1.0) def distance_from_most_visited_place(place, user): q = select([func.count(), visits_10min.c.placeid ]).where(visits_10min.c.userid == user).group_by( visits_10min.c.placeid).order_by(func.count().desc()) most_visited_places = [r[1] for r in connection.execute(q).fetchall()]
import datetime import math from spams.db.utils import setup_database, get_table from sqlalchemy.sql import select from sqlalchemy import and_, func from sklearn.neighbors import DistanceMetric import numpy as np from itertools import izip from extract_features import write_features_to_csv import logging logging.basicConfig(filename='motion.log',level=logging.DEBUG) metadata, connection = setup_database() places_location = get_table("places_location", metadata) visits_10min = get_table("visits_10min", metadata) records = get_table("records", metadata) accel = get_table("accel", metadata) def get_features(data): X = [d[0] for d in data] Y = [d[1] for d in data] Z = [d[2] for d in data] x_mean = np.mean(X) y_mean = np.mean(Y) z_mean = np.mean(Z) x_var = np.var(X) y_var = np.var(Y) z_var = np.var(Z) mean_magnitude = np.mean([math.sqrt(x*x + y*y +z*z) for (x,y,z) in izip(X,Y,Z)])
import csv from sqlalchemy.sql import select, and_ from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection if __name__ == "__main__": engine = create_postgres_engine() metadata = get_metadata(engine) connection = get_connection(engine) records = get_table("records", metadata) gpswlan = get_table("gpswlan", metadata) ten_min_visits = get_table("visits_10min", metadata) twenty_min_visits = get_table("visits_20min", metadata) places = get_table("places", metadata) all_places = select([places.c.userid, places.c.placeid]).distinct() print all_places all_places_r = connection.execute(all_places) lat_long = {} print "Starting to iterate over all places" no_wifi_visits_10 = {} no_wifi_visits_20 = {} print all_places_r.rowcount count = 0 for userid, placeid in all_places_r: lat_long[(userid, placeid)] = [] no_wifi_visits_10[(userid, placeid)] = [] no_wifi_visits_20[(userid, placeid)] = [] ten_places = select([ten_min_visits.c.time_start, ten_min_visits.c.time_end], and_(ten_min_visits.c.userid == userid, ten_min_visits.c.placeid == placeid)).limit(5) twenty_places = select([twenty_min_visits.c.time_start, twenty_min_visits.c.time_end], and_(twenty_min_visits.c.userid == userid, twenty_min_visits.c.placeid == placeid)).limit(5)
import csv from sqlalchemy.sql import select, and_ from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection if __name__ == "__main__": engine = create_postgres_engine() metadata = get_metadata(engine) connection = get_connection(engine) records = get_table("records", metadata) gps = get_table("gps", metadata) ten_min_visits = get_table("visits_10min", metadata) twenty_min_visits = get_table("visits_20min", metadata) all_places_r = [] with open("output", "r") as f: reader = csv.reader(f, delimiter=" ") for r in reader: all_places_r.append((r[0], r[1])) lat_long = {} print "Starting to iterate over all places" no_wifi_visits_10 = {} no_wifi_visits_20 = {} print len(all_places_r) count = 0 for userid, placeid in all_places_r: lat_long[(userid, placeid)] = [] no_wifi_visits_10[(userid, placeid)] = [] no_wifi_visits_20[(userid, placeid)] = [] ten_places = select([ten_min_visits.c.time_start, ten_min_visits.c.time_end], and_(ten_min_visits.c.userid == userid, ten_min_visits.c.placeid == placeid)).order_by(ten_min_visits.c.time_start).limit(50)
def plot_demographics(): metadata, connection = setup_database() demographics = get_table("demographics", metadata) gender_query = select( [demographics.c.gender, func.count(demographics.c.gender)]).group_by(demographics.c.gender) result = connection.execute(gender_query).fetchall() result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [GENDER_MAPPING[r[0]] for r in result] filename = "gender.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Gender", ylabel="Count", title="Gender Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) age_query = select([ demographics.c.age_group, func.count(demographics.c.age_group) ]).group_by(demographics.c.age_group) result = connection.execute(age_query).fetchall() result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [AGE_MAPPING[r[0]] for r in result] filename = "age.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Age Group", ylabel="Count", title="Age Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) working_query = select( [demographics.c.working, func.count(demographics.c.working)]).group_by(demographics.c.working) result = connection.execute(working_query).fetchall() print result result = [r for r in result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [WORKING_MAPPING[r[0]] for r in result] filename = "working.png" draw_barplot(vals, x_ticks=[textwrap.fill(text, 10) for text in x_ticks], xlabel="Employment Status", ylabel="Count", title="Employment Status Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) bill_query = select([ demographics.c.phone_bill, func.count(demographics.c.phone_bill) ]).group_by(demographics.c.phone_bill) result = connection.execute(bill_query).fetchall() result = [r for r in 
result if r[0] is not None] result = sorted(result, key=lambda x: x[0]) vals = [r[1] for r in result] x_ticks = [BILL_MAPPING[r[0]] for r in result] filename = "bill.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Bill", ylabel="Count", title="Bill Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35) bill_query = select([ demographics.c.nb_12, demographics.c.nb_12_18, demographics.c.nb_18_30, demographics.c.nb_30_40, demographics.c.nb_40_50, demographics.c.nb_50_65, demographics.c.nb_65 ]) result = connection.execute(bill_query).fetchall() result = [ sum([a for a in r if a is not None]) for r in result if r is not None ] s = set(result) print s vals = [] x_ticks = [] for elem in s: if elem > 13: continue x_ticks.append(elem) vals.append(result.count(elem)) #vals = [r[1] for r in result] #x_ticks = [BILL_MAPPING[r[0]] for r in result] filename = "family.png" draw_barplot(vals, x_ticks=x_ticks, xlabel="Number of members in family", ylabel="Count", title="Number of Family Members Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)
def priors_with_kde(test,
                    train,
                    input_func=extract_places,
                    attribute="label",
                    return_predictions=True,
                    method="dbscan"):
    """Fit per-predictor kernel-density estimators on training places and
    score test places against them.

    Args:
        test, train: sequences of ((place, user), label) tuples.
        input_func: extracts training coordinates; forced to
            extract_visits for the "time" and demographic attributes.
        attribute: "label", "time", or a demographics column name.
        return_predictions: if True, return the best-scoring predictor
            value per test place; otherwise return (train_scores,
            test_scores) dicts keyed by (placeid, userid).
        method: "dbscan" fits estimators per spatial cluster;
            "simple" fits one global set of estimators.
    """
    # test is originally a tuple of place, user, label
    places_location = get_table("places_location", metadata)
    q = select([places_location.c.id])
    predictor_iterator = iterator_dict[attribute]
    if attribute == "label":
        # Map each training (place, user) to its places_location row id.
        train_tuples = [
            (
                connection.execute(
                    q.where(and_(places_location.c.placeid == place,
                                 places_location.c.userid == user))
                ).fetchall()[0][0],
                label,
            )
            for ((place, user), label) in train
        ]
    elif attribute == "time":
        train_tuples = []
        input_func = extract_visits
        for (place, user), _ in train:
            id = connection.execute(
                q.where(and_(places_location.c.placeid == place,
                             places_location.c.userid == user))
            ).fetchall()[0][0]
            # Each place participates in all twelve 2-hour time slots.
            for i in xrange(0, 12):
                train_tuples.append((id, i))
    else:
        input_func = extract_visits
        demographics = get_table("demographics", metadata)
        demo_q = select([demographics.c[attribute]])
        train_tuples = []
        for (place, user), _ in train:
            id = connection.execute(
                q.where(and_(places_location.c.placeid == place,
                             places_location.c.userid == user))
            ).fetchall()[0][0]
            demo_r = connection.execute(demo_q.where(demographics.c.userid == user))
            if demo_r.rowcount == 1:
                train_tuples.append((id, demo_r.fetchall()[0][0]))
            else:
                # No (or ambiguous) demographics row: keep the place but
                # mark its predictor as unknown so it is skipped below.
                train_tuples.append((id, None))
    # Translate test (place, user) pairs into places_location ids as well.
    test = [
        connection.execute(
            q.where(and_(places_location.c.placeid == place,
                         places_location.c.userid == user))
        ).fetchall()[0][0]
        for ((place, user), _) in test
    ]
    # Group training place ids by predictor value, dropping unknowns.
    train_predictor_dict = defaultdict(list)
    for id, predictor in train_tuples:
        if predictor is not None:
            train_predictor_dict[predictor].append(id)
    test_scores = {}
    accurate = 0.0     # NOTE(review): reassigned below; effectively unused here
    count = len(test)  # NOTE(review): reassigned below; effectively unused here
    train_scores = {}
    if method == "dbscan":
        # Fit a separate estimator set inside each spatial cluster.
        place_group_dict, group_place_dict = perform_db_scan(places_location)
        groups = group_place_dict.keys()
        for g in groups:
            estimators = {}
            group_test = [p for p in test if place_group_dict[p] == g]
            for predictor in predictor_iterator:
                places_in_group = [p for p in train_predictor_dict[predictor]
                                   if place_group_dict[p] == g]
                if len(places_in_group) == 0:
                    continue
                training_set = input_func(places_location, predictor,
                                          places_in_group, attribute)
                xy = np.array(training_set)
                # Convert to radians
                xy *= np.pi / 180.0
                estimators[predictor] = train_kde(xy, predictor)
            for p in group_test:
                test_scores[p] = get_scores(places_location, p,
                                            predictor_iterator, estimators)
            for p in group_place_dict[g]:
                train_scores[p] = get_scores(places_location, p,
                                             predictor_iterator, estimators)
    elif method == "simple":
        # One global estimator per predictor value.
        estimators = {}
        for predictor in predictor_iterator:
            places_predictor = train_predictor_dict[predictor]
            if len(places_predictor) == 0:
                continue
            training_set = input_func(places_location, predictor,
                                      places_predictor, attribute)
            xy = np.array(training_set)
            # Convert to radians
            xy *= np.pi / 180.0
            estimators[predictor] = train_kde(xy, predictor)
        for p in test:
            test_scores[p] = get_scores(places_location, p,
                                        predictor_iterator, estimators)
        for p, _ in train_tuples:
            train_scores[p] = get_scores(places_location, p,
                                         predictor_iterator, estimators)
    accurate = 0.0
    count = 0.0
    # (A commented-out accuracy-evaluation block from the original sat here;
    # it recomputed accuracy by comparing argmax scores against the true
    # predictor value. Removed as dead commentary.)
    if return_predictions:
        prior_labels = []
        for place in test:
            # Pick the predictor value with the highest KDE score.
            predicted_value, max_score = max(test_scores[place],
                                             key=lambda x: x[1])
            prior_labels.append(predicted_value)
        return prior_labels
    else:
        # Re-key both score dicts by (placeid, userid) and keep only the
        # numeric score values, dropping the predictor labels.
        test_scores = {
            tuple(
                connection.execute(
                    select([places_location.c.placeid,
                            places_location.c.userid]).where(
                                places_location.c.id == p)
                ).fetchall()[0]
            ): [s[1] for s in score]
            for p, score in test_scores.items()
        }
        training_scores = {
            tuple(
                connection.execute(
                    select([places_location.c.placeid,
                            places_location.c.userid]).where(
                                places_location.c.id == p)
                ).fetchall()[0]
            ): [s[1] for s in score]
            for p, score in train_scores.items()
        }
        return training_scores, test_scores
# Look up Foursquare venues within 50 m of every known place's coordinates
# and dump them to nearby_places.csv.
import foursquare
from sqlalchemy.sql import select
from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection
from spams.config import FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET

if __name__ == "__main__":
    client = foursquare.Foursquare(client_id=FOURSQUARE_CLIENT_ID, client_secret=FOURSQUARE_CLIENT_SECRET)
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    output_file = open("nearby_places.csv", "w")
    # NOTE(review): csv is used here but not imported in this chunk —
    # confirm "import csv" exists in the part of the file not shown.
    writer = csv.writer(output_file)
    places_location = get_table("places_location", metadata)
    query = select([
        places_location.c.latitude, places_location.c.longitude,
        places_location.c.userid, places_location.c.placeid,
        places_location.c.place_label
    ])
    results = connection.execute(query)
    for r in results:
        # Foursquare's search API takes "lat,long" as one comma-joined string.
        ll = str(r[0]) + "," + str(r[1])
        venues = client.venues.search(params={
            'll': ll,
            'radius': 50,
            'intent': 'browse'
        })
        to_write = []
        # NOTE(review): the loop body is truncated in this chunk; the rest of
        # the script is not visible here.
        for venue in venues["venues"]:
FILENAME = "places_with_mapping.csv" if __name__ == "__main__": engine = create_postgres_engine() metadata = get_metadata(engine) connection = get_connection(engine) table_name = 'places_location' if not engine.dialect.has_table(connection, table_name): places_location_mapping = Table(table_name, metadata, Column('id', primary_key = True) Column('userid', Integer), Column('placeid', Integer), Column('latitude', Numeric), Column('latitude_std', Numeric), Column('longitude', Numeric), Column('longitude_std', Numeric), Column('method', String), Column('place_label', String), Column('place_label_int', Integer)) metadata.create_all(engine) print "Created Table" else: print "Table exists" places_location_mapping = get_table(table_name, metadata) with open(FILENAME) as f: reader = csv.reader(f) for row in reader: ins = places_location_mapping.insert().values(tuple(row)) print connection.execute(ins)
MAPPING = "mapping"                     # csv mapping: integer label -> label name
OUTPUT = "places_with_mapping.csv"

if __name__ == "__main__":
    # Append each place's label (from the "places" table) and the label's
    # name (from the MAPPING file) to every row of the PLACES csv.
    # NOTE(review): PLACES is not defined in this chunk — presumably a
    # filename constant declared in the truncated prefix; confirm upstream.
    places_file = open(PLACES, "r")
    mapping_file = open(MAPPING, "r")
    output_file = open(OUTPUT, "w")
    places_reader = csv.reader(places_file)
    mapping_reader = csv.reader(mapping_file)
    mapping = {}
    for row in mapping_reader:
        # mapping rows are (integer label, label name)
        mapping[int(row[0])] = row[1]
    output_writer = csv.writer(output_file)
    engine = create_postgres_engine()
    connection = get_connection(engine)
    metadata = get_metadata(engine)
    places = get_table("places", metadata)
    labels = connection.execute(select([places.c.userid, places.c.placeid, places.c.place_label])).fetchall()
    places_dict = {}
    for a in labels:
        # (userid, placeid) -> place_label
        places_dict[(a[0], a[1])] = a[2]
    for row in places_reader:
        # The first two CSV columns are userid and placeid.
        label_int = places_dict[(int(row[0]), int(row[1]))]
        row.append(mapping[label_int])
        row.append(label_int)
        output_writer.writerow(row)
    places_file.close()
    mapping_file.close()
    output_file.close()
# Dangling "pass" preserved from the original file (the statement it closed
# is not visible in this chunk).
pass


def colors_map():
    """Return NUM_COLORS visually distinct RGBA colors sampled evenly from
    matplotlib's 'gist_rainbow' colormap."""
    NUM_COLORS = 10
    cm = plt.get_cmap('gist_rainbow')
    # FIX: a colormap called with a float expects values in [0, 1]; the
    # original used 10.*i/NUM_COLORS, which is >= 1.0 for every i >= 1 and
    # therefore clamps 9 of the 10 colors to the same end-of-map color.
    # Sampling at i/NUM_COLORS spreads the colors across the whole map.
    colors = []
    for i in range(NUM_COLORS):
        colors.append(cm(1. * i / NUM_COLORS))
    return colors


if __name__ == "__main__":
    metadata, connection = setup_database()
    places_location = get_table("places_location", metadata)
    # Optional whitelist of place ids, one per line.
    restrict_to = []
    with open("biggest_cluster_places") as f:
        for line in f:
            line = line.strip()
            restrict_to.append(line)
    # Coordinates (plus label and id) of every place labelled 'Work'.
    lat_long_query = select([places_location.c.latitude,
                             places_location.c.longitude,
                             places_location.c.place_label,
                             places_location.c.id]).where(
                                 places_location.c.place_label == 'Work')
    # Bounding box over all places.
    queries = select([func.min(places_location.c.latitude),
                      func.max(places_location.c.latitude),
                      func.min(places_location.c.longitude),
                      func.max(places_location.c.longitude)])
    min_lat, max_lat, min_long, max_long = connection.execute(queries).fetchall()[0]
    results = connection.execute(lat_long_query).fetchall()
    places_with_labels = [(r[2], (float(r[0]), float(r[1]))) for r in results]
    places = [r[3] for r in results]
    labels_places_dict = defaultdict(list)
    labels_places_count = defaultdict(int)
    # NOTE(review): the script continues beyond this chunk; statements after
    # this point are not visible here.
    places_with_labels = sorted(places_with_labels, key=lambda x: x[0])
import csv
from sqlalchemy.sql import select, and_
from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection

if __name__ == "__main__":
    # Walk every distinct (userid, placeid) pair in the "places" table and
    # set up per-place accumulators before querying the 10-minute visit
    # windows for each pair.
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gpswlan = get_table("gpswlan", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    places = get_table("places", metadata)
    all_places = select([places.c.userid, places.c.placeid]).distinct()
    print all_places
    all_places_r = connection.execute(all_places)
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print all_places_r.rowcount
    count = 0
    for userid, placeid in all_places_r:
        # One empty accumulator per place; filled later in the script.
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        # NOTE(review): statement truncated — the remainder of this script is
        # not visible in this chunk.
        ten_places = select(
            [ten_min_visits.c.time_start, ten_min_visits.c.time_end],
            and_(ten_min_visits.c.userid == userid,
def priors_with_kde(test, train, input_func=extract_places, attribute='label', return_predictions=True, method='dbscan'):
    """Score test/train places with per-predictor kernel density estimators.

    test, train: iterables of ((place, user), label) tuples; each
    (place, user) pair is resolved to its places_location.id before use.
    input_func: extractor producing lat/long training samples; it is
    overridden with extract_visits for every attribute except 'label'.
    attribute: 'label' fits one KDE per place label; 'time' fits one per
    slot in xrange(0, 12); any other value is treated as a demographics
    column and one KDE is fitted per observed value of that column.
    method: 'dbscan' fits estimators separately per spatial cluster from
    perform_db_scan(); 'simple' fits one global set of estimators.
    Returns a list of predicted values for `test` when return_predictions
    is true, otherwise ({(placeid, userid): train scores},
    {(placeid, userid): test scores}).
    """
    #test is originally a tuple of place, user, label
    places_location = get_table("places_location", metadata)
    q = select([places_location.c.id])
    predictor_iterator = iterator_dict[attribute]
    if attribute == 'label':
        # (places_location.id, label) for every training example.
        train_tuples = [(connection.execute(
            q.where(
                and_(places_location.c.placeid == place,
                     places_location.c.userid == user))).fetchall()[0][0], label)
                        for ((place, user), label) in train]
    elif attribute == 'time':
        train_tuples = []
        input_func = extract_visits
        for (place, user), _ in train:
            id = connection.execute(
                q.where(
                    and_(places_location.c.placeid == place,
                         places_location.c.userid == user))).fetchall()[0][0]
            # Every training place participates in all 12 slots.
            for i in xrange(0, 12):
                train_tuples.append((id, i))
    else:
        input_func = extract_visits
        demographics = get_table("demographics", metadata)
        demo_q = select([demographics.c[attribute]])
        train_tuples = []
        for (place, user), _ in train:
            id = connection.execute(
                q.where(
                    and_(places_location.c.placeid == place,
                         places_location.c.userid == user))).fetchall()[0][0]
            demo_r = connection.execute(
                demo_q.where(demographics.c.userid == user))
            # None marks users without exactly one demographics row; those
            # entries are skipped when grouping below.
            if demo_r.rowcount == 1:
                train_tuples.append((id, demo_r.fetchall()[0][0]))
            else:
                train_tuples.append((id, None))
    # Resolve the test keys to places_location ids as well.
    test = [
        connection.execute(
            q.where(
                and_(places_location.c.placeid == place,
                     places_location.c.userid == user))).fetchall()[0][0]
        for ((place, user), _) in test
    ]
    # predictor value -> list of training place ids.
    train_predictor_dict = defaultdict(list)
    for id, predictor in train_tuples:
        if predictor is not None:
            train_predictor_dict[predictor].append(id)
    test_scores = {}
    accurate = 0.0
    count = len(test)
    train_scores = {}
    if method == 'dbscan':
        place_group_dict, group_place_dict = perform_db_scan(places_location)
        groups = group_place_dict.keys()
        for g in groups:
            # Fit one KDE per predictor using only this cluster's places.
            estimators = {}
            group_test = [p for p in test if place_group_dict[p] == g]
            for predictor in predictor_iterator:
                places_in_group = [
                    p for p in train_predictor_dict[predictor] if place_group_dict[p] == g
                ]
                if len(places_in_group) == 0:
                    continue
                training_set = input_func(places_location, predictor, places_in_group, attribute)
                xy = np.array(training_set)
                # Convert to radians
                xy *= np.pi / 180.
                estimators[predictor] = train_kde(xy, predictor)
            for p in group_test:
                test_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
            for p in group_place_dict[g]:
                train_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
    elif method == 'simple':
        # Single global estimator set over all training places.
        estimators = {}
        for predictor in predictor_iterator:
            places_predictor = train_predictor_dict[predictor]
            if len(places_predictor) == 0:
                continue
            training_set = input_func(places_location, predictor, places_predictor, attribute)
            xy = np.array(training_set)
            # Convert to radians
            xy *= np.pi / 180.
            estimators[predictor] = train_kde(xy, predictor)
        for p in test:
            test_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
        for p, _ in train_tuples:
            train_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
    accurate = 0.0
    count = 0.0
    # (disabled) accuracy computation against ground-truth labels:
    #for place in test_scores.keys():
    #    if attribute == 'label':
    #        true_predictor = connection.execute(select([places_location.c.place_label_int]).where(places_location.c.id==place)).fetchall()[0][0]
    #    else:
    #        user = connection.execute(select([places_location.c.userid]).where(places_location.c.id==place)).fetchall()[0][0]
    #        r = connection.execute(select([demographics.c[attribute]]).where(demographics.c.userid==user))
    #        if r.rowcount == 1:
    #            true_predictor = r.fetchall()[0][0]
    #        else:
    #            true_predictor = None
    #
    #    predicted_value, max_score = max(test_scores[place], key = lambda x: x[1])
    #    if predicted_value == true_predictor:
    #        accurate += 1
    #        #print max(scores[place])
    #    count += 1
    #accuracy = accurate/count
    #if attribute is not 'label':
    #    print accuracy
    if return_predictions:
        # Each score list holds (predictor_value, score) pairs; take the
        # argmax per test place.
        prior_labels = []
        for place in test:
            predicted_value, max_score = max(test_scores[place], key=lambda x: x[1])
            prior_labels.append(predicted_value)
        return prior_labels
    else:
        # Re-key both score dicts by (placeid, userid) and strip the
        # predictor values, keeping only the raw scores.
        test_scores = {
            tuple(
                connection.execute(
                    select([
                        places_location.c.placeid, places_location.c.userid
                    ]).where(places_location.c.id == p)).fetchall()[0]):
            [s[1] for s in score]
            for p, score in test_scores.items()
        }
        training_scores = {
            tuple(
                connection.execute(
                    select([
                        places_location.c.placeid, places_location.c.userid
                    ]).where(places_location.c.id == p)).fetchall()[0]):
            [s[1] for s in score]
            for p, score in train_scores.items()
        }
        return training_scores, test_scores
import csv
from sqlalchemy.sql import select, and_
from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection

if __name__ == "__main__":
    # Same per-place visit gathering as the DB-driven variant, but the
    # (userid, placeid) pairs come from a space-delimited "output" file.
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gps = get_table("gps", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    all_places_r = []
    with open("output", "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for r in reader:
            # Columns 0 and 1 are userid and placeid (kept as strings).
            all_places_r.append((r[0], r[1]))
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print len(all_places_r)
    count = 0
    for userid, placeid in all_places_r:
        # One empty accumulator per place; filled later in the script.
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        # NOTE(review): statement truncated — the remainder of this script is
        # not visible in this chunk.
        ten_places = select(
            [ten_min_visits.c.time_start, ten_min_visits.c.time_end],
OUTPUT = "places_with_mapping.csv"

if __name__ == "__main__":
    # Append each place's label (from the "places" table) and the label's
    # name (from the MAPPING file) to every row of the PLACES csv.
    # NOTE(review): PLACES and MAPPING are not defined in this chunk — they
    # are presumably filename constants declared in the truncated prefix;
    # confirm upstream.
    places_file = open(PLACES, "r")
    mapping_file = open(MAPPING, "r")
    output_file = open(OUTPUT, "w")
    places_reader = csv.reader(places_file)
    mapping_reader = csv.reader(mapping_file)
    mapping = {}
    for row in mapping_reader:
        # mapping rows are (integer label, label name)
        mapping[int(row[0])] = row[1]
    output_writer = csv.writer(output_file)
    engine = create_postgres_engine()
    connection = get_connection(engine)
    metadata = get_metadata(engine)
    places = get_table("places", metadata)
    labels = connection.execute(
        select([places.c.userid, places.c.placeid,
                places.c.place_label])).fetchall()
    places_dict = {}
    for a in labels:
        # (userid, placeid) -> place_label
        places_dict[(a[0], a[1])] = a[2]
    for row in places_reader:
        # The first two CSV columns are userid and placeid.
        label_int = places_dict[(int(row[0]), int(row[1]))]
        row.append(mapping[label_int])
        row.append(label_int)
        output_writer.writerow(row)
    places_file.close()
    mapping_file.close()
    output_file.close()