Exemplo n.º 1
0
def return_joined_table(tablename, metadata):
    """Join the visits table with places and demographics.

    The visits table named ``tablename`` is joined to ``places`` on
    (userid, placeid); that join (aliased "visits_joined") is then joined
    to ``demographics`` on userid and aliased "consolidated".
    """
    visits_table = get_table(tablename, metadata)
    places_table = get_table("places", metadata)
    demographics_table = get_table("demographics", metadata)
    join_condition = and_(places_table.c.userid == visits_table.c.userid,
                          places_table.c.placeid == visits_table.c.placeid)
    visits_joined = places_table.join(visits_table, onclause=join_condition).alias("visits_joined")
    # The aliased join exposes the visits columns prefixed with the table name.
    userid_column = visits_joined.c[tablename + "_userid"]
    return visits_joined.join(demographics_table, userid_column == demographics_table.c.userid).alias("consolidated")
Exemplo n.º 2
0
def return_joined_table(tablename, metadata):
    """Build the consolidated join of a visits table with places and
    demographics.

    Equivalent to joining ``tablename`` to ``places`` on (userid, placeid),
    aliasing that as "visits_joined", then joining to ``demographics`` on
    userid and aliasing the result "consolidated".
    """
    visits = get_table(tablename, metadata)
    places = get_table("places", metadata)
    demographics = get_table("demographics", metadata)
    same_user = places.c.userid == visits.c.userid
    same_place = places.c.placeid == visits.c.placeid
    visits_with_places = places.join(
        visits, onclause=and_(same_user, same_place)).alias("visits_joined")
    # Joined-alias columns carry a "<tablename>_" prefix.
    consolidated = visits_with_places.join(
        demographics,
        visits_with_places.c[tablename + "_userid"] == demographics.c.userid)
    return consolidated.alias("consolidated")
Exemplo n.º 3
0
def wlan_frequency_and_variety(place, user):
    """Average number of distinct WLAN networks seen per visit of *user*
    to *place*, plus a variation score: the mean pairwise Jaccard
    similarity between the network sets of different visits.

    Returns (average_network_count_per_visit, variation).
    Returns (0, 0) when the user has no visits to the place (the
    original raised ZeroDivisionError in that case).
    """
    wlanrelation = get_table("wlanrelation", metadata)
    visit_times_query = select([visits_10min.c.time_start, visits_10min.c.time_end]).where(
        and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1]) for t in connection.execute(visit_times_query).fetchall()]
    # Guard: avoid dividing by zero when there are no visits.
    if len(visit_times) == 0:
        return 0, 0
    network_visits = []   # one list of distinct network ids per visit
    network_count = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user, records.c.type == 'wlan'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        distinct_networks = [r[0] for r in connection.execute(
            select([wlanrelation.c.id_wnetworks]).where(
                wlanrelation.c.db_key.in_(filtered_keys)).distinct()).fetchall()]
        if len(distinct_networks) == 0:
            continue
        network_count += len(distinct_networks)
        network_visits.append(distinct_networks)
    if len(network_visits) < 2:
        # Fewer than two visits with networks: no pairwise comparison possible.
        variation = 0
    else:
        import itertools
        count = 0.0
        variation = 0.0
        for a, b in itertools.permutations(network_visits, 2):
            # Jaccard similarity between the two visits' network sets.
            variation += intersect_len(a, b) / union_len(a, b)
            count += 1
        variation /= count
    return network_count / len(visit_times), variation
Exemplo n.º 4
0
def charging_and_silent_frequency(place, user):
    """Average, over visits of *user* to *place*, of the fraction of
    'sys' records during each visit in which the phone was charging and
    in which the ringer was silent.

    Returns (charging_fraction, silent_fraction).
    Returns (0, 0) when the user has no 'sys' records or no visits (the
    original raised ZeroDivisionError when there were no visits).
    """
    # Renamed local from `sys` to avoid shadowing the stdlib module name.
    sys_table = get_table("sys", metadata)
    visit_times_query = select([visits_10min.c.time_start,
                                visits_10min.c.time_end]).where(
                                    and_(visits_10min.c.userid == user,
                                         visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times_query).fetchall()]
    charging = silent = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'sys'))).fetchall()]
    num_visits = len(visit_times)
    # Guard both empty cases before any division.
    if len(user_records) == 0 or num_visits == 0:
        return 0, 0
    for start, end in visit_times:
        filtered = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered) == 0:
            continue
        n_records = len(filtered) * 1.0
        # Count of matching sys records as a fraction of this visit's records.
        charging += (connection.execute(
            select([func.count(sys_table.c.db_key)]).where(
                and_(sys_table.c.charging != 0,
                     sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / n_records
        silent += (connection.execute(
            select([func.count(sys_table.c.db_key)]).where(
                and_(sys_table.c.ring == 'silent',
                     sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / n_records
    return (charging / num_visits, silent / num_visits)
Exemplo n.º 5
0
def application_frequency_and_variety(place, user):
    """Average hourly application-usage frequency across visits of
    *user* to *place*, and the number of distinct applications used.

    Returns (average_frequency, distinct_app_count).
    Returns (0, 0) when there are no visits (the original raised
    ZeroDivisionError in that case).
    """
    application = get_table("application", metadata)
    visit_times_query = select([visits_10min.c.time_start,
                                visits_10min.c.time_end]).where(
                                    and_(visits_10min.c.userid == user,
                                         visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times_query).fetchall()]
    # Guard: avoid dividing by zero when there are no visits.
    if len(visit_times) == 0:
        return 0, 0
    average_freq = 0.0
    applications = set()
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user, records.c.type ==
                 'app'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        # Usage count per application uid within this visit.
        # (Renamed from the misleading `calendar_entry_for_user`.)
        usage_query = select([
            func.count(application.c.uid), application.c.uid
        ]).where(application.c.db_key.in_(filtered_keys)).group_by(
            application.c.uid)
        times_used = 0.0
        for use_count, uid in connection.execute(usage_query).fetchall():
            times_used += use_count
            applications.add(uid)
        # Normalize by the visit duration in hours.
        average_freq += (times_used / ((end - start) / 3600.0))
    return average_freq / len(visit_times), len(applications)
Exemplo n.º 6
0
def extract_visits(places_location, label_id, restrict_to, attribute="label"):
    """Return (latitude, longitude) float pairs for visits matching *label_id*.

    The matching mode is chosen by *attribute*:
      - "label": match the place's integer label column.
      - "time":  match visits whose start hour is >= 2*label_id and end
                 hour is <= 2*label_id + 1 (a two-hour bucket).
      - anything else: treat *attribute* as a demographics column name
        and match rows where that column equals *label_id*.
    Results are always restricted to places whose id is in *restrict_to*.
    """
    if len(restrict_to) == 0:
        return []
    visits_10min = get_table("visits_10min", metadata)
    # q = visits_10min.select([visits_10min.c.time_start]).where(extract('hour', visits_10min.c.time_start) > 10)
    # connection.execute(q)
    visits_with_places = places_location.join(
        visits_10min,
        onclause=and_(
            places_location.c.userid == visits_10min.c.userid, places_location.c.placeid == visits_10min.c.placeid
        ),
    )
    if attribute == "label":
        # Join-alias columns are prefixed with "places_location_".
        query = (
            select([visits_with_places.c.places_location_latitude, visits_with_places.c.places_location_longitude])
            .where(visits_with_places.c.places_location_place_label_int == label_id)
            .where(visits_with_places.c.places_location_id.in_(restrict_to))
        )
    elif attribute == "time":
        # assumes time_start/time_end are epoch seconds (to_timestamp) -- TODO confirm
        query = (
            select([places_location.c.latitude, places_location.c.longitude])
            .where(
                and_(
                    extract("hour", func.to_timestamp(visits_10min.c.time_start)) >= (2 * label_id),
                    extract("hour", func.to_timestamp(visits_10min.c.time_end)) <= (2 * label_id + 1),
                )
            )
            .where(places_location.c.id.in_(restrict_to))
            .select_from(visits_with_places)
        )
    else:
        # Demographic attribute: join demographics on userid first.
        demographics = get_table("demographics", metadata)
        joined_t = demographics.join(
            visits_with_places, visits_with_places.c.places_location_userid == demographics.c.userid
        )
        query = (
            select([visits_with_places.c.places_location_latitude, visits_with_places.c.places_location_longitude])
            .where(demographics.c[attribute] == label_id)
            .where(places_location.c.id.in_(restrict_to))
            .select_from(joined_t)
        )

    results = connection.execute(query).fetchall()
    # print len(results)
    return [(float(r[0]), float(r[1])) for r in results]
Exemplo n.º 7
0
def plot_demographics():
    """Plot bar charts of the demographics table: gender, age group,
    employment status, phone bill and family size distributions.

    Each chart is saved under /local/thesis/plots/.  Written for
    Python 2 (bare ``print`` statements).  Relies on module-level
    GENDER_MAPPING / AGE_MAPPING / WORKING_MAPPING / BILL_MAPPING
    dictionaries and a ``draw_barplot`` helper defined elsewhere.
    """
    metadata, connection = setup_database()
    demographics = get_table("demographics", metadata)

    # Gender distribution: count rows per gender, drop NULLs, sort by code.
    gender_query = select([demographics.c.gender, func.count(demographics.c.gender)]).group_by(demographics.c.gender)
    result = connection.execute(gender_query).fetchall()
    result = [r for r in result if r[0] is not None]
    result = sorted(result, key=lambda x: x[0])
    vals = [r[1] for r in result]
    x_ticks = [GENDER_MAPPING[r[0]] for r in result]
    filename = "gender.png"
    draw_barplot(vals, x_ticks=x_ticks, xlabel="Gender", ylabel="Count", title="Gender Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)

    # Age-group distribution, same pattern as gender.
    age_query = select([demographics.c.age_group, func.count(demographics.c.age_group)]).group_by(demographics.c.age_group)
    result = connection.execute(age_query).fetchall()
    result = [r for r in result if r[0] is not None]
    result = sorted(result, key=lambda x: x[0])
    vals = [r[1] for r in result]
    x_ticks = [AGE_MAPPING[r[0]] for r in result]
    filename = "age.png"
    draw_barplot(vals, x_ticks=x_ticks, xlabel="Age Group", ylabel="Count", title="Age Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)

    # Employment status; tick labels are wrapped to 10 chars per line.
    working_query = select([demographics.c.working, func.count(demographics.c.working)]).group_by(demographics.c.working)
    result = connection.execute(working_query).fetchall()
    print result
    result = [r for r in result if r[0] is not None]
    result = sorted(result, key=lambda x: x[0])
    vals = [r[1] for r in result]
    x_ticks = [WORKING_MAPPING[r[0]] for r in result]
    filename = "working.png"
    draw_barplot(vals, x_ticks=[textwrap.fill(text,10) for text in x_ticks], xlabel="Employment Status", ylabel="Count", title="Employment Status Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)

    # Phone-bill distribution.
    bill_query = select([demographics.c.phone_bill, func.count(demographics.c.phone_bill)]).group_by(demographics.c.phone_bill)
    result = connection.execute(bill_query).fetchall()
    result = [r for r in result if r[0] is not None]
    result = sorted(result, key=lambda x: x[0])
    vals = [r[1] for r in result]
    x_ticks = [BILL_MAPPING[r[0]] for r in result]
    filename = "bill.png"
    draw_barplot(vals, x_ticks=x_ticks, xlabel="Bill", ylabel="Count", title="Bill Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)

    # Family size: sum the per-age-bracket member counts per row
    # (NULL brackets are skipped), then count how often each total occurs.
    # NOTE: the variable name `bill_query` is reused here for the family query.
    bill_query = select([demographics.c.nb_12, demographics.c.nb_12_18, demographics.c.nb_18_30, demographics.c.nb_30_40, demographics.c.nb_40_50, demographics.c.nb_50_65, demographics.c.nb_65])
    result = connection.execute(bill_query).fetchall()
    result = [sum([a for a in r if a is not None]) for r in result if r is not None]
    s = set(result)
    print s
    vals = []
    x_ticks = []
    for elem in s:
        # Totals above 13 are treated as outliers and dropped from the plot.
        if elem > 13:
            continue
        x_ticks.append(elem)
        vals.append(result.count(elem))
    #vals = [r[1] for r in result]
    #x_ticks = [BILL_MAPPING[r[0]] for r in result]
    filename = "family.png"
    draw_barplot(vals, x_ticks=x_ticks, xlabel="Number of members in family", ylabel="Count", title="Number of Family Members Distribution", save_as=os.path.join("/local", "thesis", "plots", filename), width=0.35)
Exemplo n.º 8
0
def calendar_time_frequency(place, user):
    """Count visits of *user* to *place* that have a 'calendar' record
    logged within five minutes (300 s) of the visit start time."""
    calendar = get_table("calendar", metadata)
    starts_query = select([visits_10min.c.time_start]).where(
        and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    start_times = [row[0] for row in connection.execute(starts_query).fetchall()]
    freq = 0.0
    for start in start_times:
        window = select([records.c.db_key]).where(
            and_(records.c.userid == user,
                 records.c.time >= start - 300,
                 records.c.time <= start + 300,
                 records.c.type == 'calendar')).alias("user_records")
        if connection.execute(window).rowcount != 0:
            freq += 1
    return freq
Exemplo n.º 9
0
def extract_visits(places_location, label_id, restrict_to, attribute='label'):
    """Return (latitude, longitude) float pairs for visits matching *label_id*.

    *attribute* selects the matching mode:
      - 'label': match the place's integer label column.
      - 'time':  match visits whose start hour is >= 2*label_id and end
                 hour is <= 2*label_id + 1 (a two-hour bucket).
      - anything else: treat *attribute* as a demographics column name
        and match rows where that column equals *label_id*.
    Results are always restricted to places whose id is in *restrict_to*.
    """
    if len(restrict_to) == 0:
        return []
    visits_10min = get_table("visits_10min", metadata)
    #q = visits_10min.select([visits_10min.c.time_start]).where(extract('hour', visits_10min.c.time_start) > 10)
    #connection.execute(q)
    visits_with_places = places_location.join(
        visits_10min,
        onclause=and_(places_location.c.userid == visits_10min.c.userid,
                      places_location.c.placeid == visits_10min.c.placeid))
    if attribute == 'label':
        # Join-alias columns are prefixed with "places_location_".
        query = select([
            visits_with_places.c.places_location_latitude,
            visits_with_places.c.places_location_longitude
        ]).where(visits_with_places.c.places_location_place_label_int ==
                 label_id).where(
                     visits_with_places.c.places_location_id.in_(restrict_to))
    elif attribute == 'time':
        # assumes time_start/time_end are epoch seconds (to_timestamp) -- TODO confirm
        query = select([
            places_location.c.latitude, places_location.c.longitude
        ]).where(
            and_(
                extract('hour', func.to_timestamp(
                    visits_10min.c.time_start)) >= (2 * label_id),
                extract('hour', func.to_timestamp(visits_10min.c.time_end)) <=
                (2 * label_id + 1))).where(
                    places_location.c.id.in_(restrict_to)).select_from(
                        visits_with_places)
    else:
        # Demographic attribute: join demographics on userid first.
        demographics = get_table('demographics', metadata)
        joined_t = demographics.join(
            visits_with_places, visits_with_places.c.places_location_userid ==
            demographics.c.userid)
        query = select([
            visits_with_places.c.places_location_latitude,
            visits_with_places.c.places_location_longitude
        ]).where(demographics.c[attribute] == label_id).where(
            places_location.c.id.in_(restrict_to)).select_from(joined_t)

    results = connection.execute(query).fetchall()
    #print len(results)
    return [(float(r[0]), float(r[1])) for r in results]
Exemplo n.º 10
0
def media_usage_frequency(place, user):
    """Total media-play events per hour of stay, summed over all visits
    of *user* to *place*.  Returns 0 when the user has no 'media_play'
    records at all.
    """
    mediaplay = get_table("mediaplay", metadata)
    times_query = select([visits_10min.c.time_start, visits_10min.c.time_end]).where(
        and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    intervals = [(row[0], row[1]) for row in connection.execute(times_query).fetchall()]
    keys_and_times = [(row[0], row[1]) for row in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user, records.c.type == 'media_play'))).fetchall()]
    if not keys_and_times:
        return 0
    total = 0.0
    for start, end in intervals:
        events_in_visit = len([key for key, when in keys_and_times
                               if start <= when <= end])
        # Normalize the event count by the visit duration in hours.
        stay_hours = (end - start) / 3600.0
        total += events_in_visit / stay_hours
    return total
Exemplo n.º 11
0
def calendar_time_frequency(place, user):
    """Number of visits of *user* to *place* with at least one calendar
    record within +/- 300 seconds of the visit start."""
    calendar = get_table("calendar", metadata)
    start_times = [
        row[0] for row in connection.execute(
            select([visits_10min.c.time_start]).where(
                and_(visits_10min.c.userid == user,
                     visits_10min.c.placeid == place))).fetchall()
    ]
    matches = 0.0
    for t0 in start_times:
        nearby = select([records.c.db_key]).where(
            and_(records.c.userid == user,
                 records.c.time >= t0 - 300,
                 records.c.time <= t0 + 300,
                 records.c.type == 'calendar')).alias("user_records")
        if connection.execute(nearby).rowcount != 0:
            matches += 1
    return matches
Exemplo n.º 12
0
def charging_and_silent_frequency(place, user):
    """Average fraction of 'sys' records per visit in which the phone
    was charging / the ringer was silent, over visits of *user* to
    *place*.

    Returns (charging_fraction, silent_fraction).
    Returns (0, 0) when the user has no 'sys' records or no visits (the
    original raised ZeroDivisionError when there were no visits).
    """
    # Renamed local from `sys` to avoid shadowing the stdlib module name.
    sys_table = get_table("sys", metadata)
    visit_times = select([visits_10min.c.time_start, visits_10min.c.time_end]).where(and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1]) for t in connection.execute(visit_times).fetchall()]
    charging = silent = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(select([records.c.db_key, records.c.time]).where(and_(records.c.userid == user, records.c.type == 'sys'))).fetchall()]
    num_visits = len(visit_times)
    # Guard both empty cases before any division.
    if len(user_records) == 0 or num_visits == 0:
        return 0, 0
    for start, end in visit_times:
        filtered = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered) == 0:
            continue
        n_records = len(filtered) * 1.0
        # Fraction of this visit's records that match each condition.
        charging += (connection.execute(select([func.count(sys_table.c.db_key)]).where(and_(sys_table.c.charging != 0, sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / n_records
        silent += (connection.execute(select([func.count(sys_table.c.db_key)]).where(and_(sys_table.c.ring == 'silent', sys_table.c.db_key.in_(filtered)))).fetchall()[0][0]) / n_records
    return (charging / num_visits, silent / num_visits)
Exemplo n.º 13
0
def application_frequency_and_variety(place, user):
    """Average hourly application-usage frequency across visits of
    *user* to *place*, and the number of distinct applications used.

    Returns (average_frequency, distinct_app_count).
    Returns (0, 0) when there are no visits (the original raised
    ZeroDivisionError in that case).
    """
    application = get_table("application", metadata)
    visit_times = select([visits_10min.c.time_start, visits_10min.c.time_end]).where(and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1]) for t in connection.execute(visit_times).fetchall()]
    # Guard: avoid dividing by zero when there are no visits.
    if len(visit_times) == 0:
        return 0, 0
    average_freq = 0.0
    applications = set()
    user_records = [(r[0], r[1]) for r in connection.execute(select([records.c.db_key, records.c.time]).where(and_(records.c.userid == user, records.c.type == 'app'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        # Usage count per application uid within this visit.
        # (Renamed from the misleading `calendar_entry_for_user`.)
        usage_query = select([func.count(application.c.uid), application.c.uid]).where(application.c.db_key.in_(filtered_keys)).group_by(application.c.uid)
        times_used = 0.0
        for use_count, uid in connection.execute(usage_query).fetchall():
            times_used += use_count
            applications.add(uid)
        # Normalize by the visit duration in hours.
        average_freq += (times_used / ((end - start) / 3600.0))
    return average_freq / len(visit_times), len(applications)
Exemplo n.º 14
0
def communication_frequency(place, user):
    """Hourly rates of outgoing voice calls, missed voice calls, and
    outgoing text messages during visits of *user* to *place*, averaged
    over visits.

    Returns (callout_rate, missed_call_rate, textout_rate).
    Returns (0, 0, 0) when the user has no 'call_log' records or no
    visits (the original raised ZeroDivisionError when there were no
    visits).
    """
    calllog = get_table("calllog", metadata)
    visit_times = select([visits_10min.c.time_start, visits_10min.c.time_end]).where(and_(visits_10min.c.userid == user, visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1]) for t in connection.execute(visit_times).fetchall()]
    callout = missed_call = textout = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(select([records.c.db_key, records.c.time]).where(and_(records.c.userid == user, records.c.type == 'call_log'))).fetchall()]
    num_visits = len(visit_times)
    # Guard both empty cases before any division.
    if len(user_records) == 0 or num_visits == 0:
        return 0, 0, 0
    for start, end in visit_times:
        filtered = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered) == 0:
            continue
        # Visit duration in hours; each count below is normalized by it.
        stay_time = (end - start) / 3600.0
        callout += (connection.execute(select([func.count(calllog.c.db_key)]).where(and_(calllog.c.description == "Voice call", calllog.c.direction == 'Outgoing', calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
        missed_call += (connection.execute(select([func.count(calllog.c.db_key)]).where(and_(calllog.c.description == "Voice call", calllog.c.direction == 'Missed call', calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
        textout += (connection.execute(select([func.count(calllog.c.db_key)]).where(and_(calllog.c.description == "Short message", calllog.c.direction == 'Outgoing', calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
    return (callout / num_visits, missed_call / num_visits, textout / num_visits)
Exemplo n.º 15
0
def wlan_frequency_and_variety(place, user):
    """Average number of distinct WLAN networks seen per visit of *user*
    to *place*, plus a variation score: the mean pairwise Jaccard
    similarity between the network sets of different visits.

    Returns (average_network_count_per_visit, variation).
    Returns (0, 0) when the user has no visits to the place (the
    original raised ZeroDivisionError in that case).
    """
    wlanrelation = get_table("wlanrelation", metadata)
    visit_times_query = select([visits_10min.c.time_start,
                                visits_10min.c.time_end]).where(
                                    and_(visits_10min.c.userid == user,
                                         visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times_query).fetchall()]
    # Guard: avoid dividing by zero when there are no visits.
    if len(visit_times) == 0:
        return 0, 0
    network_visits = []   # one list of distinct network ids per visit
    network_count = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'wlan'))).fetchall()]
    for start, end in visit_times:
        filtered_keys = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered_keys) == 0:
            continue
        distinct_networks = [
            r[0] for r in connection.execute(
                select([wlanrelation.c.id_wnetworks]).where(
                    wlanrelation.c.db_key.in_(
                        filtered_keys)).distinct()).fetchall()
        ]
        if len(distinct_networks) == 0:
            continue
        network_count += len(distinct_networks)
        network_visits.append(distinct_networks)
    if len(network_visits) < 2:
        # Fewer than two visits with networks: no pairwise comparison possible.
        variation = 0
    else:
        import itertools
        count = 0.0
        variation = 0.0
        for a, b in itertools.permutations(network_visits, 2):
            # Jaccard similarity between the two visits' network sets.
            variation += intersect_len(a, b) / union_len(a, b)
            count += 1
        variation /= count
    return network_count / len(visit_times), variation
Exemplo n.º 16
0
def extract_places(places_location, label_id, restrict_to, attribute='label'):
    """Return (latitude, longitude) float pairs for places matching
    *label_id*, restricted to the place ids in *restrict_to*.

    With attribute='label' the match is on the place's integer label;
    otherwise *attribute* names a demographics column and the places
    table is joined to demographics on userid before matching.
    """
    if len(restrict_to) == 0:
        return []
    lat = places_location.c.latitude
    lon = places_location.c.longitude
    id_filter = places_location.c.id.in_(restrict_to)
    if attribute == 'label':
        lat_long_query = select([lat, lon]).where(
            places_location.c.place_label_int == label_id).where(id_filter)
    else:
        demographics = get_table('demographics', metadata)
        joined_t = demographics.join(
            places_location, places_location.c.userid == demographics.c.userid)
        lat_long_query = select([lat, lon]).where(
            demographics.c[attribute] == label_id).where(
                id_filter).select_from(joined_t)

    rows = connection.execute(lat_long_query).fetchall()
    return [(float(row[0]), float(row[1])) for row in rows]
Exemplo n.º 17
0
def communication_frequency(place, user):
    """Hourly rates of outgoing voice calls, missed voice calls, and
    outgoing text messages during visits of *user* to *place*, averaged
    over visits.

    Returns (callout_rate, missed_call_rate, textout_rate).
    Returns (0, 0, 0) when the user has no 'call_log' records or no
    visits (the original raised ZeroDivisionError when there were no
    visits).
    """
    calllog = get_table("calllog", metadata)
    visit_times_query = select([visits_10min.c.time_start,
                                visits_10min.c.time_end]).where(
                                    and_(visits_10min.c.userid == user,
                                         visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times_query).fetchall()]
    callout = missed_call = textout = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user,
                 records.c.type == 'call_log'))).fetchall()]
    num_visits = len(visit_times)
    # Guard both empty cases before any division.
    if len(user_records) == 0 or num_visits == 0:
        return 0, 0, 0
    for start, end in visit_times:
        filtered = [r[0] for r in user_records if start <= r[1] <= end]
        if len(filtered) == 0:
            continue
        # Visit duration in hours; each count below is normalized by it.
        stay_time = (end - start) / 3600.0
        callout += (connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Voice call",
                     calllog.c.direction == 'Outgoing',
                     calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
        missed_call += (connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Voice call",
                     calllog.c.direction == 'Missed call',
                     calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
        textout += (connection.execute(
            select([func.count(calllog.c.db_key)]).where(
                and_(calllog.c.description == "Short message",
                     calllog.c.direction == 'Outgoing',
                     calllog.c.db_key.in_(filtered)))).fetchall()[0][0]) / stay_time
    return (callout / num_visits, missed_call / num_visits,
            textout / num_visits)
Exemplo n.º 18
0
def media_usage_frequency(place, user):
    """Total media-play events per hour of stay, summed over all visits
    of *user* to *place*.

    Returns 0 when the user has no 'media_play' records at all.
    NOTE(review): despite the name, the sum is not divided by the number
    of visits, and a zero-length visit (end == start) would divide by
    zero -- confirm whether visits_10min guarantees end > start.
    """
    mediaplay = get_table("mediaplay", metadata)
    visit_times = select([visits_10min.c.time_start,
                          visits_10min.c.time_end]).where(
                              and_(visits_10min.c.userid == user,
                                   visits_10min.c.placeid == place))
    visit_times = [(t[0], t[1])
                   for t in connection.execute(visit_times).fetchall()]
    average_freq = 0.0
    user_records = [(r[0], r[1]) for r in connection.execute(
        select([records.c.db_key, records.c.time]).where(
            and_(records.c.userid == user, records.c.type ==
                 'media_play'))).fetchall()]
    if len(user_records) == 0:
        return 0
    for start, end in visit_times:
        # Media-play records that fall inside this visit interval.
        filtered = [
            r[0] for r in user_records if r[1] >= start and r[1] <= end
        ]
        times_used = len(filtered)
        # Normalize the event count by the visit duration in hours.
        average_freq += (times_used / ((end - start) / 3600.0))
    return average_freq
Exemplo n.º 19
0
def extract_places(places_location, label_id, restrict_to, attribute="label"):
    """Return (lat, lon) float pairs of places matching *label_id*,
    limited to place ids in *restrict_to*.

    attribute="label" matches the place label column; any other value is
    taken as a demographics column name and matched after joining the
    places table to demographics on userid.
    """
    if len(restrict_to) == 0:
        return []
    coords = [places_location.c.latitude, places_location.c.longitude]
    in_restricted = places_location.c.id.in_(restrict_to)
    if attribute == "label":
        lat_long_query = select(coords).where(
            places_location.c.place_label_int == label_id).where(in_restricted)
    else:
        demographics = get_table("demographics", metadata)
        joined_t = demographics.join(
            places_location,
            places_location.c.userid == demographics.c.userid)
        lat_long_query = select(coords).where(
            demographics.c[attribute] == label_id).where(
                in_restricted).select_from(joined_t)

    rows = connection.execute(lat_long_query).fetchall()
    return [(float(latitude), float(longitude)) for latitude, longitude in rows]
Exemplo n.º 20
0
import sys
import csv
import datetime
import math
from spams.db.utils import setup_database, get_table
from sqlalchemy.sql import select
from sqlalchemy import and_, func
from sklearn.neighbors import DistanceMetric

# Module-level database handles shared by the feature functions below.
metadata, connection = setup_database()
places_location = get_table("places_location", metadata)
visits_10min = get_table("visits_10min", metadata)
records = get_table("records", metadata)


def relative_frequency(place, user):
    """Fraction of the user's visits (rows in visits_10min) that were to
    *place*.

    Returns 0.0 when the user has no recorded visits at all (the
    original raised ZeroDivisionError in that case).
    """
    count_place = connection.execute(
        select([visits_10min.c.userid, visits_10min.c.placeid]).where(
            and_(visits_10min.c.userid == user,
                 visits_10min.c.placeid == place))).rowcount
    count_all_places = connection.execute(
        select([visits_10min.c.userid
                ]).where(visits_10min.c.userid == user)).rowcount
    # Guard against users with no visits to avoid dividing by zero.
    if count_all_places == 0:
        return 0.0
    return (count_place * 1.0) / (count_all_places * 1.0)


def distance_from_most_visited_place(place, user):
    """Intended to compute the distance of *place* from the user's most
    visited place.

    NOTE(review): this definition appears truncated in the scrape -- it
    builds ``most_visited_places`` (place ids ordered by visit count,
    most visited first) but never uses it and has no return statement.
    """
    # Visit counts per place for this user, most visited first.
    q = select([func.count(), visits_10min.c.placeid
                ]).where(visits_10min.c.userid == user).group_by(
                    visits_10min.c.placeid).order_by(func.count().desc())
    most_visited_places = [r[1] for r in connection.execute(q).fetchall()]
Exemplo n.º 21
0
import datetime
import math
from spams.db.utils import setup_database, get_table
from sqlalchemy.sql import select
from sqlalchemy import and_, func
from sklearn.neighbors import DistanceMetric
import numpy as np
from itertools import izip
from extract_features import write_features_to_csv

import logging
# Log everything (DEBUG and up) to a file.
logging.basicConfig(filename='motion.log',level=logging.DEBUG)


# Module-level database handles shared by the functions below.
metadata, connection = setup_database()
places_location = get_table("places_location", metadata)
visits_10min = get_table("visits_10min", metadata)
records = get_table("records", metadata)
accel = get_table("accel", metadata)

def get_features(data):
    """Compute accelerometer features from an iterable of (x, y, z)
    sample triples.

    NOTE(review): this definition appears truncated in the scrape -- it
    computes per-axis means and variances and the mean magnitude but has
    no return statement in the visible code.  Uses ``izip`` (Python 2).
    """
    # Split the samples into per-axis series.
    X = [d[0] for d in data] 
    Y = [d[1] for d in data]
    Z = [d[2] for d in data]
    x_mean = np.mean(X)
    y_mean = np.mean(Y)
    z_mean = np.mean(Z)
    x_var  = np.var(X)
    y_var =  np.var(Y)
    z_var =  np.var(Z)
    # Mean Euclidean norm of the acceleration vectors.
    mean_magnitude = np.mean([math.sqrt(x*x + y*y +z*z) for (x,y,z) in izip(X,Y,Z)]) 
Exemplo n.º 22
0
import csv

from sqlalchemy.sql import select, and_

from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection


if __name__ == "__main__":
    # Script (Python 2): for every distinct (userid, placeid) pair, build
    # queries for up to five 10-minute and 20-minute visit intervals.
    # NOTE(review): the loop body appears truncated in the scrape -- the
    # queries are constructed but never executed in the visible code.
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gpswlan = get_table("gpswlan", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    places = get_table("places", metadata)
    # All distinct (userid, placeid) pairs in the places table.
    all_places = select([places.c.userid, places.c.placeid]).distinct()
    print all_places
    all_places_r = connection.execute(all_places)
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print all_places_r.rowcount
    count = 0
    for userid, placeid in all_places_r:
        # Per-place accumulators, keyed by (userid, placeid).
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        # Up to five visit intervals per duration bucket.
        ten_places = select([ten_min_visits.c.time_start, ten_min_visits.c.time_end], and_(ten_min_visits.c.userid == userid, ten_min_visits.c.placeid == placeid)).limit(5)
        twenty_places = select([twenty_min_visits.c.time_start, twenty_min_visits.c.time_end], and_(twenty_min_visits.c.userid == userid, twenty_min_visits.c.placeid == placeid)).limit(5)
Exemplo n.º 23
0
import csv

from sqlalchemy.sql import select, and_

from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection


if __name__ == "__main__":
    # Script (Python 2): read (userid, placeid) pairs from the
    # space-delimited file "output", then build a query for up to fifty
    # 10-minute visit intervals per pair.  NOTE(review): the loop body
    # appears truncated in the scrape -- the query is constructed but
    # never executed in the visible code.
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gps = get_table("gps", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    all_places_r = []
    # Each row of "output" holds a userid and a placeid.
    with open("output", "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for r in reader:
            all_places_r.append((r[0], r[1]))
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print len(all_places_r)
    count = 0
    for userid, placeid in all_places_r:
        # Per-place accumulators, keyed by (userid, placeid).
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        ten_places = select([ten_min_visits.c.time_start, ten_min_visits.c.time_end], and_(ten_min_visits.c.userid == userid, ten_min_visits.c.placeid == placeid)).order_by(ten_min_visits.c.time_start).limit(50)
Exemplo n.º 24
0
def plot_demographics():
    """Plot distribution bar charts from the demographics table.

    Produces four categorical plots (gender, age group, employment status,
    phone bill) and one derived plot of family size, each saved as a PNG
    under /local/thesis/plots.
    """
    metadata, connection = setup_database()
    demographics = get_table("demographics", metadata)

    def _plot_counts(column, mapping, xlabel, title, filename,
                     wrap_ticks=False, echo=False):
        # Count rows per category, drop NULL categories, sort by raw category
        # value and draw a bar chart of the counts.
        query = select([column, func.count(column)]).group_by(column)
        result = connection.execute(query).fetchall()
        if echo:
            # Kept from the original: dump the raw rows for debugging.
            print(result)
        result = sorted([r for r in result if r[0] is not None],
                        key=lambda x: x[0])
        vals = [r[1] for r in result]
        x_ticks = [mapping[r[0]] for r in result]
        if wrap_ticks:
            # Long labels are wrapped so they fit under the bars.
            x_ticks = [textwrap.fill(text, 10) for text in x_ticks]
        draw_barplot(vals,
                     x_ticks=x_ticks,
                     xlabel=xlabel,
                     ylabel="Count",
                     title=title,
                     save_as=os.path.join("/local", "thesis", "plots",
                                          filename),
                     width=0.35)

    _plot_counts(demographics.c.gender, GENDER_MAPPING,
                 "Gender", "Gender Distribution", "gender.png")
    _plot_counts(demographics.c.age_group, AGE_MAPPING,
                 "Age Group", "Age Distribution", "age.png")
    _plot_counts(demographics.c.working, WORKING_MAPPING,
                 "Employment Status", "Employment Status Distribution",
                 "working.png", wrap_ticks=True, echo=True)
    _plot_counts(demographics.c.phone_bill, BILL_MAPPING,
                 "Bill", "Bill Distribution", "bill.png")

    # Family size: sum the per-age-bracket member counts for each user and
    # plot how many users have each total.  Totals above 13 are treated as
    # outliers and skipped, as in the original.
    family_query = select([
        demographics.c.nb_12, demographics.c.nb_12_18, demographics.c.nb_18_30,
        demographics.c.nb_30_40, demographics.c.nb_40_50,
        demographics.c.nb_50_65, demographics.c.nb_65
    ])
    result = connection.execute(family_query).fetchall()
    totals = [
        sum([a for a in r if a is not None]) for r in result if r is not None
    ]
    distinct_totals = set(totals)
    print(distinct_totals)
    vals = []
    x_ticks = []
    for total in distinct_totals:
        if total > 13:
            continue
        x_ticks.append(total)
        vals.append(totals.count(total))
    draw_barplot(vals,
                 x_ticks=x_ticks,
                 xlabel="Number of members in family",
                 ylabel="Count",
                 title="Number of Family Members Distribution",
                 save_as=os.path.join("/local", "thesis", "plots",
                                      "family.png"),
                 width=0.35)
Exemplo n.º 25
0
def priors_with_kde(
    test, train, input_func=extract_places, attribute="label", return_predictions=True, method="dbscan"
):
    """Compute KDE-based prior scores over places for a given predictor.

    ``test`` and ``train`` are sequences of ((place, user), label) tuples.
    Each (place, user) pair is resolved to its ``places_location.id``.  One
    KDE is fitted per predictor value (per spatial cluster when
    ``method == "dbscan"``) and every place is scored against each KDE.

    Returns the arg-max predictor per test place when ``return_predictions``
    is True, otherwise a (train_scores, test_scores) pair of dicts keyed by
    (placeid, userid).
    """
    places_location = get_table("places_location", metadata)
    q = select([places_location.c.id])

    def _location_id(place, user):
        # Resolve a (place, user) pair to its surrogate id in places_location.
        return connection.execute(
            q.where(and_(places_location.c.placeid == place, places_location.c.userid == user))
        ).fetchall()[0][0]

    predictor_iterator = iterator_dict[attribute]
    if attribute == "label":
        train_tuples = [(_location_id(place, user), label) for ((place, user), label) in train]
    elif attribute == "time":
        # Pair each training place with each of 12 time-slot indices.
        train_tuples = []
        input_func = extract_visits
        for (place, user), _ in train:
            loc_id = _location_id(place, user)
            for i in range(0, 12):
                train_tuples.append((loc_id, i))
    else:
        # Predictor is a demographics column; users without exactly one
        # demographics row are recorded as None and dropped below.
        input_func = extract_visits
        demographics = get_table("demographics", metadata)
        demo_q = select([demographics.c[attribute]])
        train_tuples = []
        for (place, user), _ in train:
            loc_id = _location_id(place, user)
            demo_r = connection.execute(demo_q.where(demographics.c.userid == user))
            if demo_r.rowcount == 1:
                train_tuples.append((loc_id, demo_r.fetchall()[0][0]))
            else:
                train_tuples.append((loc_id, None))

    test = [_location_id(place, user) for ((place, user), _) in test]
    train_predictor_dict = defaultdict(list)
    for loc_id, predictor in train_tuples:
        if predictor is not None:
            train_predictor_dict[predictor].append(loc_id)

    test_scores = {}
    train_scores = {}

    if method == "dbscan":
        place_group_dict, group_place_dict = perform_db_scan(places_location)
        for g in group_place_dict.keys():
            estimators = {}
            group_test = [p for p in test if place_group_dict[p] == g]
            for predictor in predictor_iterator:
                places_in_group = [p for p in train_predictor_dict[predictor] if place_group_dict[p] == g]
                if len(places_in_group) == 0:
                    continue
                training_set = input_func(places_location, predictor, places_in_group, attribute)
                xy = np.array(training_set)
                # Convert to radians
                xy *= np.pi / 180.0
                estimators[predictor] = train_kde(xy, predictor)
            for p in group_test:
                test_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
            for p in group_place_dict[g]:
                train_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
    elif method == "simple":
        estimators = {}
        for predictor in predictor_iterator:
            places_predictor = train_predictor_dict[predictor]
            if len(places_predictor) == 0:
                continue
            # BUG FIX: these lines were indented under the `continue` above
            # and therefore never executed, leaving `estimators` empty for
            # the "simple" method.
            training_set = input_func(places_location, predictor, places_predictor, attribute)
            xy = np.array(training_set)
            # Convert to radians
            xy *= np.pi / 180.0
            estimators[predictor] = train_kde(xy, predictor)
        for p in test:
            test_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)
        for p, _ in train_tuples:
            train_scores[p] = get_scores(places_location, p, predictor_iterator, estimators)

    if return_predictions:
        # The best-scoring predictor value per test place, in test order.
        return [max(test_scores[place], key=lambda x: x[1])[0] for place in test]

    def _rekey_by_place_user(scores):
        # Re-key {location_id: scores} by (placeid, userid), keeping only the
        # numeric score of each (predictor, score) pair.
        rekeyed = {}
        for p, score in scores.items():
            row = connection.execute(
                select([places_location.c.placeid, places_location.c.userid]).where(places_location.c.id == p)
            ).fetchall()[0]
            rekeyed[tuple(row)] = [s[1] for s in score]
        return rekeyed

    return _rekey_by_place_user(train_scores), _rekey_by_place_user(test_scores)
Exemplo n.º 26
0
# Script: query Foursquare for venues within 50 m of every place in
# places_location and write them to nearby_places.csv.
# NOTE(review): `csv` is used below (csv.writer) but never imported in this
# visible snippet — confirm an `import csv` exists above, otherwise this
# raises NameError at the csv.writer call.  The venue loop body is not
# visible in this chunk.
import foursquare
from sqlalchemy.sql import select

from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection
from spams.config import FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET

if __name__ == "__main__":
    client = foursquare.Foursquare(client_id=FOURSQUARE_CLIENT_ID,
                                   client_secret=FOURSQUARE_CLIENT_SECRET)
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    output_file = open("nearby_places.csv", "w")
    writer = csv.writer(output_file)

    places_location = get_table("places_location", metadata)
    query = select([
        places_location.c.latitude, places_location.c.longitude,
        places_location.c.userid, places_location.c.placeid,
        places_location.c.place_label
    ])
    results = connection.execute(query)
    for r in results:
        # Foursquare expects "lat,long" as a single comma-separated string.
        ll = str(r[0]) + "," + str(r[1])
        venues = client.venues.search(params={
            'll': ll,
            'radius': 50,
            'intent': 'browse'
        })
        to_write = []
        for venue in venues["venues"]:
Exemplo n.º 27
0
FILENAME = "places_with_mapping.csv"


# Script: create the places_location table (if absent) and bulk-load it from
# FILENAME, one insert per CSV row.
if __name__ == "__main__":
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    table_name = 'places_location'
    if not engine.dialect.has_table(connection, table_name):
        # BUG FIX: the original was missing the comma after the 'id' column
        # (a SyntaxError) and gave that column no type, which SQLAlchemy
        # requires for a primary key; Integer added.
        places_location_mapping = Table(table_name, metadata,
                                        Column('id', Integer, primary_key=True),
                                        Column('userid', Integer),
                                        Column('placeid', Integer),
                                        Column('latitude', Numeric),
                                        Column('latitude_std', Numeric),
                                        Column('longitude', Numeric),
                                        Column('longitude_std', Numeric),
                                        Column('method', String),
                                        Column('place_label', String),
                                        Column('place_label_int', Integer))
        metadata.create_all(engine)
        print("Created Table")
    else:
        print("Table exists")
    places_location_mapping = get_table(table_name, metadata)
    with open(FILENAME) as f:
        reader = csv.reader(f)
        for row in reader:
            ins = places_location_mapping.insert().values(tuple(row))
            print(connection.execute(ins))
Exemplo n.º 28
0
MAPPING = "mapping"
OUTPUT = "places_with_mapping.csv"


# Script: append the human-readable label name and the label itself to every
# row of the PLACES file, looking labels up in the places table.
if __name__ == "__main__":
    places_fh = open(PLACES, "r")
    mapping_fh = open(MAPPING, "r")
    out_fh = open(OUTPUT, "w")
    places_reader = csv.reader(places_fh)
    # label-int -> label-name lookup loaded from the mapping file.
    label_names = {}
    for mapping_row in csv.reader(mapping_fh):
        label_names[int(mapping_row[0])] = mapping_row[1]
    writer = csv.writer(out_fh)
    engine = create_postgres_engine()
    connection = get_connection(engine)
    metadata = get_metadata(engine)
    places = get_table("places", metadata)
    rows = connection.execute(
        select([places.c.userid, places.c.placeid, places.c.place_label])).fetchall()
    # (userid, placeid) -> place_label lookup from the database.
    place_labels = dict(((row[0], row[1]), row[2]) for row in rows)
    for place_row in places_reader:
        label = place_labels[(int(place_row[0]), int(place_row[1]))]
        place_row.append(label_names[label])
        place_row.append(label)
        writer.writerow(place_row)
    places_fh.close()
    mapping_fh.close()
    out_fh.close()
Exemplo n.º 29
0
    pass    


def colors_map():
    """Return a list of NUM_COLORS colours sampled from the gist_rainbow map."""
    NUM_COLORS = 10
    cmap = plt.get_cmap('gist_rainbow')
    return [cmap(10. * idx / NUM_COLORS) for idx in range(NUM_COLORS)]



if __name__ == "__main__":
    # Script: collect latitude/longitude of all places labelled 'Work';
    # the rest of this script is not visible in this chunk.
    metadata, connection = setup_database()
    places_location = get_table("places_location", metadata)
    restrict_to = []
    # One place id per line; presumably the biggest spatial cluster — note
    # restrict_to is not used in the visible portion.
    with open("biggest_cluster_places") as f:
        for line in f:
            line = line.strip()
            restrict_to.append(line)

    # Coordinates, label and id of every place labelled 'Work'.
    lat_long_query = select([places_location.c.latitude, places_location.c.longitude, places_location.c.place_label, places_location.c.id]).where(places_location.c.place_label == 'Work')
    # Bounding box over ALL places (not just 'Work').
    queries = select([func.min(places_location.c.latitude), func.max(places_location.c.latitude), func.min(places_location.c.longitude), func.max(places_location.c.longitude)])
    min_lat, max_lat, min_long, max_long = connection.execute(queries).fetchall()[0]
    results = connection.execute(lat_long_query).fetchall()
    places_with_labels = [(r[2],(float(r[0]), float(r[1]))) for r in results]
    places = [r[3] for r in results]
    labels_places_dict = defaultdict(list)
    labels_places_count = defaultdict(int)
    places_with_labels = sorted(places_with_labels, key = lambda x:x[0])
Exemplo n.º 30
0
# Script: iterate every distinct (userid, placeid) pair in the places table
# and query its ten-minute visits.  NOTE(review): Python 2 syntax; the loop
# body (and the end of the select expression) is cut off in this chunk.
import csv

from sqlalchemy.sql import select, and_

from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection

if __name__ == "__main__":
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gpswlan = get_table("gpswlan", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    places = get_table("places", metadata)
    all_places = select([places.c.userid, places.c.placeid]).distinct()
    print all_places
    all_places_r = connection.execute(all_places)
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print all_places_r.rowcount
    count = 0
    for userid, placeid in all_places_r:
        # Per-place accumulators.
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        ten_places = select(
            [ten_min_visits.c.time_start, ten_min_visits.c.time_end],
            and_(ten_min_visits.c.userid == userid,
Exemplo n.º 31
0
def priors_with_kde(test,
                    train,
                    input_func=extract_places,
                    attribute='label',
                    return_predictions=True,
                    method='dbscan'):
    """Compute KDE-based prior scores over places for a given predictor.

    ``test`` and ``train`` are sequences of ((place, user), label) tuples;
    each (place, user) pair is resolved to its ``places_location.id``.  One
    KDE is fitted per predictor value (per spatial cluster for 'dbscan'),
    and every place is scored against each KDE.

    Returns the arg-max predictor per test place when ``return_predictions``
    is True, otherwise a (train_scores, test_scores) pair of dicts keyed by
    (placeid, userid).
    """
    places_location = get_table("places_location", metadata)
    q = select([places_location.c.id])

    def _resolve_id(place, user):
        # (place, user) -> surrogate id in places_location.
        rows = connection.execute(
            q.where(
                and_(places_location.c.placeid == place,
                     places_location.c.userid == user))).fetchall()
        return rows[0][0]

    predictor_iterator = iterator_dict[attribute]
    if attribute == 'label':
        train_tuples = [(_resolve_id(place, user), label)
                        for ((place, user), label) in train]
    elif attribute == 'time':
        # Pair each training place with each of 12 time-slot indices.
        train_tuples = []
        input_func = extract_visits
        for (place, user), _ in train:
            loc_id = _resolve_id(place, user)
            for i in range(0, 12):
                train_tuples.append((loc_id, i))
    else:
        # Predictor is a demographics column; users without exactly one
        # demographics row get None and are dropped below.
        input_func = extract_visits
        demographics = get_table("demographics", metadata)
        demo_q = select([demographics.c[attribute]])
        train_tuples = []
        for (place, user), _ in train:
            loc_id = _resolve_id(place, user)
            demo_r = connection.execute(
                demo_q.where(demographics.c.userid == user))
            if demo_r.rowcount == 1:
                train_tuples.append((loc_id, demo_r.fetchall()[0][0]))
            else:
                train_tuples.append((loc_id, None))

    test = [_resolve_id(place, user) for ((place, user), _) in test]
    train_predictor_dict = defaultdict(list)
    for loc_id, predictor in train_tuples:
        if predictor is not None:
            train_predictor_dict[predictor].append(loc_id)

    test_scores = {}
    train_scores = {}

    if method == 'dbscan':
        place_group_dict, group_place_dict = perform_db_scan(places_location)
        for g in group_place_dict.keys():
            estimators = {}
            group_test = [p for p in test if place_group_dict[p] == g]
            for predictor in predictor_iterator:
                places_in_group = [
                    p for p in train_predictor_dict[predictor]
                    if place_group_dict[p] == g
                ]
                if len(places_in_group) == 0:
                    continue
                training_set = input_func(places_location, predictor,
                                          places_in_group, attribute)
                xy = np.array(training_set)
                # Convert to radians
                xy *= np.pi / 180.
                estimators[predictor] = train_kde(xy, predictor)
            for p in group_test:
                test_scores[p] = get_scores(places_location, p,
                                            predictor_iterator, estimators)
            for p in group_place_dict[g]:
                train_scores[p] = get_scores(places_location, p,
                                             predictor_iterator, estimators)
    elif method == 'simple':
        estimators = {}
        for predictor in predictor_iterator:
            places_predictor = train_predictor_dict[predictor]
            if len(places_predictor) == 0:
                continue
            # BUG FIX: these lines were indented under the `continue` above
            # and therefore never executed, leaving `estimators` empty for
            # the 'simple' method.
            training_set = input_func(places_location, predictor,
                                      places_predictor, attribute)
            xy = np.array(training_set)
            # Convert to radians
            xy *= np.pi / 180.
            estimators[predictor] = train_kde(xy, predictor)
        for p in test:
            test_scores[p] = get_scores(places_location, p, predictor_iterator,
                                        estimators)
        for p, _ in train_tuples:
            train_scores[p] = get_scores(places_location, p,
                                         predictor_iterator, estimators)

    if return_predictions:
        # Best-scoring predictor value per test place, in test order.
        prior_labels = []
        for place in test:
            predicted_value, _ = max(test_scores[place], key=lambda x: x[1])
            prior_labels.append(predicted_value)
        return prior_labels

    def _rekey(scores):
        # {location_id: scores} -> {(placeid, userid): [score, ...]}.
        out = {}
        for p, score in scores.items():
            row = connection.execute(
                select([places_location.c.placeid,
                        places_location.c.userid]).where(
                            places_location.c.id == p)).fetchall()[0]
            out[tuple(row)] = [s[1] for s in score]
        return out

    return _rekey(train_scores), _rekey(test_scores)
Exemplo n.º 32
0
# Script: for each (userid, placeid) pair listed in the "output" file, query
# its ten-minute visits.  NOTE(review): Python 2 syntax; the loop body (and
# the end of the select expression) is cut off in this chunk.
import csv

from sqlalchemy.sql import select, and_

from spams.db.utils import create_postgres_engine, get_table, get_metadata, get_connection

if __name__ == "__main__":
    engine = create_postgres_engine()
    metadata = get_metadata(engine)
    connection = get_connection(engine)
    records = get_table("records", metadata)
    gps = get_table("gps", metadata)
    ten_min_visits = get_table("visits_10min", metadata)
    twenty_min_visits = get_table("visits_20min", metadata)
    all_places_r = []
    # Each row of "output" is space-separated; the first two fields are
    # userid and placeid (kept as strings here).
    with open("output", "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for r in reader:
            all_places_r.append((r[0], r[1]))
    lat_long = {}
    print "Starting to iterate over all places"
    no_wifi_visits_10 = {}
    no_wifi_visits_20 = {}
    print len(all_places_r)
    count = 0
    for userid, placeid in all_places_r:
        # Per-place accumulators.
        lat_long[(userid, placeid)] = []
        no_wifi_visits_10[(userid, placeid)] = []
        no_wifi_visits_20[(userid, placeid)] = []
        ten_places = select(
            [ten_min_visits.c.time_start, ten_min_visits.c.time_end],
Exemplo n.º 33
0
OUTPUT = "places_with_mapping.csv"

# Script: append the human-readable label name and the label itself to every
# row of the PLACES file, using the places table as the label source.
if __name__ == "__main__":
    places_input = open(PLACES, "r")
    mapping_input = open(MAPPING, "r")
    result_file = open(OUTPUT, "w")
    places_rows = csv.reader(places_input)
    mapping_rows = csv.reader(mapping_input)
    # label-int -> label-name lookup from the mapping file.
    name_for_label = {}
    for entry in mapping_rows:
        name_for_label[int(entry[0])] = entry[1]
    result_writer = csv.writer(result_file)
    engine = create_postgres_engine()
    connection = get_connection(engine)
    metadata = get_metadata(engine)
    places = get_table("places", metadata)
    label_rows = connection.execute(
        select([places.c.userid, places.c.placeid,
                places.c.place_label])).fetchall()
    # (userid, placeid) -> place_label lookup from the database.
    label_for_place = {}
    for rec in label_rows:
        label_for_place[(rec[0], rec[1])] = rec[2]
    for out_row in places_rows:
        label = label_for_place[(int(out_row[0]), int(out_row[1]))]
        out_row.append(name_for_label[label])
        out_row.append(label)
        result_writer.writerow(out_row)
    places_input.close()
    mapping_input.close()
    result_file.close()