def json_to_database(eng):
    # eng is accepted for API compatibility; the ORM session manages its own
    # connection, so the unused eng.connect() from the original is dropped
    try:
        with open('pokedex.json') as f:
            json_data = json.load(f)
        r = Review(int(json_data['id']), json_data['name_english'],
                   json_data['name_japanese'],
                   json_data['name_chinese'], json_data['name_french'],
                   json_data['type'], int(json_data['base_HP']),
                   int(json_data['base_Attack']),
                   int(json_data['base_Defense']),
                   int(json_data['base_Sp_Attack']),
                   int(json_data['base_Sp_Defense']),
                   int(json_data['base_Speed']))
        # the original called Session.add on the class; instantiate a session
        session = Session()
        session.add(r)
        session.commit()
        return True

    except (ValueError, KeyError):
        return False
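
# A minimal, hypothetical call site for the loader above (not in the
# original); assumes connect_db exposes `engine` as in the later examples,
# and that Review and Session are already imported.
from connect_db import engine

if json_to_database(engine):
    print('pokedex.json loaded')
else:
    print('failed to parse pokedex.json')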
Example #2
from connect_db import Session
from data_models import User
import json

session = Session()
all_users = session.query(User.zipcode).all()

# deduplicate the zipcodes; keep them in a list so they stay JSON-serializable
zipcodes = list({elem[0] for elem in all_users})

with open('movie_user_zipcodes.json', 'w') as f:
    json.dump(zipcodes, f)
Example #3
from connect_db import engine, Session, Base
from product import Product, Details, Names
import json

Base.metadata.create_all(engine)
session = Session()

# widen names.value and switch it to utf8 so the Japanese and Chinese names
# fit (SQLAlchemy 1.x-style textual execution)
with engine.connect() as c:
    c.execute('ALTER TABLE names MODIFY '
              'value VARCHAR(100) '
              'CHARACTER SET utf8 '
              'COLLATE utf8_unicode_ci;')

with open('pokedex.json') as f:
    json_data = json.load(f)

for element in json_data:
    # concatenate the type list (e.g. ["Grass", "Poison"]) into one string
    type_of_product = ''.join(element['type'])

    product = Product(type_of_product)
    eng_name = Names("english", element['name']['english'], product)
    jpa_name = Names("japanese", element['name']['japanese'], product)
    chie_name = Names("chinese", element['name']['chinese'], product)
    fr_name = Names("french", element['name']['french'], product)

    detail = Details(element['base']['HP'], element['base']['Attack'],
                     element['base']['Defense'], element['base']['Sp. Attack'],
                     element['base']['Sp. Defense'], element['base']['Speed'],
                     product)

    # the original never persisted these objects; add the product and let the
    # Names/Details relationships cascade (assumed) before committing
    session.add(product)

session.commit()
Example #4
def main():
    session = Session()
    # load everything up front; filtering is done in pandas below
    all_users = session.query(User).all()
    all_movies = session.query(Movie).all()

    user_rating_counts = session.query(Rating.user_id,
                                       func.count(Rating.user_id)).group_by(
                                           Rating.user_id).all()

    # keep users with more than 60 ratings
    user_filtered = filter(lambda x: x[1] > 60, user_rating_counts)
    actual_users_index = [elem[0] for elem in user_filtered]

    actor_dict, director_dict, rated_dict, genre_dict = get_movie_dict(
        'movie_dict.json')
    #author_dict,publisher_dict=get_book_dict('book_dict.json')

    with open('movie_user_zipcodes.json', 'r') as f:
        zipcodes = json.load(f)
    zipcode_dict = dict(zip(zipcodes, range(len(zipcodes))))

    all_users_id = [elem.id for elem in all_users]
    all_users_data = [{
        'gender': elem.gender,
        'occupation': elem.occupation,
        'age': elem.age,
        'zipcode': elem.zipcode
    } for elem in all_users]

    all_users_df = pd.DataFrame(all_users_data, index=all_users_id)

    # occupations are already small integer codes, so no hashing dict is needed
    occu_dict_size = all_users_df.occupation.max() + 1
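    # sanity check (a sketch, not in the original): the embedding lookup
    # assumes occupation codes are non-negative integers
    assert (all_users_df.occupation >= 0).all()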

    all_users_df.gender = (all_users_df.gender == 'M').astype(int)
    all_users_df.zipcode = all_users_df.zipcode.apply(
        lambda x: zipcode_dict[x])

    user_ages = sorted(all_users_df.age.unique())
    # age is numeric, but each age bracket behaves like its own category,
    # so map it to an index instead of using it as a quantity
    age_dict = dict(zip(user_ages, range(len(user_ages))))

    all_users_df.age = all_users_df.age.apply(lambda x: age_dict[x])

    all_movies_id = [elem.id for elem in all_movies]
    all_movies_data = [{
        'year': elem.year,
        'actor': elem.actor,
        'title': elem.title,
        'rated': elem.rated,
        'director': elem.director,
        'genre': elem.genre
    } for elem in all_movies]

    all_movies_df = pd.DataFrame(all_movies_data, index=all_movies_id)

    all_movies_df.actor = all_movies_df.actor.apply(lambda x: actor_dict[x])
    all_movies_df.director = all_movies_df.director.apply(
        lambda x: director_dict[x])
    all_movies_df.rated = all_movies_df.rated.apply(lambda x: rated_dict[x])
    all_movies_df.genre = all_movies_df.genre.apply(lambda x: genre_dict[x])
    all_movies_df.year = all_movies_df.year - MOVIE_MIN_YEAR

    existing_movies_df = all_movies_df[all_movies_df.year < 1998 -
                                       MOVIE_MIN_YEAR]
    new_movies_df = all_movies_df[all_movies_df.year > 1997 - MOVIE_MIN_YEAR]

    #user_mask=np.random.rand(len(all_users_df)) < 0.8
    #user_existing=all_users_df[user_mask]
    #user_new=all_users_df[~user_mask]
    user_existing = all_users_df[all_users_df.index.isin(actual_users_index)]
    user_new = all_users_df[~all_users_df.index.isin(actual_users_index)]

    rating_existing = session.query(Rating).join(User).filter(
        User.id.in_(
            user_existing.index)).join(Movie).filter(Movie.year < 1998).all()
    #rating_exist_new=session.query(Rating).join(User).filter(User.id.in_(user_existing.index)).join(Movie).filter(Movie.year>1997).all()
    #rating_new_exist=session.query(Rating).join(User).filter(User.id.in_(user_new.index)).join(Movie).filter(Movie.year<1998).all()
    #rating_new_new=session.query(Rating).join(User).filter(User.id.in_(user_new.index)).join(Movie).filter(Movie.year>1997).all()
    '''
    train_genders=[1 if elem.user.gender=='M' else 0 for elem in rating_existing]
    train_occupations=[elem.user.occupation for elem in rating_existing]
    train_ages=[elem.user.age for elem in rating_existing]
    train_zipcodes=[all_users_df.loc[elem.user_id].zipcode for elem in rating_existing]
    train_actors=[all_movies_df.loc[elem.movie_id].actor for elem in rating_existing]
    train_directors=[all_movies_df.loc[elem.movie_id].director for elem in rating_existing]
    train_genres=[all_movies_df.loc[elem.movie_id].genre for elem in rating_existing]
    train_rateds=[all_movies_df.loc[elem.movie_id].rated for elem in rating_existing]

    train_labels=[(elem.rate-1)*0.25 for elem in rating_existing]
    '''

    rating_existing_group = [[] for _ in range(MAX_USER_ID + 1)]
    for rating in rating_existing:
        # cap each user at scenario_len support ratings plus query_len queries
        if len(rating_existing_group[
                rating.user_id]) < scenario_len + query_len:
            rating_existing_group[rating.user_id].append(rating)

    actual_users_index2 = [
        idx for idx, elem in enumerate(rating_existing_group)
        if len(elem) >= scenario_len + query_len
    ]

    dict_sizes = {
        'zipcode': len(zipcode_dict),
        'actor': len(actor_dict),
        'authdir': len(director_dict),
        'rated': len(rated_dict),
        'year': MOVIE_MAX_YEAR - MOVIE_MIN_YEAR + 1,
        'occu': occu_dict_size,
        'age': len(age_dict),
        'genre': len(genre_dict)
    }
    emb_sizes = {
        'zipcode': 100,
        'actor': 50,
        'authdir': 50,
        'rated': 5,
        'year': 15,
        'occu': 4,
        'age': 2,
        'genre': 15
    }

    global_model = MeluGlobal(dict_sizes, emb_sizes, 1)
    emb_input_size = sum(emb_sizes.values())
    local_model = MeluLocal(emb_input_size, [64, 32, 16, 4])

    print(global_model.summary())
    print(local_model.summary())
    utils.plot_model(global_model, 'global.png', True, expand_nested=True)
    utils.plot_model(local_model, 'local.png', True, expand_nested=True)

    USER_BATCH_SIZE = 128

    # task batch size should divide scenario length
    TASK_BATCH_SIZE = 20
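    # guard (a sketch, not in the original): .batch(TASK_BATCH_SIZE, True)
    # below drops the remainder, so the division must be exact
    assert scenario_len % TASK_BATCH_SIZE == 0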

    total_batch = len(actual_users_index2) // USER_BATCH_SIZE
    #remaining_users=len(actual_users_index2)%USER_BATCH_SIZE

    local_loss_fn = losses.MeanAbsoluteError()
    local_optimizer = optimizers.Adam(alpha)
    global_optimizer = optimizers.Adam(beta)
    #global_loss_fn=losses.MeanAbsoluteError()

    #local_model.compile(local_optimizer,local_loss_fn,[metrics.MeanAbsoluteError()])
    #global_model.compile(global_optimizer,global_loss_fn,[metrics.MeanAbsoluteError()])

    #local_model.save_weights('theta2.h5')
    local_model_weights = local_model.get_weights()

    # prepare training metric
    #val_metric=metrics.MeanAbsoluteError()
    for epoch in range(30):
        print('start epoch {}'.format(epoch))
        # previous training losses to decide early stopping
        # prev_train_loss  - epoch-1 loss
        # prev2_train_loss - epoch-2 loss
        # prev3_train_loss - epoch-3 loss
        if epoch > 19:
            prev3_train_loss = prev2_train_loss
            prev2_train_loss = prev_train_loss
            prev_train_loss = total_train_loss
        elif epoch == 19:
            prev2_train_loss = prev_train_loss
            prev_train_loss = total_train_loss
        elif epoch == 18:
            prev_train_loss = total_train_loss
        total_train_loss = 0
        for i in range(total_batch):
            print('user batch # {}'.format(i))
            users = [
                rating_existing_group[elem]
                for elem in actual_users_index2[i * USER_BATCH_SIZE:(i + 1) *
                                                USER_BATCH_SIZE]
            ]

            theta2_user_weights = []

            # calculate local weights per user
            for j, user in enumerate(users):
                #local_model.load_weights('theta2.h5')
                local_model.set_weights(local_model_weights)
                # [authdir,year,age,actor,rated,genre,occu,zipcode]
                user_data = [[
                    existing_movies_df.loc[elem.movie_id].director,
                    existing_movies_df.loc[elem.movie_id].year,
                    all_users_df.loc[elem.user_id].age,
                    existing_movies_df.loc[elem.movie_id].actor,
                    existing_movies_df.loc[elem.movie_id].rated,
                    existing_movies_df.loc[elem.movie_id].genre,
                    all_users_df.loc[elem.user_id].occupation,
                    all_users_df.loc[elem.user_id].zipcode
                ] for elem in user[:scenario_len]]
                label_data = [elem.rate for elem in user[:scenario_len]]
                train_dataset = tf.data.Dataset.from_tensor_slices(
                    (user_data, label_data)).batch(TASK_BATCH_SIZE, True)
                for (user_batch, label_batch) in train_dataset:
                    batch_emb_out = global_model(user_batch)
                    with tf.GradientTape() as tape:
                        logits = local_model(batch_emb_out)
                        local_loss = local_loss_fn(label_batch, logits)
                    local_grads = tape.gradient(local_loss,
                                                local_model.trainable_weights)
                    local_optimizer.apply_gradients(
                        zip(local_grads, local_model.trainable_weights))
                #local_model.save_weights('theta2_{}.h5'.format(j))
                theta2_user_weights.append(local_model.get_weights())
            # calculate gradients for each user
            theta1_grads = []
            theta1_losses = 0
            for j, user in enumerate(users):
                #local_model.load_weights('theta2_{}.h5'.format(j))
                local_model.set_weights(theta2_user_weights[j])
                user_query = [[
                    existing_movies_df.loc[elem.movie_id].director,
                    existing_movies_df.loc[elem.movie_id].year,
                    all_users_df.loc[elem.user_id].age,
                    existing_movies_df.loc[elem.movie_id].actor,
                    existing_movies_df.loc[elem.movie_id].rated,
                    existing_movies_df.loc[elem.movie_id].genre,
                    all_users_df.loc[elem.user_id].occupation,
                    all_users_df.loc[elem.user_id].zipcode
                ] for elem in user[scenario_len:]]
                label_data = [elem.rate for elem in user[scenario_len:]]
                train_dataset = tf.data.Dataset.from_tensor_slices(
                    (user_query, label_data)).batch(query_len)
                (query_batch, label_batch) = next(iter(train_dataset))
                with tf.GradientTape() as tape:
                    emb_out = global_model(query_batch)
                    logits = local_model(emb_out)
                    local_loss = local_loss_fn(label_batch, logits)
                    theta1_losses += local_loss.numpy()
                    # one query-set gradient per user: USER_BATCH_SIZE in total
                grad = tape.gradient(local_loss,
                                     global_model.trainable_weights)
                theta1_grads.append(grad)
            # average the per-user gradients and apply them to the embedding weights
            final_theta1_grad = []
            theta2_losses = 0
            for k in range(len(theta1_grads[0])):
                data = [elem[k] for elem in theta1_grads]
                final_data = tf.add_n(data) / USER_BATCH_SIZE
                final_theta1_grad.append(final_data)
            global_optimizer.apply_gradients(
                zip(final_theta1_grad, global_model.trainable_weights))

            # recompute per-user local gradients against the updated global theta1
            theta2_grads = []
            for j, user in enumerate(users):
                #local_model.load_weights('theta2_{}.h5'.format(j))
                # restart from the shared theta2 weights; resuming from the
                # per-user adapted weights (commented out below) is likely wrong
                #local_model.set_weights(theta2_user_weights[j])
                local_model.set_weights(local_model_weights)
                user_query = [[
                    existing_movies_df.loc[elem.movie_id].director,
                    existing_movies_df.loc[elem.movie_id].year,
                    all_users_df.loc[elem.user_id].age,
                    existing_movies_df.loc[elem.movie_id].actor,
                    existing_movies_df.loc[elem.movie_id].rated,
                    existing_movies_df.loc[elem.movie_id].genre,
                    all_users_df.loc[elem.user_id].occupation,
                    all_users_df.loc[elem.user_id].zipcode
                ] for elem in user[scenario_len:]]
                label_data = [elem.rate for elem in user[scenario_len:]]
                train_dataset = tf.data.Dataset.from_tensor_slices(
                    (user_query, label_data)).batch(query_len)
                (query_batch, label_batch) = next(iter(train_dataset))
                emb_out = global_model(query_batch)
                with tf.GradientTape() as tape:
                    logits = local_model(emb_out)
                    local_loss = local_loss_fn(label_batch, logits)
                    theta2_losses += local_loss.numpy()
                theta2_grads.append(
                    tape.gradient(local_loss, local_model.trainable_weights))
            # update local dense layer weights
            final_theta2_grad = []
            for k in range(len(theta2_grads[0])):
                data = [elem[k] for elem in theta2_grads]
                final_data = tf.add_n(data) / USER_BATCH_SIZE
                final_theta2_grad.append(final_data)
            global_optimizer.apply_gradients(
                zip(final_theta2_grad, local_model.trainable_weights))
            #local_model.save_weights('theta2.h5')
            local_model_weights = local_model.get_weights()

            # TODO: evaluate on a validation set; use MAE (the paper's choice)
            '''
            batch_val_loss=0
            for j,user in enumerate(users):
                validation_batch=user[scenario_len:scenario_len+validation_len]   # this is actually all of it
                batch_input=[
                    [existing_movies_df.loc[elem.movie_id].director,
                    existing_movies_df.loc[elem.movie_id].year,
                    all_users_df.loc[elem.user_id].age,
                    existing_movies_df.loc[elem.movie_id].actor,
                    existing_movies_df.loc[elem.movie_id].rated,
                    existing_movies_df.loc[elem.movie_id].genre,
                    all_users_df.loc[elem.user_id].occupation,
                    all_users_df.loc[elem.user_id].zipcode
                    ] for elem in validation_batch
                ]
                batch_labels=[elem.rate for elem in validation_batch]

                # only one batch, so need to be in one-item list
                val_embedded=global_model.predict_on_batch([batch_input])
                val_logits=local_model.predict_on_batch(val_embedded)
                val_metric(batch_labels,val_logits)
                batch_val_loss=batch_val_loss+val_metric.result()
            

            print('validation loss: %s' % (float(batch_val_loss),))
            total_train_loss+=batch_val_loss
            # TODO: end training if the validation loss increases or is not reduced enough - early stopping
            '''
            # measure total training loss
            print('batch #{} theta1 loss:{}'.format(i, theta1_losses))
            print('batch #{} theta2 loss:{}'.format(i, theta2_losses))
            total_train_loss += theta1_losses + theta2_losses
        print('current train loss at epoch {}: '.format(epoch),
              total_train_loss)
        if epoch % 5 == 0:
            local_model.save('models/local_model_{}.h5'.format(epoch))
            global_model.save('models/global_model_{}.h5'.format(epoch))
        if epoch > 19:
            min_prev_loss = min(
                [prev_train_loss, prev2_train_loss, prev3_train_loss])
            print('previous train loss: ', min_prev_loss)

            if total_train_loss > min_prev_loss:
                print('total train loss increases, end training')
                break

    local_model.save('models/local_model_{}_final.h5'.format(epoch))
    global_model.save('models/global_model_{}_final.h5'.format(epoch))
Example #5
def main():
    session = Session()
    # load everything up front; filtering is done in pandas below
    all_users = session.query(User).all()
    all_movies = session.query(Movie).all()

    user_rating_counts = session.query(Rating.user_id,
                                       func.count(Rating.user_id)).group_by(
                                           Rating.user_id).all()

    # keep users with more than 45 ratings
    user_filtered = filter(lambda x: x[1] > 45, user_rating_counts)
    actual_users_index = [elem[0] for elem in user_filtered]

    actor_dict, director_dict, rated_dict, genre_dict = get_movie_dict(
        'movie_dict.json')
    #author_dict,publisher_dict=get_book_dict('book_dict.json')

    with open('movie_user_zipcodes.json', 'r') as f:
        zipcodes = json.load(f)
    zipcode_dict = dict(zip(zipcodes, range(len(zipcodes))))

    all_users_id = [elem.id for elem in all_users]
    all_users_data = [{
        'gender': elem.gender,
        'occupation': elem.occupation,
        'age': elem.age,
        'zipcode': elem.zipcode
    } for elem in all_users]

    all_users_df = pd.DataFrame(all_users_data, index=all_users_id)

    # occupations are already small integer codes, so no hashing dict is needed
    occu_dict_size = all_users_df.occupation.max() + 1

    all_users_df.gender = (all_users_df.gender == 'M').astype(int)
    all_users_df.zipcode = all_users_df.zipcode.apply(
        lambda x: zipcode_dict[x])

    user_ages = sorted(all_users_df.age.unique())
    # age is numeric, but each age bracket behaves like its own category,
    # so map it to an index instead of using it as a quantity
    age_dict = dict(zip(user_ages, range(len(user_ages))))

    all_users_df.age = all_users_df.age.apply(lambda x: age_dict[x])

    all_movies_id = [elem.id for elem in all_movies]
    all_movies_data = [{
        'year': elem.year,
        'actor': elem.actor,
        'title': elem.title,
        'rated': elem.rated,
        'director': elem.director,
        'genre': elem.genre
    } for elem in all_movies]

    all_movies_df = pd.DataFrame(all_movies_data, index=all_movies_id)

    all_movies_df.actor = all_movies_df.actor.apply(lambda x: actor_dict[x])
    all_movies_df.director = all_movies_df.director.apply(
        lambda x: director_dict[x])
    all_movies_df.rated = all_movies_df.rated.apply(lambda x: rated_dict[x])
    all_movies_df.genre = all_movies_df.genre.apply(lambda x: genre_dict[x])
    all_movies_df.year = all_movies_df.year - MOVIE_MIN_YEAR

    existing_movies_df = all_movies_df[all_movies_df.year < 1998 -
                                       MOVIE_MIN_YEAR]
    new_movies_df = all_movies_df[all_movies_df.year > 1997 - MOVIE_MIN_YEAR]

    #user_mask=np.random.rand(len(all_users_df)) < 0.8
    #user_existing=all_users_df[user_mask]
    #user_new=all_users_df[~user_mask]
    user_existing = all_users_df[all_users_df.index.isin(actual_users_index)]
    user_new = all_users_df[~all_users_df.index.isin(actual_users_index)]
Example #6
import pandas as pd
from connect_db import Session, engine
from data_models import Movie

# MovieLens ratings use '::' as the field separator
ratings_df = pd.read_table('data/ratings.dat', sep='::',
                           names=['user_id', 'movie_id', 'rate', 'ts'],
                           engine='python')
ratings_df.drop(columns='ts', inplace=True)

session = Session()
# keep only ratings whose movie actually made it into the movie table
allids = [elem[0] for elem in session.query(Movie.id).all()]
ne_ratings_df = ratings_df[ratings_df.movie_id.isin(allids)]
ne_ratings_df.to_sql('rating', engine, if_exists='append', index=False,
                     chunksize=50, method='multi')
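
# optional sanity check (a sketch, not in the original): count what landed in
# the table and compare against the filtered frame
with engine.connect() as conn:
    n = conn.execute('SELECT COUNT(*) FROM rating').scalar()
print(n, 'rows now in rating;', len(ne_ratings_df), 'inserted this run')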
Example #7
from connect_db import Session
from data_models import Movie
import json

session = Session()

movies = session.query(Movie.director, Movie.actor, Movie.rated,
                       Movie.genre).all()

# collect the distinct categorical values so Example #4 can map them to indices
directors = list({elem.director for elem in movies})
actors = list({elem.actor for elem in movies})
rateds = list({elem.rated for elem in movies})
genres = list({elem.genre for elem in movies})

movie_dict = {
    'directors': directors,
    'actors': actors,
    'rateds': rateds,
    'genres': genres
}

with open('movie_dict.json', 'w') as f:
    json.dump(movie_dict, f)
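
# A hedged sketch (not in the original) of the get_movie_dict reader that
# Example #4 calls: invert the lists written above into value -> index maps,
# returned in the order Example #4 unpacks them.
def get_movie_dict(path):
    with open(path) as f:
        movie_dict = json.load(f)
    actor_dict = {v: i for i, v in enumerate(movie_dict['actors'])}
    director_dict = {v: i for i, v in enumerate(movie_dict['directors'])}
    rated_dict = {v: i for i, v in enumerate(movie_dict['rateds'])}
    genre_dict = {v: i for i, v in enumerate(movie_dict['genres'])}
    return actor_dict, director_dict, rated_dict, genre_dict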
Example #8
import requests
from connect_db import Session

# movies_df comes from the earlier, omitted part of this script
movies_df.drop(columns='title_year', inplace=True)

omdb = 'http://www.omdbapi.com'

# each key has a daily limit of 1000 requests; swap in a spare below if needed
# params={'apikey':'f69c6afb'}
# params={'apikey':'a934a276'}
# params={'apikey':'7dad728b'}
# params={'apikey':'d58cf8f1'}
params = {'apikey': '784e8ba1'}
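
# hypothetical helper (not in the original): fall back to a spare key once
# the active one hits its daily limit
spare_keys = ['f69c6afb', 'a934a276', '7dad728b', 'd58cf8f1']

def rotate_key():
    if spare_keys:
        params['apikey'] = spare_keys.pop(0)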

directors = []
actors = []
rated = []

session = Session()

# query=session.query(Movie)

# start_idx=0
stop_idx = 0
failed_idx = []

movies_df_c = movies_df[3770:].copy()

for row in movies_df_c.iterrows():
    params['t'] = row[1].title
    params['year'] = row[1].year
    returned = requests.get(omdb, params)
    if returned.status_code != 200:
        print('Error at row: {}'.format(row[0]))
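        failed_idx.append(row[0])
        continue
    # hedged continuation sketch (the original is truncated here); OMDb's
    # documented JSON fields include Response, Director, Actors and Rated
    data = returned.json()
    if data.get('Response') == 'True':
        directors.append(data.get('Director', 'N/A'))
        actors.append(data.get('Actors', 'N/A'))
        rated.append(data.get('Rated', 'N/A'))
    else:
        failed_idx.append(row[0])
    stop_idx = row[0]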