Example #1
def parse_user():
    """Load users

    output: user_friend.pkl,
            user_profile.pkl
    """
    user_profile = {}
    user_friend = {}

    make_dir(PREPROCESS_DIR)

    print("\t[parse user] load user list")
    users_list = load_pkl(PREPROCESS_DIR + "users_list.pkl")
    users_list = set(users_list)

    print("\t[parse user] building user profiles")
    with open(DATA_DIR + "user.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            user_id = data['user_id']
            if user_id not in users_list:  # skip users with no interactions in the kept cities
                continue
            user_friend[user_id] = data['friends'].split(", ")
            del data['friends']
            del data['user_id']
            user_profile[user_id] = data

    # user adjacency and profile dictionary separately
    print(
        "\t[parse user] dumping user-friendship and user-profile information ..."
    )
    dump_pkl(PREPROCESS_DIR + "user_friend.pkl", user_friend)
    dump_pkl(PREPROCESS_DIR + "user_profile.pkl", user_profile)
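These snippets all rely on small pickle helpers (`dump_pkl`, `load_pkl`, `make_dir`) that are not shown on this page. Below is a minimal sketch of what they presumably look like for the `dump_pkl(path, obj)` convention used in Example #1; the names and exact behavior are assumptions, not the original implementation.

import os
import pickle


def make_dir(path):
    # assumed helper: create the directory if it does not already exist
    os.makedirs(path, exist_ok=True)


def dump_pkl(path, obj):
    # assumed helper: serialize `obj` to `path` (path first, as in Example #1)
    with open(path, "wb") as fout:
        pickle.dump(obj, fout)


def load_pkl(path):
    # assumed helper: read a pickled object back from `path`
    with open(path, "rb") as fin:
        return pickle.load(fin)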
Example #2
def compute_user_avg_loc(city):
    """compute average latitude and longitude of businesses each user visited

    Args:
        city - the city to process
    """
    print("\t[user] computing location features")
    # df = pd.read_csv(TRNTST_DIR + "{}/train_pos.csv".format(city))
    df = pd.read_csv(INPUT_DIR + "{}/user_business_interaction.csv".format(city))
    bus_profile = load_pkl(INPUT_DIR + "{}/city_business_profile.pkl".format(city))

    # df.assign(business_latitude=lambda x: bus_profile[x.business]["latitude"])
    # df.assign(business_longitude=lambda x: bus_profile[x.business]["longitude"])

    b_lat_dict = dict([(k, v["latitude"]) for k, v in bus_profile.items()])
    b_long_dict = dict([(k, v["longitude"]) for k, v in bus_profile.items()])

    df = df.assign(bus_lat=df.business.map(b_lat_dict))
    df = df.assign(bus_long=df.business.map(b_long_dict))

    # "ll": latitude and longitude
    print("\t[user] aggregating location (lat and long) by user")
    df_loc = df.groupby("user").agg({"bus_lat": ['max', 'min', 'mean'],
                                     "bus_long": ['max', 'min', 'mean']})

    # rename the max, min, mean columns to max_lat, min_lat, mean_lat (same for `long`),
    # while still keeping `user` as the index
    user_lat = df_loc.bus_lat.reset_index()
    user_long = df_loc.bus_long.reset_index()
    user_loc = user_lat.join(user_long, on="user", how="outer",
                             lsuffix="_lat", rsuffix="_long")
    user_loc = user_loc.fillna(user_loc.mean())  # now `user` is column
    user_loc_dict = user_loc.set_index("user").to_dict(orient="index")
    dump_pkl(OUTPUT_DIR + "{}/city_user_loc.pkl".format(city), user_loc_dict)
Example #3
def gather_genre(genre, limit_videos=100):

    print "Gathering features for", genre,
    genreFeatures = gather_videos(genre, limit_videos)
    print "OK."
    print genreFeatures.shape
    dump_pkl(genreFeatures, genre + str(limit_videos))
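Note that this example (and the other Moviescope-style snippets below) reverses the argument order: the object comes first and the second argument is a bare file name rather than a full path. A plausible sketch of that variant, assuming a module-level pickle directory and a ".p" suffix (both are assumptions):

import os
import pickle

PICKLE_DIR = "./data/pkl/"  # assumed location for the pickled features


def dump_pkl(obj, name):
    # assumed helper: serialize `obj` under PICKLE_DIR using `name` as the file stem
    with open(os.path.join(PICKLE_DIR, name + ".p"), "wb") as fout:
        pickle.dump(obj, fout)


def load_pkl(name):
    # assumed helper: load a previously dumped object back by name
    with open(os.path.join(PICKLE_DIR, name + ".p"), "rb") as fin:
        return pickle.load(fin)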
Example #4
def gather_optical_flow_features(genre, limit_videos = None):
    genre_OF_features = []
    videoPaths = glob(os.path.join(video_resource,'train',genre)+'/*')[:limit_videos]
    for videoPath in videoPaths:
        videoFeatures = optical_flow(videoPath)
        print videoFeatures.shape
        genre_OF_features.append(videoFeatures)
        print "*"*90
    dump_pkl(genre_OF_features, genre+"_ultimate_OF")
def generate_precision_recall_text(mode='val'):

    model = load_moviescope_model('text')
    yTrue, plotFeatures = gather_features(mode, return_video=False, reverse=True)
    plotFeatures = np.array(map(list, zip(*plotFeatures)))
    yPreds = model.predict([plotFeatures[0], plotFeatures[1]])
    dump_pkl((yTrue, yPreds), mode+'_pred_text')

    return
def train_classifier_video(trainVideoFeatures,
                           trainLabels,
                           valVideoFeatures=None,
                           valLabels=None):

    input_dim = 4096

    trainingLabels, trainingFeatures = augment_labels_lstm(
        trainLabels, trainVideoFeatures, number_of_frames)

    print trainingLabels.shape
    print trainingFeatures.shape
    """Initialize the mode"""

    visInput = Input(shape=(number_of_frames, input_dim), dtype='float32')
    model = vis_model(visInput, number_of_classes, return_top=True)

    plot(model, to_file='vis_model.png', show_shapes=True)
    sgd = SGD(lr=0.01, decay=0.000001, momentum=0.9, nesterov=True)
    # suppressing SGD, since text and merged models are optimized using ADAM

    model.compile(optimizer=sgd,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    """Start training"""
    batch_size = 63
    nb_epoch = 50

    checkpoint = ModelCheckpoint(filepath='./data/models/wiki_im_video_sgd.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint, remote]

    if valLabels is not None:
        valLabels, valFeatures = augment_labels_lstm(valLabels,
                                                     valVideoFeatures,
                                                     number_of_frames)
        hist = model.fit(trainingFeatures,
                         trainingLabels,
                         validation_data=(valFeatures, valLabels),
                         nb_epoch=nb_epoch,
                         batch_size=batch_size,
                         callbacks=callbacks_list)
    else:
        hist = model.fit(trainingFeatures,
                         trainingLabels,
                         nb_epoch=nb_epoch,
                         batch_size=batch_size,
                         callbacks=callbacks_list)
    model.save('data/models/video_sgd.h5')
    histDict = hist.history
    dump_pkl(histDict, 'hist_video_sgd')

    return model
def generate_precision_recall_video(mode='val'):

    model = load_moviescope_model('wiki_im_video_sgd')
    yTrue, videoFeatures = gather_features(mode, return_plot=False)
    _, videoFeatures = augment_labels_lstm(yTrue, videoFeatures, 200)
    yPreds = model.predict(videoFeatures)

    dump_pkl((yTrue, yPreds), mode+'_pred_video_sgd')

    return
def get_raw_data(mode='val'):

    rawData = [] #Include dictionaries containing plot and genre labels & movie_id
    allData = load_pkl(baseName+mode)
    for data in allData:
        movie_id =  data['movie_id']
        plot = data['plot']
        genreLabel = data['newGenreLabels']
        rawData.append({'movie_id':movie_id,'plot':plot,'newGenreLabels':genreLabel})

    dump_pkl(rawData, 'raw_data_'+mode)
def _get_raw_data(mode='val'):

    rawData = [] #Include dictionaries containing trailer path, plot and genre labels & movie_id
    allData = load_pkl(baseName+mode)
    for data in allData:
        movie_id =  data['movie_id']
        path = glob(video_resource+str(movie_id)+'.*')[0]       
        plot = data['plot']
        genreLabel = data['genreLabel']

        rawData.append({'movie_id':movie_id,'plot':plot,'path':path,'genreLabel':genreLabel})

    dump_pkl(rawData, 'raw_data_'+mode)
def generate_precision_recall_vislang(mode='val', merge_mode='sum'):

    if merge_mode == 'bilinear':
        model = vislang_model(merge_mode)
        model.load_weights('data/weights/weights_min_loss_%s.h5' % merge_mode)
    else:
        model = load_moviescope_model('eq_VisLang_%s' % merge_mode)

    yTrue, plotFeatures, videoFeatures = gather_features(mode, reverse=True)
    plotFeatures = np.array(map(list, zip(*plotFeatures)))
    _, videoFeatures = augment_labels_lstm(yTrue, videoFeatures, 200)
    yPreds = model.predict([videoFeatures, plotFeatures[0], plotFeatures[1]])
    dump_pkl((yTrue, yPreds), mode+'_pred_eq_vislang_'+merge_mode)
def train_from_scratch(config, state, channel):
    # Model options
    save_model_dir = config[config.model].save_model_dir
    if save_model_dir == 'current':
        config[config.model].save_model_dir = './'
        save_model_dir = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_dir + 'model_config.pkl'
    print 'current save dir ', save_model_dir
    utils.create_dir_if_not_exist(save_model_dir)

    reload_ = config[config.model].reload_
    if reload_:
        print 'preparing reload'
        save_dir_backup = config[config.model].save_model_dir
        from_dir_backup = config[config.model].from_dir
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print 'save dir ', save_dir_backup
        print 'from_dir ', from_dir_backup
        print 'setting current model config with the old one'
        model_config_old = utils.load_pkl(from_dir_backup +
                                          '/model_config.pkl')
        utils.set_config(config, model_config_old)
        config[config.model].save_model_dir = save_dir_backup
        config[config.model].from_dir = from_dir_backup
        config[config.model].reload_ = True
    if config.erase_history:
        print 'erasing everything in ', save_model_dir
        os.system('rm %s/*' % save_model_dir)
    # for stdout file logging
    # sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print 'saving model config into %s' % save_path
    utils.dump_pkl(config, save_path)
    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])
    model_type = config.model
    print 'Model Type: %s' % model_type
    print 'Command: %s' % ' '.join(sys.argv)

    t0 = time.time()
    print 'training an attention model'
    train(**state.attention)
    if channel:
        channel.save()
    print 'training time in total %.4f sec' % (time.time() - t0)
Example #12
def train_classifier_word_embedding(trainPlotFeatures,
                                    trainLabels,
                                    valPlotFeatures=None,
                                    valLabels=None):

    sequence_input = Input(shape=(3000, ), dtype='int32')
    sequence_input_reverse = Input(shape=(3000, ), dtype='int32')

    textModel = good_text_model(sequence_input,
                                sequence_input_reverse,
                                use_embedding=True,
                                trainable=False)
    textModel.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    checkpoint = ModelCheckpoint(filepath='./data/models/text_checkpoint.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    checkpoint_loss = ModelCheckpoint(
        filepath='./data/models/text_checkpoint_loss.h5',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min')
    callbacks_list = [checkpoint, remote, checkpoint_loss]

    if valLabels is not None:
        hist = textModel.fit(
            [trainPlotFeatures[0], trainPlotFeatures[1]],
            trainLabels,
            validation_data=([valPlotFeatures[0],
                              valPlotFeatures[1]], valLabels),
            nb_epoch=50,
            batch_size=63,
            callbacks=callbacks_list)
    else:
        hist = textModel.fit([trainPlotFeatures[0], trainPlotFeatures[1]],
                             trainLabels,
                             nb_epoch=100,
                             batch_size=63,
                             callbacks=callbacks_list)

    textModel.save('data/models/_text.h5')
    print "Model saved at: data/models/_text.h5"
    histDict = hist.history
    dump_pkl(histDict, 'hist_text')
def gather_training_data(genre, model_name=default_model_name):
    trainPath = os.path.join(video_resource, 'train', genre)
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print(videoPath, ":")
        frames = list(get_frames(videoPath, time_step=1000))
        print(len(frames))
        if len(frames) == 0:
            print("corrupt.")
            continue
        videoFeatures = get_features_batch(frames, model_name)
        print(videoFeatures.shape)
        genreFeatures.append(videoFeatures)
    outPath = genre + "_ultimate_" + model_name
    dump_pkl(genreFeatures, outPath)
Example #14
def _get_features_data(mode='val'):
    """ deprecated with old dataset """
    """ Includes every sample with plotFeatures, videoFeatures, movie_id and genreLabel """
    featureData = []
    allData = load_pkl(baseName+mode)
    plots = []

    """Process plot vectors"""

    for data in allData:
        movie_id = data['movie_id']
        plot = data['plot']
        plots.append(plot)

    if mode=='train':
        textObj = Text()
        plotFeatures_all = textObj.fit_transform(plots)
        dump_pkl(textObj, 'plot_object_train')
    else:
        try:
            textObj = load_pkl('plot_object_train')
            plotFeatures_all = textObj.transform(plots).toarray()
        except:
            print "Please train the plots first."
            return

    plotIndex = -1
    for data in allData:
        plotIndex += 1
        movie_id =  data['movie_id']
        path = glob(video_resource+str(movie_id)+'.*')[0]       
        plot = data['plot']
        genreLabel = data['genreLabel']
        print plotIndex,"out of ",len(allData)
        print "Gathering features for",movie_id
        try:
            frames = list(get_frames(path, start_time=1000, end_time=200000, time_step=1000))
            videoFeatures = get_features_batch(frames, 'vgg16')
        except Exception as e:
            print e
            continue  # Omit the movie if one of the features is bad
            # videoFeatures = None
        plotFeatures = plotFeatures_all[plotIndex]

        featureData.append({'videoFeatures':videoFeatures, 'plotFeatures':plotFeatures, 'movie_id':movie_id, 'genreLabel':genreLabel})

    dump_pkl(featureData, 'feature_data_'+mode)
Example #15
def gather_genre(genre, limit_videos=100):

    print "Gathering features for",genre
    videoPaths = glob(video_resource+genre+'/*')[:limit_videos]
    genreFeatures = []
    for videoPath in videoPaths:
        videoFeatures = []
        print "extracting features for",videoPath
        for frame in get_frames(videoPath, time_step=1000):
            frameFeatures = get_features(frame)
            videoFeatures.append(frameFeatures)
        videoFeatures = np.array(videoFeatures)
        genreFeatures.append(videoFeatures)
    genreFeatures = np.array(genreFeatures)
    print "OK."
    print genreFeatures.shape
    dump_pkl(genreFeatures, genre+str(limit_videos))
def return_confident_results(mode='val'):
    
    model = load_moviescope_model('wiki_im_VisLang')
    genrePredictionDict = dict((i,[]) for i in range(26))
    textObj = load_pkl('plot_object_train')
    labels, plotFeatures, videoFeatures, movieIds = gather_features(mode, return_id=True)
    _, videoFeatures = augment_labels_lstm(labels, videoFeatures, 200)
    predictionScores = model.predict([videoFeatures, plotFeatures])
    for index  in range(len(predictionScores)):
        for i in range(26):
            genrePredictionDict[i].append((predictionScores[index][i],movieIds[index]))

    dump_pkl(genrePredictionDict, 'genrePredictionDict_'+mode)
    
    for i in range(26):
        print sorted(genrePredictionDict[i], reverse=True)[:10]
    return
Example #17
def parse_interactions():
    """draw interact from `review.json` and `tips.json`.

    output: ub.interact.csv

    Args:
        keep_city - the interact of cities to keep
    """

    # business_profile only contains businesses in Lv, Tor, and Phx
    print("\t[parse interactions] loading business_profile pickle...")
    business_profile = load_pkl(PREPROCESS_DIR + "business_profile.pkl")

    users, businesses, cities = [], [], []
    timestamps = []

    # create records as (user, business, city, timestamp) tuples
    print("\t[parse interactions] loading review.json ...")
    with open(DATA_DIR + "review.json", "r") as fin:
        for ln in fin:
            data = json.loads(ln)
            _bid = data['business_id']
            if _bid not in business_profile:  # only Lv, Tor, and Phx businesses
                continue
            users.append(data['user_id'])
            businesses.append(_bid)
            cities.append(business_profile[_bid]["city"])
            timestamps.append(data['date'])

    interactions = pd.DataFrame({
        'user': users,
        'business': businesses,
        "city": cities,
        "timestamp": timestamps
    })

    interactions.to_csv(PREPROCESS_DIR + "user_business_interact.csv",
                        index=False)

    # users kept for parse_user
    user_remained = interactions["user"].unique().tolist()
    dump_pkl(PREPROCESS_DIR + "users_list.pkl", user_remained)
Example #18
def gather_histogram_data(genre, mode='train'):
    """Driver function to collect frame features for a genre"""

    trainPath = os.path.join(video_resource, mode, genre)
    print trainPath
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print videoPath, ":",
        frames = list(get_frames(videoPath, time_step=1000))
        print len(frames),
        if len(frames) == 0:
            print "corrupt."
            continue
        videoFeatures = np.array([get_histogram(frame) for frame in frames])
        print videoFeatures.shape
        genreFeatures.append(videoFeatures)

    outPath = genre + "_histogram_" + mode
    dump_pkl(genreFeatures, outPath)
Example #19
def gather_training_data(genre, model_name=default_model_name):
    """Driver function to collect frame features for a genre"""

    trainPath = os.path.join(video_resource, 'train', genre)
    print trainPath
    videoPaths = glob(trainPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print videoPath, ":",
        frames = list(get_frames(videoPath, time_step=1000))
        print len(frames),
        if len(frames) == 0:
            print "corrupt."
            continue
        videoFeatures = get_features_batch(frames, model_name)
        print videoFeatures.shape
        genreFeatures.append(videoFeatures)

    outPath = genre + "_ultimate_" + model_name
    dump_pkl(genreFeatures, outPath)
Example #20
def train_mode(genres=['action', 'drama', 'horror', 'romance']):
    trainingData, trainingLabels = [], []
    for genreIndex, genre in enumerate(genres):
        try:
            genreFeatures = np.array(load_pkl(genre + '_histogram_train'))
        except Exception as e:
            print e
            return
        for videoFeatures in genreFeatures:
            for feature in videoFeatures:
                trainingData.append(feature)
                trainingLabels.append(genreIndex)
    trainingData = np.array(trainingData)
    trainingLabels = np.array(trainingLabels)
    print trainingData.shape
    print trainingLabels.shape

    print "Training..."
    model = RF(n_estimators=15, n_jobs=-1).fit(trainingData, trainingLabels)
    dump_pkl(model, "RF_histogram")
Example #21
def get_features_data(mode='val'):
    """ Includes every sample with plotFeatures, videoFeatures, movie_id and genreLabel """
    featureData = []
    allData = load_pkl(baseName+mode)
    plots = []

    """Process plot vectors"""

    for data in allData:
        movie_id = data['movie_id']
        plot = data['plot']
        plots.append(plot)

    if mode=='train':
        textObj = WordEmbeddings()
        plotFeatures_all = textObj.fit_transform(plots)
        dump_pkl(textObj, 'plot_object_train')
    else:
        try:
            textObj = load_pkl('plot_object_train')
            plotFeatures_all = textObj.transform(plots, reverse=False)
        except Exception as e:
            print e
            print "Please train the plots first."
            return

    plotIndex = -1
    for data in allData:
        plotIndex += 1
        movie_id =  data['movie_id']
        path = glob(video_resource+str(movie_id)+'.*')[0]       
        plot = data['plot']
        genreLabel = data['newGenreLabels']
        print plotIndex,"out of ",len(allData)
        print "Gathering features for",movie_id
        videoFeatures = data['videoFeatures']
        plotFeatures = plotFeatures_all[plotIndex]

        featureData.append({'videoFeatures':videoFeatures, 'plotFeatures':plotFeatures, 'movie_id':movie_id, 'genreLabel':genreLabel})

    dump_pkl(featureData, 'feature_data_'+mode)
Example #22
def process_record(cnn, data):
    if os.path.exists(os.path.join(FLAGS.feats_dir, data['feat_path'])):
        print "Already processed ... "
        return

    video_fullpath = os.path.join(FLAGS.videos_dir, data['video_path'])

    try:
        cap = cv2.VideoCapture(video_fullpath)
    except:
        ipdb.set_trace()

    frame_count = 0
    frame_list = []

    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        if ret is False:
            break

        frame_list.append(frame)
        frame_count += 1

    if frame_count == 0:
        return

    frame_list = np.array(frame_list)

    if frame_count > FLAGS.nr_frames:
        start = np.random.randint(0, frame_count - 10)
        frame_indices = np.arange(start, start + 10)
        frame_list = frame_list[frame_indices]

    cropped_frame_list = np.array(
        map(lambda x: cnn.preprocess_frame(FLAGS.cropping_sizes, x),
            frame_list))
    feats = cnn.get_features(cropped_frame_list)
    save_full_path = os.path.join(FLAGS.feats_dir, data['feat_path'])
    dump_pkl(feats, save_full_path)
Example #23
def parse_business():
    """extract business information from business.json

    output:
        business_profile.pkl
        city_business.pkl
    """

    city_business = {}  # dictionary of city: [business list]
    business_profiles = {}  # dictionary of business profile

    # count business by location (city and state)
    print(
        "\t[parse_business] preprocessing all business without selecting cities ..."
    )
    with open(DATA_DIR + "business.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            city = data['city']

            if city not in CANDIDATE_CITY:  # only keep candidate cities
                continue

            business_id = data["business_id"]
            # removed fields: id, state, attributes, and hours
            # remaining fields: name, address, postal_code, latitude/longitude,
            #                   stars, review_count, is_open
            del data["business_id"], data["state"]
            del data["attributes"], data["hours"]
            business_profiles[business_id] = data

            # save business id to city_business dictionary
            city_business[city] = city_business.get(city, [])
            city_business[city].append(business_id)

    # save city business mapping
    print("\t[parse business] dumping business.profile and city.business ...")
    dump_pkl(PREPROCESS_DIR + "business_profile.pkl", business_profiles)
    dump_pkl(PREPROCESS_DIR + "city_business.pkl", city_business)
Example #24
def train_from_scratch(config, state, channel):
    model_type = config.model
    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784

    # manipulate the `state`
    # save the config file
    save_model_path = config.save_model_path

    if save_model_path == 'current':
        config.save_model_path = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_path + 'model_config.pkl'

    utils.create_dir_if_not_exist(config.save_model_path)
    # for stdout file logging
    #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print 'saving model config into %s' % save_path
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    print 'Model Type: %s' % model_type
    print 'Host:    %s' % socket.gethostname()
    print 'Command: %s' % ' '.join(sys.argv)

    print 'initializing data engine'
    input_dtype = 'float32'
    target_dtype = 'int32'
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine,
                                                     channel)
Example #25
def train_from_scratch(config, state, channel):
    model_type = config.model
    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784
        
    # manipulate the `state`
    # save the config file
    save_model_path = config.save_model_path

    if save_model_path == 'current':
        config.save_model_path = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_path + 'model_config.pkl'

    utils.create_dir_if_not_exist(config.save_model_path)
    # for stdout file logging
    #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print 'saving model config into %s'%save_path
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])
    
    print 'Model Type: %s'%model_type
    print 'Host:    %s' % socket.gethostname()
    print 'Command: %s' % ' '.join(sys.argv)
    
    print 'initializing data engine'
    input_dtype = 'float32'
    target_dtype = 'int32'
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine, channel)
Example #26
def gather_testing_data(genre, model_name=default_model_name):
    """Driver function to collect frame features for a genre"""

    testPath = os.path.join(video_resource, 'test', genre)
    print(testPath)
    videoPaths = glob(testPath + '/*')
    genreFeatures = []
    for videoPath in videoPaths:
        print(videoPath, ":")
        frames = list(get_frames(videoPath, time_step=1000))
        print(len(frames))
        if len(frames) == 0:
            print("corrupt.")
            continue
        videoFeatures = get_features_batch(frames)
        print(videoFeatures.shape)
        genreFeatures.append(videoFeatures)

    outPath = genre + "_test_" + model_name
    dump_pkl(genreFeatures, outPath)
Example #27
        if args.modality == "text":

            vocab_lim = "full" if args.vocab_lim is None else str(
                args.vocab_lim)
            vocab_fname = vocab_dict[str(vocab_lim)]
            assert exists(join(local_data_path,
                               vocab_fname + ".p")), "build vocabulary first"
            vocab = load_pkl(local_data_path, vocab_fname)

            trainset, valset = build_text_dataset(meta_dict, vocab,
                                                  args.data_split)
            train_caps_fname = "train_seq_" + str(args.vocab_lim)
            val_caps_fname = "val_seq_" + str(args.vocab_lim)

            dump_pkl(trainset, local_data_path, train_caps_fname)
            dump_pkl(valset, local_data_path, val_caps_fname)

        elif args.modality == "vis":
            build_vis_dataset(meta_dict)

        elif args.modality == "full":

            vocab_lim = "full" if args.vocab_lim is None else str(
                args.vocab_lim)
            vocab_fname = vocab_dict[str(vocab_lim)]
            assert exists(join(local_data_path,
                               vocab_fname + ".p")), "build vocabulary first"
            vocab = load_pkl(local_data_path, vocab_fname)

            vis_dir = join(local_data_path, "crops")
Example #28
def extract_user_attr(city):
    """extract user attributes
    Args:

        city - the city to profess
    Save to disk:
        df_nonzero - non zero number ratios of each attribute
    Return:
        df_nonzero - as above.
    """

    print("\t[user] loading user interaction from {}...".format(city))

    user_profile = load_pkl(INPUT_DIR + "{}/city_user_profile.pkl".format(city))
    user_loc = load_pkl(INPUT_DIR + "{}/city_user_loc.pkl".format(city))

    user_data_csv = []
    user_data_pkl = {}

    # process users, NOTE: user new index starts with 1
    for uid, prof_dict in user_profile.items():
        # --- create feature area ---
        # after checking, each user has all attributes, review_count is non-zero
        tmp_entry = dict()
        u_elite = prof_dict.get('elite', [])
        tmp_entry['elite_count'] = len(u_elite)  # user elite
        tmp_entry['review_count'] = prof_dict.get('review_count', CNT_DFL)  # review_count
        tmp_entry['fans_count'] = prof_dict.get('fans', CNT_DFL)  # fans
        tmp_entry['funny_score'] = prof_dict.get('funny', CNT_DFL)  # funny
        tmp_entry['cool_score'] = prof_dict.get('cool', CNT_DFL)  # cool
        tmp_entry['useful_score'] = prof_dict.get('useful', CNT_DFL)  # useful
        tmp_entry['avg_stars'] = prof_dict.get('average_stars', STAR_DFL)  # average stars
        tmp_entry['mean_lat'] = user_loc[uid]["mean_lat"]
        tmp_entry['mean_long'] = user_loc[uid]["mean_long"]

        reg_yelp_date = prof_dict.get('yelping_since', DATE_DFL)  # yelping years
        delta_time = datetime.today() - parser.parse(reg_yelp_date)
        tmp_entry['yelping_years'] = delta_time.days // 365
        # --- end create feature area ---

        user_data_csv.append(tmp_entry)
        user_data_pkl[uid] = tmp_entry

    # create data frame
    empty_head_entry = pd.DataFrame({'elite_count': 0, "review_count": CNT_DFL,
        "fans_count": CNT_DFL, "funny_score": CNT_DFL, "cool_score": CNT_DFL,
        "useful_score": CNT_DFL, "avg_stars": STAR_DFL, "yelping_years": 0,
        "mean_lat": CNT_DFL, "mean_long": CNT_DFL}, index=[0])
    df_user_profile = pd.DataFrame(user_data_csv)
    assert empty_head_entry.shape[1] == df_user_profile.shape[1]
    df_user_profile = pd.concat([empty_head_entry, df_user_profile],
                              axis=0, sort=True).reset_index(drop=True)
    print("\t[user] length of `df_user_profile` {}".format(df_user_profile.shape[0]))

    # non-zero count attributes
    df_nonzero = df_user_profile.fillna(0).astype(bool).sum(axis=0)
    df_nonzero = df_nonzero / len(df_user_profile)
    print("\t[user] non-zero terms in `df_user_profile`")
    print(df_nonzero)

    print("\t[user] saving dataframe to {}".format(OUTPUT_DIR))
    df_user_profile.to_csv(
        OUTPUT_DIR+"{}/processed_city_user_profile.csv".format(city), index=False)
    dump_pkl(path=OUTPUT_DIR+"{}/processed_city_user_profile.pkl".format(city), 
        obj=user_data_pkl)

    return df_nonzero
Example #29
def discretize_field_attr(city):
    """Discretize continuous fields to

    Starting from 1 instead of 0

    Args:

    Returns:
        c - city

        [DEPRECATED]
        num_bkt - the number of buckets for embedding continuous values
            >0 for a `num_bkt` number of buckets
            -1 for a total discretize, i.e., take integers as discrete values

    avg_stars,cool_score,elite_count,fans_count,funny_score,review_count,
    useful_score,yelping_years
    """

    col_configs = load_configs(city)

    print("\t[user] discretize - loading user attrs")
    df = pd.read_csv(INPUT_DIR + "{}/processed_city_user_profile.csv".format(city))
    cols_disc_info = dict()
    ft_idx_start = 1
    distinct_df_col_names, distinct_df_cols = [], []
    le = LabelEncoder()  # create for transforming CAT features

    for col in df.columns:
        # treat attribute as discrete variable
        # if col in discrete_attrs:
        if col in col_configs['CATEGORICAL']:
            distinct_df_cols.append(pd.Series(le.fit_transform(df[col]))+ft_idx_start)
            distinct_df_col_names.append(col+"_d_dist")
            num_vals = len(le.classes_)
            vals_map_to = le.transform(le.classes_) + ft_idx_start
            vals_map = dict(zip(le.classes_, vals_map_to))
            entry = {"bucket": False, "value_map": vals_map, "count": num_vals,
                     "max_idx": max(vals_map_to), "min_idx": min(vals_map_to)}
            ft_idx_start += num_vals

        # treat attribute as continuous variable
        # else:
        elif col in col_configs['NUMERICAL']:
            num_bkt = col_configs.getint("NUMERICAL", col)
            max_val, min_val = df[col].max(), df[col].min()

            distinct_df_col_names.append(col + "_c_dist")
            distinct_df_cols.append(
                pd.cut(df[col], num_bkt,
                       labels=range(ft_idx_start, ft_idx_start+num_bkt)))
            entry = {"bucket": True,
                    "max_val": max_val, "min_val": min_val, "count": num_bkt,
                    "min_idx": ft_idx_start, "max_idx": ft_idx_start + num_bkt - 1}
            ft_idx_start += num_bkt
        else:
            raise KeyError("{} is NOT configured in `columns_{}.ini`".format(col, city))

        cols_disc_info[col] = entry

    df_disc = pd.DataFrame(data=dict(zip(distinct_df_col_names, distinct_df_cols)))
    print("\t[user] discretize - saving dist. attr. and info to {}".format(OUTPUT_DIR))
    df_disc.to_csv(OUTPUT_DIR + "{}/processed_city_user_profile_dist.csv".format(city),
        index=False)
    dump_pkl(OUTPUT_DIR + "{}/cols_disc_info.pkl".format(city), cols_disc_info)
Example #30
def parse_yelp(args):
    """draw review from `review.json` """

    assert hasattr(args, "min_cat_num"), "Please set `min_cat_num` for yelp."
    assert hasattr(args, "k_core"), "Please set `k_core` for yelp."

    in_dir = "/local2/zyli/irs_fn/data/raw/yelp/"
    #in_dir = INPUT_DIR + "yelp/"
    out_dir = OUTPUT_DIR + "yelp/"

    print("[Yelp] processing yelp dataset ...")

    print("[Yelp] loading business ...")
    food_cats = load_yelp_categories()

    business_profiles = dict()
    with open(in_dir + "business.json", "r") as fin:
        for ind, ln in enumerate(fin):
            data = json.loads(ln)
            # entry must have below fields
            if not all([
                    bool(data[x])
                    for x in ['business_id', 'review_count', 'categories']
            ]):
                continue
            if data['review_count'] < args.k_core:
                continue

            categories = [
                x.strip().lower()
                for x in data['categories'].strip().split(", ")
            ]
            filter_cats_num = sum([x in food_cats for x in categories])

            # throw away the business in two cases:
            #    1. none of its categories are in food_cats
            #    2. some categories are not in food_cats, and fewer than
            #       args.min_cat_num of them are in food_cats
            if (not filter_cats_num) or \
                (len(categories) != filter_cats_num
                 and filter_cats_num < args.min_cat_num):
                continue

            bid = data['business_id']
            business_profiles[bid] = {
                'review_count': data['review_count'],
                'categories': categories
            }

    print("[Yelp] loading reviews ...")
    bid2idx, uid2idx = dict(), dict()
    uniq_bids, uniq_uids = [], []

    review_bids, review_uids = [], []
    review_set = dict()
    with open(in_dir + "review.json", "r") as fin:
        for ln in fin:
            data = json.loads(ln)

            # Make sure all four fields are not `None`.
            if not all([
                    bool(data[x])
                    for x in ['business_id', 'user_id', 'stars', 'text']
            ]):
                continue

            bid, uid = data['business_id'], data['user_id']

            if bid not in business_profiles:
                continue

            if bid not in bid2idx:
                new_bid = "b_" + str(len(uniq_bids))
                uniq_bids.append(bid)
                assert uniq_bids[-1] == uniq_bids[int(new_bid[2:])]
                bid2idx[bid] = new_bid
            else:
                new_bid = bid2idx[bid]

            if uid not in uid2idx:
                new_uid = "u_" + str(len(uniq_uids))
                uniq_uids.append(uid)
                assert uniq_uids[-1] == uniq_uids[int(new_uid[2:])]
                uid2idx[uid] = new_uid
            else:
                new_uid = uid2idx[uid]

            review_bids.append(new_bid)
            review_uids.append(new_uid)

            # NOTE: new_uid and new_bid are `u_[user_idx]` and `b_[bus_idx]`.
            review_set[(new_uid, new_bid)] = {
                "user_id": new_uid,
                "item_id": new_bid,
                "rating": data['stars'],
                "review": data["text"]
            }

    assert len(review_bids) == len(review_uids)

    print("[Yelp] building k_core graph, k={} ...".format(args.k_core))
    G = nx.Graph()
    G.add_edges_from(zip(review_uids, review_bids))
    print("[Yelp]\t num of nodes [{}] and edges [{}] before k_core.".format(
        G.number_of_nodes(), G.number_of_edges()))

    G_kcore = nx.algorithms.core.k_core(G, k=args.k_core)

    # Check if all edges are "ub" or "bu"
    assert all([x[0]+y[0] in ["bu", "ub"] for x, y in G_kcore.edges()]),\
           "NOT all edges are u-b or b-u!"

    # Unify edges from "u-b" or "b-u" to "u-b" to query `review_set`
    G_kcore_edges = [(x, y) if x[0] == "u" else (y, x)
                     for x, y in G_kcore.edges()]

    kcore_dataset = [review_set[tp] for tp in G_kcore_edges]
    print("[Yelp]\t num of nodes [{}] and edges [{}] after k_core.".format(
        G_kcore.number_of_nodes(), G_kcore.number_of_edges()))

    # create a dataframe to save/view/...
    kcore_df = pd.DataFrame(kcore_dataset)
    print("[Yelp] \t number of unique users [{}] and businesses [{}]".format(
        kcore_df["user_id"].nunique(), kcore_df['item_id'].nunique()))
    print("[Yelp] \t unique ratings {}".format(kcore_df['rating'].unique()))

    print("[Yelp] dumping data and four ref pickles ...")
    make_dir(out_dir)
    kcore_df.to_csv(out_dir + "data.csv",
                    index=False,
                    columns=['user_id', 'item_id', 'rating', 'review'])

    dump_pkl(out_dir + "bid2idx.pkl", bid2idx)
    dump_pkl(out_dir + "uniq_bids.pkl", uniq_bids)
    dump_pkl(out_dir + "uid2idx.pkl", uid2idx)
    dump_pkl(out_dir + "uniq_uids.pkl", uniq_uids)

    print("[Yelp] preprocessing done, files saved to {}".format(out_dir))

    train, test = train_test_split(kcore_df, test_size=test_split_ratio)
    train.to_csv('sTrainData.csv')
    test.to_csv('sTestData.csv')
Example #31
def parse_goodreads(args):
    print("[Goodreads] processing yelp dataset ...")

    json_data = []
    with open(
            'goodreads.json', 'r'
    ) as handle:  # replace "goodreads.json" with the path of your file
        for line in handle:
            json_data.append(json.loads(line))

    data_raw = pd.DataFrame(json_data)
    data = data_raw.drop([
        'review_id', 'date_added', 'date_updated', 'read_at', 'started_at',
        'n_votes', 'n_comments'
    ],
                         axis=1)

    def normalizeNulls(a):
        a.replace('N/A', np.NaN, inplace=True)
        a.replace('Null', np.NaN, inplace=True)
        a.replace('NULL', np.NaN, inplace=True)
        a.replace('null', np.NaN, inplace=True)
        a.replace('', np.NaN, inplace=True)
        a.replace('None', np.NaN, inplace=True)
        a.replace('none', np.NaN, inplace=True)

    normalizeNulls(data)
    data.dropna(inplace=True)  # dropping n/a or null values

    # the two blocks of code below assign a unique id to each user and book
    data['id'] = data.groupby(['user_id']).ngroup()
    for i in range(len(data.index)):
        data.loc[i, 'id'] = 'u_' + str(data.loc[i, 'id'])
    data['user_id'] = data['id']
    data = data.drop('id', axis=1)

    data['id'] = data.groupby(['book_id']).ngroup()
    for i in range(len(data.index)):
        data.loc[i, 'id'] = 'b_' + str(data.loc[i, 'id'])
    data['book_id'] = data['id']
    data = data.drop('id', axis=1)

    review_set = dict()
    for index, row in data.iterrows():
        review_set[(row['user_id'], row['book_id'])] = {
            "user_id": row['user_id'],
            "book_id": row['book_id'],
            "rating": row['rating'],
            "review": row['review_text']
        }

    # apply 5-core filtering here
    G = nx.Graph()
    G.add_edges_from(zip(data['user_id'],
                         data['book_id']))  # uids for users, bids for books
    print(
        "[Goodreads]\t num of nodes [{}] and edges [{}] before k_core.".format(
            G.number_of_nodes(), G.number_of_edges()))

    G_kcore = nx.algorithms.core.k_core(G, k=5)

    # Check if all edges are "ub" or "bu"
    assert all([x[0]+y[0] in ["bu", "ub"] for x, y in G_kcore.edges()]),\
        "NOT all edges are u-b or b-u!"

    # Unify edges from "u-b" or "b-u" to "u-b" to query `review_set`
    G_kcore_edges = [(x, y) if x[0] == "u" else (y, x)
                     for x, y in G_kcore.edges()]

    kcore_dataset = [review_set[tp] for tp in G_kcore_edges]
    print(
        "[Goodreads]\t num of nodes [{}] and edges [{}] after k_core.".format(
            G_kcore.number_of_nodes(), G_kcore.number_of_edges()))

    # create a dataframe to save/view/...
    kcore_df = pd.DataFrame(kcore_dataset)
    print(kcore_df.head())

    # our main data is now in the 'kcore_df' dataframe, not the 'data' dataframe

    def clean_text(unclean_text):
        clean_text = wc.clean_html(unclean_text)
        clean_text = wc.clean_str2(clean_text)
        stop_words = text.ENGLISH_STOP_WORDS
        clean_text = wc.remove_stopwords(clean_text, stop_words)
        clean_text = wc.lemmatized_string(clean_text)
        return clean_text

    cleanFunction = lambda x: clean_text(x)
    kcore_df['review'] = pd.DataFrame(data.review_text.apply(cleanFunction))

    kcore_df.to_csv('cleanData.csv')

    def stem_text(unstemmed_text):
        stemmed_text = wc.stemmed_string(unstemmed_text)
        return stemmed_text

    stemFunction = lambda x: stem_text(x)
    kcore_df['review_text'] = pd.DataFrame(
        data.review_text.apply(stemFunction))

    # count users and books here
    book_count = kcore_df['book_id'].nunique()
    user_count = kcore_df['user_id'].nunique()

    num_reviews = (kcore_df.count())[3]

    #saving stats of the dataset in stats.json
    stats_dict = {
        'Number of books': str(book_count),
        'Number of users': str(user_count),
        'Number of reviews': str(num_reviews)
    }
    with open('stats.json', 'w') as fp:
        json.dump(stats_dict, fp)

    kcore_df.to_csv('stemmedCleanData.csv')

    train, test = train_test_split(kcore_df, test_size=test_split_ratio)
    train.to_csv('sTrainData.csv')
    test.to_csv('sTestData.csv')

    uniq_bids = kcore_df['book_id'].unique()
    uniq_uids = kcore_df['user_id'].unique()
    dump_pkl(out_dir + "uniq_bids.pkl", uniq_bids)
    dump_pkl(out_dir + "uniq_uids.pkl", uniq_uids)