Example #1
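These snippets use several names the excerpts never define. A minimal sketch of the imports they appear to assume (list_shuffle as an alias for random.shuffle, ffprobe from scikit-video, the rest from the standard library); the exact module paths and the VIDEO_EXTENSIONS value are assumptions:

from os.path import join
from random import shuffle as list_shuffle  # in-place list shuffle
from time import sleep

import numpy as np
from tqdm import tqdm                 # progress bar over the video list
from skvideo.io import ffprobe        # assumed source of the ffprobe() helper

VIDEO_EXTENSIONS = ('.mp4', '.avi')   # hypothetical; the real tuple is not shown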
    def _make_data_set(root, video_paths, class_to_idx, init_shuffle,
                       video_index):
        def _is_video_file(filename_):
            return any(
                filename_.endswith(extension)
                for extension in VIDEO_EXTENSIONS)

        if init_shuffle and not video_index:
            list_shuffle(video_paths)  # shuffle

        videos = list()
        frames_per_video = list()
        frames_per_class = [0] * len(class_to_idx)
        frames_counter = 0
        for filename in tqdm(video_paths, ncols=80):
            class_ = filename.split('/')[0]
            data_path = join(root, filename)
            if _is_video_file(data_path):
                video_meta = ffprobe(data_path)
                start_idx = frames_counter
                frames = int(video_meta['video'].get('@nb_frames'))
                frames_per_video.append(frames)
                frames_per_class[class_to_idx[class_]] += frames
                frames_counter += frames
                item = ((frames_counter - 1, start_idx),
                        (filename, class_to_idx[class_]))
                videos.append(item)

        sleep(0.5)  # allows for progress bar completion
        return videos, frames_counter, frames_per_video, frames_per_class
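The ((last_frame, start_frame), (filename, class_idx)) tuples let a global frame counter be mapped back to a specific video. A hedged sketch of how such an index might be consumed (locate_frame is a hypothetical helper, not part of the source):

from bisect import bisect_left

def locate_frame(videos, global_idx):
    """Map a global frame index to (filename, class_idx, local frame offset)."""
    ends = [key[0] for key, _ in videos]   # last-frame indices, ascending by construction
    v = bisect_left(ends, global_idx)      # first video whose last frame >= global_idx
    (_, start), (filename, class_idx) = videos[v]
    return filename, class_idx, global_idx - start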
def source_data_expand(data_info_file, img_dir, rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    train_sample_num = 1000
    val_sample_num = 1000

    # read sample info
    all_info = open(data_info_file).readlines()
    all_info.pop(0)
    all_info = [line.strip().split(',') for line in all_info]
    # select samples of specific gender
    if gender == 'm':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 1]
    elif gender == 'f':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 0]
    else:
        pass
    imgs = [os.path.join(img_dir, line[2]) for line in all_info]
    vals = [float(line[3]) for line in all_info]
    ages = []
    for line in all_info:
        birth_year = int(line[1][6:10])
        birth_month = int(line[1][10:12])
        a = 2008 - birth_year + (12-birth_month)*1.0/12
        ages.append(a)
    
    # select training samples within specific age range
    timgs = [imgs[i] for i in range(len(ages)) if ages[i]>=1.5]
    tvals = [vals[i] for i in range(len(ages)) if ages[i]>=1.5]
    print('%s training samples collected'%(len(timgs)))

    # select validation samples within specific age range
    vimgs = [imgs[i] for i in range(len(ages)) if (ages[i]>=1.0) and (ages[i]<1.5)]
    vvals = [vals[i] for i in range(len(ages)) if (ages[i]>=1.0) and (ages[i]<1.5)]
    print('%s validation samples collected'%(len(vimgs)))

    # sort the IQs, and split dataset into high and low parts
    train_sorted_idx = np.argsort(tvals)
    train_low_part = train_sorted_idx[0:train_sample_num]
    train_high_part = train_sorted_idx[(-1*train_sample_num):]
    train_idx = train_low_part.tolist() + train_high_part.tolist()
    train_imgs = [timgs[i] for i in train_idx]
    train_labels = [0]*train_sample_num + [1]*train_sample_num
    
    val_sorted_idx = np.argsort(vvals)
    val_low_part = val_sorted_idx[0:val_sample_num]
    val_high_part = val_sorted_idx[(-1*val_sample_num):]
    val_idx = val_low_part.tolist() + val_high_part.tolist()
    val_imgs = [vimgs[i] for i in val_idx]
    val_labels = [0]*val_sample_num + [1]*val_sample_num
    
    
    if rand_val:
        list_shuffle(val_labels)

    print('Training samples %s'%(len(train_imgs)))
    print('Validation samples %s'%(len(val_imgs)))

    return train_imgs, train_labels, val_imgs, val_labels
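The gender filter and the age computation above both parse fixed positions of line[1], which looks like an 18-character national-ID-style string: characters 6-13 encode the birth date as YYYYMMDD and character 16 is a gender digit (odd = male, even = female). A sketch over a made-up ID value:

subject_id = '110101199603154371'       # hypothetical example
birth_year = int(subject_id[6:10])      # 1996
birth_month = int(subject_id[10:12])    # 3
is_male = int(subject_id[16]) % 2 == 1  # odd gender digit -> male
age_in_2008 = 2008 - birth_year + (12 - birth_month) / 12.0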
def source_data(beh_file, dist_file):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    all_sample_num = 1500
    train_sample_num = 1350

    # read behavior info
    beh_info = open(beh_file).readlines()
    beh_info = [line.strip().split(',') for line in beh_info]
    vals = [float(line[1]) for line in beh_info]
    # read landmark dist info
    dist = np.load(dist_file)
    dist_mean = dist.mean(axis=0)
    dist = dist - dist_mean

    sorted_idx = np.argsort(vals)
    low_part = sorted_idx[0:all_sample_num]
    high_part = sorted_idx[(-1 * all_sample_num):]
    low_dist = dist[low_part, :, :]
    high_dist = dist[high_part, :, :]
    rand_low_dist_idx = list(range(low_dist.shape[0]))   # range is immutable in Python 3
    list_shuffle(rand_low_dist_idx)
    low_dist = low_dist[rand_low_dist_idx, :, :]
    rand_high_dist_idx = list(range(high_dist.shape[0]))
    list_shuffle(rand_high_dist_idx)
    high_dist = high_dist[rand_high_dist_idx, :, :]

    train_dist = np.concatenate((low_dist[:train_sample_num, :, :],
                                 high_dist[:train_sample_num, :, :]),
                                axis=0)
    val_dist = np.concatenate((low_dist[train_sample_num:, :, :],
                               high_dist[train_sample_num:, :, :]),
                              axis=0)
    train_labels = [0] * train_sample_num + [1] * train_sample_num
    val_labels = [0]*(all_sample_num-train_sample_num) + \
                 [1]*(all_sample_num-train_sample_num)

    print('Training set: %s' % train_dist.shape[0])
    print('Validation set: %s' % val_dist.shape[0])

    return train_dist, train_labels, val_dist, val_labels
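Shuffling an index list and then fancy-indexing works, but NumPy can produce a permuted copy directly. A behavior-equivalent alternative for the two shuffles above:

low_dist = low_dist[np.random.permutation(low_dist.shape[0])]
high_dist = high_dist[np.random.permutation(high_dist.shape[0])]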
Example #4
    def _shuffle(self):
        """
        Shuffles the video list
        by regenerating the sequence to sample sequentially
        """
        def _is_video_file(filename_):
            return any(
                filename_.endswith(extension)
                for extension in VIDEO_EXTENSIONS)

        root = self.root
        video_paths = self.video_paths
        class_to_idx = self.class_to_idx
        list_shuffle(video_paths)  # shuffle

        videos = list()
        frames_per_video = list()
        frames_counter = 0
        for filename in tqdm(video_paths, ncols=80):
            class_ = filename.split('/')[0]
            data_path = join(root, filename)
            if _is_video_file(data_path):
                video_meta = ffprobe(data_path)
                start_idx = frames_counter
                frames = int(video_meta['video'].get('@nb_frames'))
                frames_per_video.append(frames)
                frames_counter += frames
                item = ((frames_counter - 1, start_idx),
                        (filename, class_to_idx[class_]))
                videos.append(item)

        sleep(0.5)  # allows for progress bar completion
        # update the attributes with the altered sequence
        self.video_paths = video_paths
        self.videos = videos
        self.frames = frames_counter
        self.frames_per_video = frames_per_video
Example #5
def source_data(data_info_file, img_dir, rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    #all_sample_num = 1500
    #train_sample_num = 1350
    all_sample_num = 1000
    train_sample_num = 900

    # read sample info
    all_info = open(data_info_file).readlines()
    all_info.pop(0)
    all_info = [line.strip().split(',') for line in all_info]
    # select specific gender samples
    if gender == 'm':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 1]
    elif gender == 'f':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 0]
    else:
        pass
    imgs = [os.path.join(img_dir, line[2]) for line in all_info]
    ages = []
    for line in all_info:
        birth_year = int(line[1][6:10])
        birth_month = int(line[1][10:12])
        a = 2008 - birth_year + (12 - birth_month) * 1.0 / 12
        ages.append(a)
    vals = [float(line[3]) for line in all_info]
    # sort the IQs, and split dataset into high and low parts
    sorted_idx = np.argsort(vals)
    low_part = sorted_idx[0:all_sample_num]
    high_part = sorted_idx[(-1 * all_sample_num):]
    low_imgs = [imgs[i] for i in low_part]
    high_imgs = [imgs[i] for i in high_part]
    low_ages = [ages[i] for i in low_part]
    high_ages = [ages[i] for i in high_part]
    # shuffle the sample parts
    rand_low_idx = list(range(len(low_imgs)))   # materialize: range is immutable in Python 3
    list_shuffle(rand_low_idx)
    low_imgs = [low_imgs[i] for i in rand_low_idx]
    low_ages = [low_ages[i] for i in rand_low_idx]
    rand_high_idx = list(range(len(high_imgs)))
    list_shuffle(rand_high_idx)
    high_imgs = [high_imgs[i] for i in rand_high_idx]
    high_ages = [high_ages[i] for i in rand_high_idx]

    train_imgs = low_imgs[:train_sample_num] + high_imgs[:train_sample_num]
    train_ages = low_ages[:train_sample_num] + high_ages[:train_sample_num]
    val_imgs = low_imgs[train_sample_num:] + high_imgs[train_sample_num:]
    val_ages = low_ages[train_sample_num:] + high_ages[train_sample_num:]
    train_labels = [0] * train_sample_num + [1] * train_sample_num
    val_labels = [0]*(all_sample_num-train_sample_num) + \
                 [1]*(all_sample_num-train_sample_num)
    if rand_val:
        list_shuffle(val_labels)

    return train_imgs, train_ages, train_labels, val_imgs, val_ages, val_labels
def source_mbti_data(data_info_file, img_dir, rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample selection criteria for each class
    # for E-I factor, <8 : 2971 subjects, >13: 3070 subjects
    low_thresh = 8
    high_thresh = 13
    # select 200 subjects from each class for validation
    val_sample_num = 200

    # read sample info
    all_info = open(data_info_file).readlines()
    all_info.pop(0)
    all_info = [line.strip().split(',') for line in all_info]
    # select samples of specific gender
    if gender == 'm':
        all_info = [line for line in all_info if line[1] == 'male']
    elif gender == 'f':
        all_info = [line for line in all_info if line[1] == 'female']
    else:
        pass
    imgs = [os.path.join(img_dir, line[9]) for line in all_info]
    vals = [float(line[5]) for line in all_info]

    # select samples based on specific criteria
    assert len(imgs) == len(vals)
    high_imgs = []
    low_imgs = []
    for i in range(len(vals)):
        if vals[i] < low_thresh:
            low_imgs.append(imgs[i])
        elif vals[i] > high_thresh:
            high_imgs.append(imgs[i])

    # shuffle the samples and split training/validation dataset
    list_shuffle(low_imgs)
    list_shuffle(high_imgs)
    train_imgs = low_imgs[:-val_sample_num] + high_imgs[:-val_sample_num]
    val_imgs = low_imgs[-val_sample_num:] + high_imgs[-val_sample_num:]
    train_labels = ([0] * (len(low_imgs) - val_sample_num) +
                    [1] * (len(high_imgs) - val_sample_num))
    val_labels = [0]*val_sample_num + [1]*val_sample_num
    if rand_val:
        list_shuffle(val_labels)

    print('Training samples %s - %s'%(len(train_imgs), len(train_labels)))
    print('Validation samples %s - %s'%(len(val_imgs), len(val_labels)))

    return train_imgs, train_labels, val_imgs, val_labels
def source_data(data_info_file, img_dir, rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    #all_sample_num = 1500
    #train_sample_num = 1350
    all_sample_num = 1000
    train_sample_num = 900

    # read sample info
    all_info = open(data_info_file).readlines()
    all_info.pop(0)
    all_info = [line.strip().split(',') for line in all_info]
    # select specific gender samples
    if gender == 'm':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 1]
    elif gender == 'f':
        all_info = [line for line in all_info if int(line[1][16]) % 2 == 0]
    else:
        pass
    imgs = [os.path.join(img_dir, line[2]) for line in all_info]
    vals = [float(line[3]) for line in all_info]
    sorted_idx = np.argsort(vals)
    low_part = sorted_idx[0:all_sample_num]
    high_part = sorted_idx[(-1 * all_sample_num):]
    low_imgs = [imgs[i] for i in low_part]
    high_imgs = [imgs[i] for i in high_part]
    list_shuffle(low_imgs)
    list_shuffle(high_imgs)
    train_imgs = low_imgs[:train_sample_num] + high_imgs[:train_sample_num]
    val_imgs = low_imgs[train_sample_num:] + high_imgs[train_sample_num:]
    train_labels = [0] * train_sample_num + [1] * train_sample_num
    val_labels = [0]*(all_sample_num-train_sample_num) + \
                 [1]*(all_sample_num-train_sample_num)
    if rand_val:
        list_shuffle(val_labels)

    return train_imgs, train_labels, val_imgs, val_labels
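Each loader in this example accepts rand_val, which shuffles the validation labels while leaving the images in place. That reads like a label-permutation control: once the image-label pairing is broken, a classifier should score near chance on the validation set. A minimal illustration (values are illustrative only):

from random import shuffle as list_shuffle

val_labels = [0] * 100 + [1] * 100  # balanced two-class validation labels
list_shuffle(val_labels)            # decoupled from val_imgs -> ~50% expected accuracy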
def source_landmark_with_age_sampling(data_file, rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    sample_num = 100
    small_sample_num = 75

    # read sample info
    all_info = open(data_file).readlines()
    all_info = [line.strip().split(',') for line in all_info]
    # select specific gender samples
    if gender == 'm':
        all_info = [line for line in all_info
                    if int(line[0].split('_')[0][16]) % 2 == 1]
    elif gender == 'f':
        all_info = [line for line in all_info
                    if int(line[0].split('_')[0][16]) % 2 == 0]
    else:
        pass
    # get landmarks and IQs
    landmarks = [[float(line[2+i]) for i in range(144)] for line in all_info]
    vals = [float(line[1]) for line in all_info]
    ages = []
    for line in all_info:
        birth_year = int(line[0].split('_')[0][6:10])
        birth_month = int(line[0].split('_')[0][10:12])
        a = 2008 - birth_year + (12-birth_month)*1.0/12
        ages.append(a)
   
    # select samples within each age group
    landmark_list = []
    label_list = []
    for a in np.unique(ages):
        tmp_landmarks = [landmarks[i] for i in range(len(ages)) if ages[i]==a]
        tmp_vals = [vals[i] for i in range(len(ages)) if ages[i]==a]
        if len(tmp_landmarks)<200:
            snum = small_sample_num
        else:
            snum = sample_num
        # select top- and bottom- part of samples
        sorted_idx = np.argsort(tmp_vals)
        low_part = sorted_idx[:snum]
        high_part = sorted_idx[-snum:]
        sel_idx = np.concatenate((low_part, high_part))
        tmp_landmarks = [tmp_landmarks[i] for i in sel_idx]
        tmp_labels = [0]*snum + [1]*snum
        # shuffle the sample parts
        rand_idx = list(range(len(tmp_landmarks)))   # materialize for in-place shuffle
        list_shuffle(rand_idx)
        tmp_landmarks = [tmp_landmarks[i] for i in rand_idx]
        tmp_labels = [tmp_labels[i] for i in rand_idx]
        landmark_list.append(tmp_landmarks)
        label_list.append(tmp_labels)

    # select two subsets of the age groups as validation dataset
    group_idx = list(range(len(landmark_list)))   # range is immutable in Python 3
    list_shuffle(group_idx)
    landmark_list = [landmark_list[i] for i in group_idx]
    label_list = [label_list[i] for i in group_idx]
    val_landmarks = [item for line in landmark_list[:2] for item in line]
    val_labels = [item for line in label_list[:2] for item in line]
    train_landmarks = [item for line in landmark_list[2:] for item in line]
    train_labels = [item for line in label_list[2:] for item in line]

    if rand_val:
        list_shuffle(val_labels)

    print('Training samples %s'%(len(train_landmarks)))
    print('Validation samples %s'%(len(val_landmarks)))

    return train_landmarks, train_labels, val_landmarks, val_labels
def source_data_with_age_sampling(data_info_file, img_dir, sample_num,
                                  small_sample_num, val_set_idx,
                                  rand_val=False, gender=None):
    """Read sample information, get split train- and test-dataset."""
    # config sample number per class
    #sample_num = 100
    #small_sample_num = 75

    # read sample info
    all_info = open(data_info_file).readlines()
    all_info.pop(0)
    all_info = [line.strip().split(',') for line in all_info]
    # select specific gender samples
    if gender=='m':
        all_info = [line for line in all_info if int(line[0][16])%2==1]
    elif gender=='f':
        all_info = [line for line in all_info if int(line[0][16])%2==0]
    else:
        pass

    # remove samples with glasses
    all_info = [line for line in all_info if not int(line[1])]

    # remove the 2 students born in 2006
    all_info = [line for line in all_info if int(line[0][6:10]) != 2006]

    print('%s samples selected'%(len(all_info)))

    imgs = []
    for line in all_info:
        tmp = []
        for item in line[6:]:
            tmp.append(os.path.join(img_dir, item))
        imgs.append(tmp)
    # index 1: graphic reasoning, 2: expanding graph
    vals = [float(line[2]) for line in all_info]
    ages = []
    for line in all_info:
        birth_year = int(line[0][6:10])
        birth_month = int(line[0][10:12])
        a = 2008 - birth_year + (12-birth_month)*1.0/12
        ages.append(a)
   
    # select samples within each age group
    img_list = []
    label_list = []
    unique_ages = np.unique(ages)
    for a in unique_ages:
        tmp_imgs = [imgs[i] for i in range(len(ages)) if ages[i]==a]
        tmp_vals = [vals[i] for i in range(len(ages)) if ages[i]==a]
        if len(tmp_imgs)<(2*sample_num):
            snum = small_sample_num
        else:
            snum = sample_num
        # select top- and bottom- part of samples
        sorted_idx = np.argsort(tmp_vals)
        low_part = sorted_idx[:snum]
        high_part = sorted_idx[-snum:]
        sel_idx = np.concatenate((low_part, high_part))
        tmp_imgs = [tmp_imgs[i] for i in sel_idx]
        tmp_labels = [0]*snum + [1]*snum
        # shuffle the sample parts
        rand_idx = list(range(len(tmp_imgs)))
        list_shuffle(rand_idx)
        tmp_imgs = [tmp_imgs[i] for i in rand_idx]
        tmp_labels = [tmp_labels[i] for i in rand_idx]
        img_list.append(tmp_imgs)
        label_list.append(tmp_labels)

    # hold out one age group (indexed by val_set_idx) as the validation dataset;
    # an earlier variant, kept commented out below, held out pairs of groups
    val_set_ids = []
    #for i in range(len(img_list)):
    #    for j in range(i+1, len(img_list)):
    #        val_set_ids.append([i, j])
    #val_set_ids = val_set_ids[val_set_idx]
    #set_ids = range(len(img_list))
    #train_set_ids = [item for item in set_ids if not item in val_set_ids]
    #print 'Age groups for validation: %s and %s'%(unique_ages[val_set_ids[0]],
    #                                              unique_ages[val_set_ids[1]])
    for i in range(len(img_list)):
        val_set_ids.append([i])
    val_set_ids = val_set_ids[val_set_idx]
    set_ids = range(len(img_list))
    train_set_ids = [item for item in set_ids if item not in val_set_ids]
    print('Age group for validation: %s' % unique_ages[val_set_ids[0]])


    val_imgs = [line for i in val_set_ids
                     for line in img_list[i]]
    val_labels = [item for i in val_set_ids
                       for item in label_list[i]]
    train_imgs = [line for i in train_set_ids
                       for line in img_list[i]]
    train_labels = [item for i in train_set_ids
                         for item in label_list[i]]

    # make a full size image array
    train_mlen = max([len(line) for line in train_imgs])
    train_imgs = [line+['null']*(train_mlen-len(line)) for line in train_imgs]
    val_mlen = max([len(line) for line in val_imgs])
    val_imgs = [line+['null']*(val_mlen-len(line)) for line in val_imgs]

    if rand_val:
        list_shuffle(val_labels)

    print('Training samples %s'%(len(train_imgs)))
    print('Validation samples %s'%(len(val_imgs)))

    return train_imgs, train_labels, val_imgs, val_labels
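A hypothetical invocation, assuming a header-bearing CSV of subject info and an image directory (both path arguments are made up):

train_imgs, train_labels, val_imgs, val_labels = source_data_with_age_sampling(
    'subject_info.csv', 'face_imgs/',
    sample_num=100, small_sample_num=75,
    val_set_idx=0, rand_val=False, gender='m')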
Example #10
    def shuffle(self):
        list_shuffle(self.cards)
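This two-line method presumably lives on a card-container class. A minimal self-contained sketch (the Deck class and its card encoding are assumptions):

from random import shuffle as list_shuffle

class Deck:
    def __init__(self):
        # 52 cards as (rank, suit) pairs
        self.cards = [(rank, suit) for suit in 'SHDC' for rank in range(1, 14)]

    def shuffle(self):
        list_shuffle(self.cards)  # uniform in-place shuffle (Fisher-Yates)

deck = Deck()
deck.shuffle()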