Example No. 1
def get_image_matrices(train_imagepath, test_imagepath, trainDF, valDF,
                       testDF):
    '''
    load images from pkl files and convert to matrices
    '''

    plog("Loading train image features from %s..." % train_imagepath)
    with open(train_imagepath, 'rb') as f:
        imageDF = pkl.load(f)

    if test_imagepath is not None:
        plog("Loading test image features from %s..." % test_imagepath)
        with open(test_imagepath, 'rb') as f:
            test_imageDF = pkl.load(f)

        test_image_matrix = test_imageDF.as_matrix()
        test_image_matrix = test_image_matrix[:testDF.shape[0], :]
        assert test_image_matrix.shape[0] == testDF.shape[0]
    else:
        test_image_matrix = None

    image_matrix = imageDF.as_matrix()
    train_image_matrix = image_matrix[:trainDF.shape[0], :]
    val_image_matrix = image_matrix[trainDF.shape[0]:trainDF.shape[0] +
                                    valDF.shape[0], :]

    return (train_image_matrix, val_image_matrix, test_image_matrix)
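
A minimal usage sketch follows, assuming trainDF, valDF, and testDF have already been built (see shuffle_and_downsample and train_val_split in the main() examples below) and that the feature pickles exist; datadir and the file names are illustrative, taken from the debug branch of main().

# Illustrative call only -- datadir, the pickle names and the dataframes are assumptions.
datadir = '../data/'
train_imgs, val_imgs, test_imgs = get_image_matrices(
    datadir + 'train_image_features_0_2500.pkl',
    datadir + 'test_image_features_0_2500.pkl',
    trainDF, valDF, testDF)
plog("image matrix shapes: %s %s %s" %
     (train_imgs.shape, val_imgs.shape, test_imgs.shape))
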
Example No. 2
def build_text_matrices(datadir, tokenizer_path, trainDF, valDF, testDF):
    '''
    use bag-of-words representation to convert descriptions into bag-of-words matrices
    '''
    plog("Building text matrices...")
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pkl.load(f)
    train_text_matrix_path = datadir + 'train_text.pkl'
    val_text_matrix_path = datadir + 'val_text.pkl'
    test_text_matrix_path = datadir + 'test_text.pkl'

    bow_train, idx_train = bag_of_words.series_to_bag_of_words(
        trainDF.description_clean,
        tokenizer,
        train_text_matrix_path,
        mode="binary")
    bow_val, idx_val = bag_of_words.series_to_bag_of_words(
        valDF.description_clean,
        tokenizer,
        val_text_matrix_path,
        mode="binary")
    bow_test, idx_test = bag_of_words.series_to_bag_of_words(
        testDF.description_clean,
        tokenizer,
        test_text_matrix_path,
        mode="binary")

    plog("bow_train type: %s" % type(bow_train))
    return (bow_train, bow_val, bow_test)
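
A usage sketch of the same function, mirroring the call made in the main() example further down; the tokenizer file name comes from that call, everything else is illustrative.

# Illustrative call only -- datadir, the tokenizer pickle and the dataframes are assumptions.
datadir = '../data/'
bow_train, bow_val, bow_test = build_text_matrices(
    datadir, 'tokenizer_5000.pkl', trainDF, valDF, testDF)
plog("bow_train shape: %s" % str(bow_train.shape))
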
def prepDFs(datadir,
            train_samples=10000,
            test_samples=1000,
            val_portion=0.1,
            debug=False):
    '''
    1. run train_val_split on training
    1b. run shuffle on test
    2. if text:
        a. train tokenizer
        b. convert text data to bag of words matrix
    3. if images:
        a. extract image data
    4. merge datasets

    returns: X_train,y_train,X_val,y_val,X_test,y_test
    '''
    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'

    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)

    trainDF = shuffle_and_downsample(trainDF, train_samples)
    testDF = shuffle_and_downsample(testDF, test_samples)
    return trainDF, testDF
Example No. 4
def prepDFs(datadir,
            train_samples=10000,
            test_samples=1000,
            val_portion=0.1,
            debug=False):
    '''
    1. run train_val_split on training
    1b. run shuffle on test
    2. if text:
        a. train tokenizer
        b. convert text data to bag of words matrix
    3. if images:
        a. extract image data
    4. merge datasets

    returns: X_train,y_train,X_val,y_val,X_test,y_test
    '''
    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'

    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)

    trainDF = shuffle_and_downsample(trainDF, train_samples)
    testDF = shuffle_and_downsample(testDF, test_samples)
    return trainDF, testDF
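
A sketch of calling prepDFs on the small debug CSVs; the data directory is an assumption.

# Illustrative only: debug=True loads head_train_set.csv / head_test_set.csv
# and downsamples both frames to 90 rows.
trainDF, testDF = prepDFs('../data/', debug=True)
plog("train shape: %s, test shape: %s" % (str(trainDF.shape), str(testDF.shape)))
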
def build_image_network():
    '''
    builds CNN for image feature extraction
    CNN is designed to match the pretrained network from VGG

    returns:
        network
    '''

    plog("Building lasagne net...")
    net = {}
    net['input'] = InputLayer((None, 3, 224, 224))
    net['conv1'] = ConvLayer(net['input'], num_filters=96, filter_size=7, stride=2)
    net['norm1'] = NormLayer(net['conv1'], alpha=0.0001) # caffe has alpha = alpha * pool_size
    net['pool1'] = PoolLayer(net['norm1'], pool_size=3, stride=3, ignore_border=False)
    net['conv2'] = ConvLayer(net['pool1'], num_filters=256, filter_size=5)
    net['pool2'] = PoolLayer(net['conv2'], pool_size=2, stride=2, ignore_border=False)
    net['conv3'] = ConvLayer(net['pool2'], num_filters=512, filter_size=3, pad=1)
    net['conv4'] = ConvLayer(net['conv3'], num_filters=512, filter_size=3, pad=1)
    net['conv5'] = ConvLayer(net['conv4'], num_filters=512, filter_size=3, pad=1)
    net['pool5'] = PoolLayer(net['conv5'], pool_size=3, stride=3, ignore_border=False)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['drop6'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['drop6'], num_units=4096)
    net['drop7'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['drop7'], num_units=1000, nonlinearity=lasagne.nonlinearities.softmax)
    output_layer = net['fc8']
    lasagne.layers.set_all_param_values(output_layer, PRETRAINED_VGG['values'])
    return net
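
The layers above mirror the pretrained VGG CNN-S topology, so the loaded weights line up layer by layer. A common next step is to compile a Theano function that returns activations from one of the fully connected layers as image features; a hedged sketch (the choice of 'fc7' and the batch shape are assumptions, not part of the original):

# Sketch: expose 4096-dim 'fc7' activations as image features.
import theano
import lasagne

net = build_image_network()
input_var = net['input'].input_var                                     # symbolic input tensor
features = lasagne.layers.get_output(net['fc7'], deterministic=True)   # dropout disabled
extract_features = theano.function([input_var], features)

# images: float32 array of shape (batch, 3, 224, 224), already mean-subtracted
# feature_matrix = extract_features(images)
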
Example No. 6
    def __init__(self, port, cfgs, reload_sign):
        self.port: int = port
        self.cfgs = cfgs
        self.reload_sign: queue.Queue = reload_sign

        self.domain_match_dict = self.make_domain_match_dict()
        plog(self.domain_match_dict)

        self.ssl_ctx = {}
        self.default_ssl_ctx = None
        self.load_ssl_ctxs()
Example No. 7
def main():
    conf_file = sys.argv[1]
    conf = load_config(conf_file)
    while 1:
        try:
            do_main(conf)
            sleep_minute = conf.get("sleep", 30)
            plog(u"Next check will run in %d minutes", sleep_minute)
            time.sleep(sleep_minute * 60)
        except KeyboardInterrupt, _:
            sys.exit(0)
        except Exception, e:
            plog(str(e.message))
            time.sleep(60)
def build_text_matrices(datadir, tokenizer_path, trainDF, valDF, testDF):
    '''
    use bag-of-words representation to convert descriptions into bag-of-words matrices
    '''
    plog("Building text matrices...")
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pkl.load(f)
    train_text_matrix_path = datadir + 'train_text.pkl'
    val_text_matrix_path = datadir + 'val_text.pkl'
    test_text_matrix_path = datadir + 'test_text.pkl'

    bow_train, idx_train = bag_of_words.series_to_bag_of_words(
        trainDF.description_clean, tokenizer, train_text_matrix_path, mode="binary")
    bow_val, idx_val = bag_of_words.series_to_bag_of_words(
        valDF.description_clean, tokenizer, val_text_matrix_path, mode="binary")
    bow_test, idx_test = bag_of_words.series_to_bag_of_words(
        testDF.description_clean, tokenizer, test_text_matrix_path, mode="binary")

    plog("bow_train type: %s" % type(bow_train))
    return (bow_train, bow_val, bow_test)
Example No. 9
def conditional_hstack(other, bow, image, dataset_name):
    '''
    assumes other is present.
    if bag of words is not none, hstack it
    if image is not None, hstack it
    '''
    if other is not None:
        X = other
        if bow is not None:
            assert bow.shape[0] == X.shape[0]
            X = np.hstack((X, bow))
        else:
            plog("Bag of words data missing from %s" % dataset_name)
        if image is not None:
            assert image.shape[0] == X.shape[0]
            X = np.hstack((X, image))
        else:
            plog("Image data missing from %s" % dataset_name)
    return X
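
A tiny illustration of the stacking behaviour with toy arrays. Note that the function relies on other being present, as the docstring says; if other were None, X would be unbound at the return.

# Toy example -- the arrays here are made up.
import numpy as np

other = np.ones((4, 2))    # e.g. one-hot brand features
bow = np.zeros((4, 3))     # e.g. bag-of-words features
merged = conditional_hstack(other, bow, None, 'toy')  # logs that image data is missing
assert merged.shape == (4, 5)
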
def build_brand_matrices(trainDF, valDF, testDF):
    '''
    one-hot encode brand indexes
    '''
    brand_list = get_brand_index(trainDF, valDF, testDF)
    with open(datadir + 'brand_list.pkl', 'wb') as f:
        pkl.dump(brand_list, f)

    plog("Building brand matrices...")
    enc = OneHotEncoder()
    train_vect = np.reshape(trainDF.brand_num.values, (-1, 1))
    brands_train = enc.fit_transform(train_vect).toarray()

    val_vect = np.reshape(valDF.brand_num.values, (-1, 1))
    brands_val = enc.transform(val_vect).toarray()

    test_vect = np.reshape(testDF.brand_num.values, (-1, 1))
    brands_test = enc.transform(test_vect).toarray()
    return (brands_train, brands_val, brands_test)
def merge_data(bows, images, others):
    '''
    merge together the datasets to be used in the model
    args:
        sets: list of datasets to be used
    returns: 2D float32 numpyarrays
    '''
    #HACK: splitting None into 3
    if bows is None:
        bows = (None, None, None)
    if images is None:
        images = (None, None, None)

    plog("Merging data...")
    X_train = conditional_hstack(others[0], bows[0], images[0], 'train')
    X_val = conditional_hstack(others[1], bows[1], images[1], 'val')
    X_test = conditional_hstack(others[2], bows[2], images[2], 'test')

    return X_train.astype(np.float32), X_val.astype(np.float32), X_test.astype(np.float32)
def conditional_hstack(other, bow, image, dataset_name):
    '''
    assumes other is present.
    if bag of words is not None, hstack it
    if image is not None, hstack it
    '''
    if other is not None:
        X = other
        if bow is not None:
            assert bow.shape[0] == X.shape[0]
            X = np.hstack((X, bow))
        else:
            plog("Bag of words data missing from %s" % dataset_name)
        if image is not None:
            assert image.shape[0] == X.shape[0]
            X = np.hstack((X, image))
        else:
            plog("Image data missing from %s" % dataset_name)
    return X
Example No. 13
def build_brand_matrices(trainDF, valDF, testDF):
    '''
    one-hot encode brand indexes
    '''
    brand_list = get_brand_index(trainDF, valDF, testDF)
    with open(datadir + 'brand_list.pkl', 'wb') as f:
        pkl.dump(brand_list, f)

    plog("Building brand matrices...")
    enc = OneHotEncoder()
    train_vect = np.reshape(trainDF.brand_num.values, (-1, 1))
    brands_train = enc.fit_transform(train_vect).toarray()

    val_vect = np.reshape(valDF.brand_num.values, (-1, 1))
    brands_val = enc.transform(val_vect).toarray()

    test_vect = np.reshape(testDF.brand_num.values, (-1, 1))
    brands_test = enc.transform(test_vect).toarray()
    return (brands_train, brands_val, brands_test)
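
For reference, a standalone sketch of the one-hot step: the encoder is fit on the reshaped training brand indices and then reused on val/test, so a brand index unseen in training would raise during transform under the default settings (a behaviour note, not something the original states).

# Standalone sketch with made-up brand indices.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
train_brands = np.array([0, 2, 1, 2]).reshape(-1, 1)
val_brands = np.array([1, 0]).reshape(-1, 1)

onehot_train = enc.fit_transform(train_brands).toarray()  # shape (4, 3)
onehot_val = enc.transform(val_brands).toarray()          # shape (2, 3)
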
Example No. 14
def merge_data(bows, images, others):
    '''
    merge together the datasets to be used in the model
    args:
        sets: list of datasets to be used
    returns: 2D float32 numpyarrays
    '''
    #HACK: splitting None into 3
    if bows is None:
        bows = (None, None, None)
    if images is None:
        images = (None, None, None)

    plog("Merging data...")
    X_train = conditional_hstack(others[0], bows[0], images[0], 'train')
    X_val = conditional_hstack(others[1], bows[1], images[1], 'val')
    X_test = conditional_hstack(others[2], bows[2], images[2], 'test')

    return X_train.astype(np.float32), X_val.astype(np.float32), X_test.astype(
        np.float32)
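
A sketch tying the builders together in the same order main() uses them; each argument is the (train, val, test) triple returned by the corresponding builder, or None when that modality is switched off.

# Illustrative only -- assumes trainDF/valDF/testDF and datadir already exist.
bow_data = build_text_matrices(datadir, 'tokenizer_5000.pkl', trainDF, valDF, testDF)
image_data = None  # e.g. when use_images is False
other_data = build_brand_matrices(trainDF, valDF, testDF)
X_train, X_val, X_test = merge_data(bow_data, image_data, other_data)
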
def get_image_matrices(train_imagepath, test_imagepath, trainDF, valDF, testDF):
    '''
    load images from pkl files and convert to matrices
    '''

    plog("Loading train image features from %s..." % train_imagepath)
    with open(train_imagepath, 'rb') as f:
        imageDF = pkl.load(f)

    if test_imagepath is not None:
        plog("Loading test image features from %s..." % test_imagepath)
        with open(test_imagepath, 'rb') as f:
            test_imageDF = pkl.load(f)

        test_image_matrix = test_imageDF.as_matrix()
        test_image_matrix = test_image_matrix[:testDF.shape[0], :]
        assert test_image_matrix.shape[0] == testDF.shape[0]
    else:
        test_image_matrix = None

    image_matrix = imageDF.as_matrix()
    train_image_matrix = image_matrix[:trainDF.shape[0], :]
    val_image_matrix = image_matrix[trainDF.shape[0]:trainDF.shape[0] + valDF.shape[0], :]

    return (train_image_matrix, val_image_matrix, test_image_matrix)
def get_selected_image_features(df,
                                datadir,
                                dataset,
                                iloc0,
                                iloc1,
                                save_freq,
                                out_pickle_name='image_features.pkl',
                                batch_size=256,
                                width=224,
                                filetype='jpg'):
    '''
    for a given index range, download and resize the images,
    then save to directory

    args:
        df: dataframe where image urls are
        iloc0: int or None. first iloc of range of images to download
        iloc1: int or None. last iloc of range of images to download
        save_freq: how many batches before saving
        out_pickle_name: name of outfile
        batch_size: rows per batch
        dataset: string 'train' or 'test' or other identifier

    returns:
        none
    '''
    plog("Beginning feature extraction...")
    assert iloc0 <= df.shape[0]
    assert iloc1 <= df.shape[0]
    image_urls = df.large_image_URL.iloc[iloc0:iloc1]
    iloc = iloc0
    prev_iloc = iloc0
    batch_num = 0
    featureDF = pd.DataFrame()

    for batch in iterate_minibatches(image_urls, batch_size):
        plog("extracting image features for batch %i, iloc %i" % (batch_num, iloc))
        batch_featureDF = batch_extract_features(batch, dataset, datadir, width, filetype)
        featureDF = featureDF.append(batch_featureDF, verify_integrity=True)

        iloc += batch_size
        batch_num += 1

        if iloc > iloc0 and (batch_num % save_freq == 0 or iloc >= iloc1 - 1):
            plog("Saving from image iloc %i to image iloc %i" % (prev_iloc, iloc))
            #Append to csv here
            with open('csv_fn.csv', 'a') as outf:
                featureDF.to_csv(outf, header=False)

            #with open(datadir+out_pickle_name + '_' + str(prev_iloc) + '_' + str(iloc)+'.pkl','wb') as outf:
            #    pkl.dump(featureDF,outf)  
            prev_iloc = iloc

            #reset featureDF to save memory
            featureDF = pd.DataFrame()
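
The helpers iterate_minibatches and batch_extract_features are not shown in these examples. For context, a minimal hypothetical iterate_minibatches over a pandas Series might look like the sketch below; this is an assumption about the missing helper, not the original implementation.

def iterate_minibatches(series, batch_size):
    '''Yield consecutive slices of a pandas Series, each of length batch_size (the last may be shorter).'''
    for start in range(0, series.shape[0], batch_size):
        yield series.iloc[start:start + batch_size]
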
Example No. 17
def do_main(conf):
    for info in conf.get("domains"):
        domain = info["domain"]
        hosts = info["hosts"]
        plog(u"Checking domain: %s", domain)
        # pick fastest domain
        fastest_host = pick_fastest_ping(hosts)
        plog(u"Fastest responding host: %s", fastest_host)

        # update_record
        new_value = update_record(domain, fastest_host)
        plog(u"New record value: %s => %s", domain, new_value)
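
pick_fastest_ping and update_record are defined elsewhere in that project. As a rough illustration of the selection step only, the hypothetical sketch below times a TCP connect rather than a true ICMP ping; the port and timeout are assumptions.

# Hypothetical stand-in for pick_fastest_ping -- not the original implementation.
import socket
import time

def pick_fastest_ping(hosts, port=80, timeout=3):
    '''Return the host that accepts a TCP connection fastest, or None if all fail.'''
    best_host, best_time = None, None
    for host in hosts:
        start = time.time()
        try:
            socket.create_connection((host, port), timeout=timeout).close()
        except socket.error:
            continue
        elapsed = time.time() - start
        if best_time is None or elapsed < best_time:
            best_host, best_time = host, elapsed
    return best_host
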
Example No. 18
'''
main.py
End-to-end script for running all processes.  
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog, fplog
create_log(__file__)
import sys

#Command-line arguments
if len(sys.argv) < 2:
    plog("Usage: python main.py [num_train_samples] [use_images|use_text]")
    sys.exit()
else:
    train_samples = int(sys.argv[1])
    if 'use_images' in sys.argv:
        use_images = True
    else:
        use_images = False
    if 'use_text' in sys.argv:
        use_text = True
    else:
        use_text = False

plog('importing main.py modules...')
import os
import data_prep
import models
import pdb
from datetime import datetime
Example No. 19
#! python3
from utils import getIntegers, plog
logfile = r"\\192.168.99.91\shares\scripts\BartenderPrint\testlog_.log"

cnt = 0
with open(logfile, "r") as f:
    for line in f:
        cnt += getIntegers(line)[6]
print(cnt)
plog(
    logfile, "Total for February: " + str(cnt) +
    " thermal labels printed for the groups, which amounts to " +
    str(int(cnt / 300)) + " rolls or " + str(cnt / 300 / 60) + " boxes")
plog(
    logfile,
    "these totals do not include thermal labels used by the loaders or for other purposes, such as stickers for vegetables or fruit"
)
'''
data_prep.py

Starting with the csv's, ending with X_train, y_train, X_val, y_val, X_test, y_test
Where X's are feature vectors and y's are classifier integers
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog
create_log(__file__)

plog('importing modules...')
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pdb
import cPickle as pkl
import bag_of_words
from sklearn.preprocessing import OneHotEncoder


def shuffle_and_downsample(df, samples):
    '''
    shuffle dataframe, including previous indexes, then downsample
    args:
        samples: number of samples
    '''
    #random seed 9 makes sure we always get the same shuffle.
    np.random.seed(9)
    assert df.shape[0] > 2
Example No. 21
def main(datadir,
         train_samples=10000,
         test_samples=1000,
         val_portion=0.1,
         use_images=True,
         use_text=True,
         train_image_fn='train_image_features_0_2500.pkl',
         test_image_fn='test_image_features_0_2500.pkl',
         debug=False):
    '''
    1. run train_val_split on training
    1b. run shuffle on test
    2. if text:
        a. train tokenizer
        b. convert text data to bag of words matrix
    3. if images:
        a. extract image data
    4. merge datasets

    returns: X_train,y_train,X_val,y_val,X_test,y_test
    '''

    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_imagepath = datadir + 'train_image_features_0_2500.pkl'
        test_imagepath = datadir + 'test_image_features_0_2500.pkl'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'
        train_imagepath = datadir + train_image_fn
        test_imagepath = datadir + test_image_fn

    dstart = datetime.now()
    plog("Checking to see if prepped data already available...")
    outpath = datadir + 'model_data_%i_%r_%s_%s.pkl' % (
        train_samples, val_portion, use_images, use_text)
    if os.path.exists(outpath):
        plog("Data found.  Loading...")
        with open(outpath, 'rb') as f:
            data, n_values = pkl.load(f)

        dfin = datetime.now()
        plog("Data loading time: %s" % (dfin - dstart))
        return data, n_values

    plog("Prepped data not available.  Preparing data...")

    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)

    trainDF = shuffle_and_downsample(trainDF, train_samples)
    trainDF, valDF = train_val_split(trainDF, val_portion)
    testDF = shuffle_and_downsample(testDF, test_samples)
    #Load text data
    t0 = datetime.now()
    if use_text:
        bow_data = build_text_matrices(datadir, 'tokenizer_5000.pkl', trainDF,
                                       valDF, testDF)
        t1 = datetime.now()
        plog("Time to load text: %s" % str(t1 - t0))
    else:
        bow_data = None

    #Load image data
    t1 = datetime.now()
    if use_images:
        image_data = get_image_matrices(train_imagepath, test_imagepath,
                                        trainDF, valDF, testDF)
        t2 = datetime.now()
        plog("Time to load images: %s" % str(t2 - t1))
    else:
        image_data = None

    #Load other data
    y1_train, y2_train, y3_train = get_targets(trainDF)
    y1_val, y2_val, y3_val = get_targets(valDF)
    y1_test, y2_test, y3_test = get_targets(testDF)

    other_data = build_brand_matrices(trainDF, valDF, testDF)

    X_train, X_val, X_test = merge_data(bow_data, image_data, other_data)

    train_data = X_train, y1_train, y2_train, y3_train
    val_data = X_val, y1_val, y2_val, y3_val
    test_data = X_test, y1_test, y2_test, y3_test

    keys = ['y_1', 'y_2', 'y_3']
    values = [max(d) + 1 for d in train_data[1:]]
    n_values = dict(zip(keys, values))
    data = (train_data, val_data, test_data)

    plog("Data loaded.  Saving to %s" % outpath)
    with open(outpath, 'wb') as f:
        pkl.dump((data, n_values), f)

    dfin = datetime.now()
    plog("Data loading time: %s" % (dfin - dstart))

    return data, n_values
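
This is the function that the main.py script drives; a sketch of the call, with datadir and the keyword values as illustrative choices.

# Illustrative only -- mirrors how the main.py script would invoke data_prep.main.
import data_prep

data, n_values = data_prep.main('../data/',
                                train_samples=10000,
                                use_images=True,
                                use_text=True)
train_data, val_data, test_data = data
X_train, y1_train, y2_train, y3_train = train_data
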
def main(datadir,
         train_samples=10000,
         test_samples=1000,
         val_portion=0.1,
         use_images=True,
         use_text=True,
         train_image_fn='train_image_features_0_2500.pkl',
         test_image_fn='test_image_features_0_2500.pkl',
         debug=False):
    '''
    1. run train_val_split on training
    1b. run shuffle on test
    2. if text:
        a. train tokenizer
        b. convert text data to bag of words matrix
    3. if images:
        a. extract image data
    4. merge datasets

    returns: X_train,y_train,X_val,y_val,X_test,y_test
    '''

    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_imagepath = datadir + 'train_image_features_0_2500.pkl'
        test_imagepath = datadir + 'test_image_features_0_2500.pkl'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'
        train_imagepath = datadir + train_image_fn
        test_imagepath = datadir + test_image_fn

    dstart = datetime.now()
    plog("Checking to see if prepped data already available...")
    outpath = datadir + 'model_data_%i_%r_%s_%s.pkl' % (
        train_samples, val_portion, use_images, use_text)
    if os.path.exists(outpath):
        plog("Data found.  Loading...")
        with open(outpath, 'rb') as f:
            data, n_values = pkl.load(f)

        dfin = datetime.now()
        plog("Data loading time: %s" % (dfin - dstart))
        return data, n_values

    plog("Prepped data not available.  Preparing data...")

    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)

    trainDF = shuffle_and_downsample(trainDF, train_samples)
    trainDF, valDF = train_val_split(trainDF, val_portion)
    testDF = shuffle_and_downsample(testDF, test_samples)
    #Load text data
    t0 = datetime.now()
    if use_text:
        bow_data = build_text_matrices(datadir, 'tokenizer_5000.pkl', trainDF,
                                       valDF, testDF)
        t1 = datetime.now()
        plog("Time to load text: %s" % str(t1 - t0))
    else:
        bow_data = None

    #Load image data
    t1 = datetime.now()
    if use_images:
        image_data = get_image_matrices(train_imagepath, test_imagepath,
                                        trainDF, valDF, testDF)
        t2 = datetime.now()
        plog("Time to load images: %s" % str(t2 - t1))
    else:
        image_data = None

    #Load other data
    y1_train, y2_train, y3_train = get_targets(trainDF)
    y1_val, y2_val, y3_val = get_targets(valDF)
    y1_test, y2_test, y3_test = get_targets(testDF)

    other_data = build_brand_matrices(trainDF, valDF, testDF)

    X_train, X_val, X_test = merge_data(bow_data, image_data, other_data)

    train_data = X_train, y1_train, y2_train, y3_train
    val_data = X_val, y1_val, y2_val, y3_val
    test_data = X_test, y1_test, y2_test, y3_test

    keys = ['y_1', 'y_2', 'y_3']
    values = [max(d) + 1 for d in train_data[1:]]
    n_values = dict(zip(keys, values))
    data = (train_data, val_data, test_data)

    plog("Data loaded.  Saving to %s" % outpath)
    with open(outpath, 'wb') as f:
        pkl.dump((data, n_values), f)

    dfin = datetime.now()
    plog("Data loading time: %s" % (dfin - dstart))

    return data, n_values
Example No. 23
def stitch_files(basename, idx_start=0, idx_finish=None):
    '''
    Cycles through all files in the basename directory and stacks them together.
    args:
        basename: name without indexes, e.g. 'train_image_features'
        idx_start: starting index (usually 0)
        idx_finish: last index of the output file
    returns:
        none.  saves pickle of images stitched together
        
    '''
    #datadir = '../data/'
    datadir = '/scratch/cdg356/spring/data/'
    featuredir = datadir + basename + '/'
    
    #Get list of indexes
    iloc0_list = []
    iloc1_list = []
    for root, dirs, files in os.walk(featuredir):
        for fname in files:
            idx_range = get_indexes(fname)
            if idx_range[0] is not None and idx_range[1] is not None:
                if idx_range[0] >= idx_start and (idx_finish is None or idx_range[1] <= idx_finish):
                    iloc0_list.append(idx_range[0])
                    iloc1_list.append(idx_range[1])
    iloc0_list.sort()
    iloc1_list.sort()

    #Make sure there are no duplicates present
    assert len(iloc0_list) == len(set(iloc0_list))
    assert len(iloc1_list) == len(set(iloc1_list))

    #Make sure there are no gaps, i.e. that iloc1 of one file = iloc0 of the next
    for i in range(len(iloc0_list) - 1):
        assert iloc0_list[i + 1] == iloc1_list[i]

    #Load files
    for i, iloc0 in enumerate(iloc0_list):
        iloc1 = iloc1_list[i]
        fname = basename + "_%i_%i.pkl" % (iloc0, iloc1)
        plog("loading %s..." % fname)
        with open(featuredir + fname, 'rb') as f:
            if i == 0:
                df = pkl.load(f)
            else:
                df2 = pkl.load(f)
                df = pd.concat([df, df2])
        plog("df shape: %s" % str(df.shape))
    max_index = max(iloc1_list)

    # A couple sanity checks
    assert max_index == iloc1_list[-1]
    if idx_finish is not None:
        assert idx_finish == max_index
    assert idx_start == iloc0_list[0]

    outname = datadir + basename + '_%i_%i.pkl' % (idx_start, max_index)

    plog("writing to %s..." % outname)
    with open(outname, 'wb') as f:
        pkl.dump(df, f)
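
A usage sketch, using the file name pattern that the feature-extraction code writes and that main() later reads; the index range is illustrative.

# Illustrative only: stitches train_image_features_<i>_<j>.pkl chunks under
# the train_image_features/ feature directory into train_image_features_0_2500.pkl.
stitch_files('train_image_features', idx_start=0, idx_finish=2500)
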
#TODO: 
#batch these up into batches of 256 or 512 images
from utils import create_log, plog
create_log(__file__)

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import theano
import cPickle as pkl
import download_images_to_directory as dl
from datetime import datetime

plog("Theano device: %s" % theano.config.device)

#dnn requires GPU
import lasagne
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
from lasagne.layers import MaxPool2DLayer as PoolLayer
from lasagne.layers import LocalResponseNormalization2DLayer as NormLayer
from lasagne.utils import floatX

# ### Load the model parameters and metadata
def load_pretrained_model(datadir):
    plog("Loading vgg model...")
    model = pkl.load(open(datadir + 'vgg_cnn_s.pkl', 'rb'))
    #CLASSES = model['synset words']
    mean_image = model['mean image']
Example No. 25
        previoustime = os.path.getmtime(file)
    except:
        previoustime = os.path.getmtime(file)

# sort them - not needed
# compute the diff
    try:
        previousjson
        currentjson = utils.loadjson(file, quiet=True)
        currentcustomtime = ntpath.getmtime(file)
        #print(currentcustomtime)
        #print(currentjson["group1"]["lastnum"] > previousjson["group1"]["lastnum"])
        if currentjson["group1"]["lastnum"] > previousjson["group1"]["lastnum"]:
            utils.plog(
                logfile,
                "printed " + str(currentjson["group1"]["lastnum"] -
                                 previousjson["group1"]["lastnum"]) +
                " tags for group 1", currentcustomtime)
        if currentjson["group2"]["lastnum"] > previousjson["group2"]["lastnum"]:
            utils.plog(
                logfile,
                "printed " + str(currentjson["group2"]["lastnum"] -
                                 previousjson["group2"]["lastnum"]) +
                " tags for group 2", currentcustomtime)
        if currentjson["group3"]["lastnum"] > previousjson["group3"]["lastnum"]:
            utils.plog(
                logfile,
                "printed " + str(currentjson["group3"]["lastnum"] -
                                 previousjson["group3"]["lastnum"]) +
                " tags for group 3", currentcustomtime)
        if currentjson["group4"]["lastnum"] > previousjson["group4"]["lastnum"]:
Example No. 26
'''
data_prep.py

Starting with the csv's, ending with X_train, y_train, X_val, y_val, X_test, y_test
Where X's are feature vectors and y's are classifier integers
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog
create_log(__file__)

plog('importing modules...')
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pdb
import cPickle as pkl
import bag_of_words
from sklearn.preprocessing import OneHotEncoder


def shuffle_and_downsample(df, samples):
    '''
    shuffle dataframe, including previous indexes, then downsample
    args:
        samples: number of samples
    '''
    #random seed 9 makes sure we always get the same shuffle.
    np.random.seed(9)
    assert df.shape[0] > 2
def load_pretrained_model(datadir):
    plog("Loading vgg model...")
    model = pkl.load(open(datadir + 'vgg_cnn_s.pkl', 'rb'))
    #CLASSES = model['synset words']
    mean_image = model['mean image']
    return model, mean_image
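
The mean image returned here is normally subtracted from each input before it is fed to the VGG network; a hedged preprocessing sketch (the (3, 224, 224) mean-image shape and the batch layout are assumptions):

import numpy as np

model, mean_image = load_pretrained_model('../data/')

def preprocess(image_batch):
    '''Subtract the VGG mean image from a float32 batch shaped (n, 3, 224, 224).'''
    return (image_batch - mean_image[np.newaxis, :, :, :]).astype(np.float32)
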
Example No. 28
'''
main.py
End-to-end script for running all processes.  
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog, fplog
create_log(__file__)
import sys

#Command-line arguments
if len(sys.argv) < 2:
    plog("Usage: python main.py [num_train_samples] [use_images|use_text]")
    sys.exit()
else:
    train_samples = int(sys.argv[1])
    if 'use_images' in sys.argv:
        use_images = True
    else:
        use_images = False
    if 'use_text' in sys.argv:
        use_text = True
    else:
        use_text = False

plog('importing main.py modules...')
import os
import data_prep
import models
import pdb
from datetime import datetime