Example #1
def upload_csv(args, files):
  """ Takes in a csv and creates a Model around it.
  CSV needs to have a feature per column. Also needs to have at least one column marked as output by prepending 
  'output_' to the column name (first row in file). Types will be conservatively infered from the input (ie type will be
  string as long as one cell contains a non-numeric character).
  
  Files: Path to tmp CSV file on server (handled by the framework).
  Returns: A JSON with the model handle just created, and the infered feature types.
  """
  if 'upload' not in files:
    print 'File not specified in upload: ' + str(files)
    return 'No file specified'
  upload = files['upload']
  if not upload:
    return 'File not valid'
  name, ext = os.path.splitext(upload.filename)
  if ext != '.csv':
    return 'File extension not recognized'
  save_path = "/tmp/{name}".format(name=upload.filename)
  if not os.path.isfile(save_path):
    upload.save(save_path)
  
  if os.stat(save_path).st_size > 100000:
    return 'File too big'

  model = new_model()
  res = model.add_train_file(save_path)
  if res:
    delete_model(model.get_handle())
    return json.dumps({'status': 'ERROR', 'handle': model.get_handle(), 'why': res})
  save_model(model)
  
  return json.dumps({'status': 'OK', 'handle': model.get_handle(), 'types': model.types})
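For reference, a minimal sketch of a CSV this handler would accept and of how it might be invoked. The column names, file contents, and the FakeUpload wrapper are illustrative assumptions, not part of the actual framework.

import shutil

# Hypothetical CSV accepted by upload_csv: one feature per column, at least one
# column prefixed with 'output_' in the header row, e.g.:
#   sqft,bedrooms,neighborhood,output_price
#   700,1,downtown,950
#   1200,3,suburbs,1400
# 'sqft' and 'bedrooms' would be inferred as numeric, 'neighborhood' as a string.

class FakeUpload(object):
  """ Stand-in for the framework's upload object; implements only what upload_csv touches. """
  def __init__(self, filename, path):
    self.filename = filename
    self._path = path

  def save(self, dst):
    shutil.copy(self._path, dst)

# response = upload_csv(args={}, files={'upload': FakeUpload('listings.csv', '/path/to/listings.csv')})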
Example #2
def train(air_model, train_epochs=20):
    """ Runs TPE black box optimization of the neural network to use.
  After evaluating all points, it saves the best model to disk and sets the status flag as TRAINED.
  """
    from db import get_model, save_model
    from model import ModelStatus
    info('Running training in a new process')
    air_model.status = ModelStatus.TRAINING
    save_model(air_model)

    fspace = {
        'optimizer':
        hp.choice('optimzer', [
            'rmsprop', 'adagrad'
        ]),  #NEQP (I assume so, but is it on purpose that it says 'optimzer'?)
        'layers':
        hp.choice('layers', [(str(x), layer_choice(x))
                             for x in range(10)])  # Choose from 0 to 9 layers.
    }

    if config.DISTRIBUTED_HYPEROPT:
        # TODO: Probably don't send the whole model as JSON. Just send the IDs and have the worker fetch it from the DB.
        fspace['model_json'] = air_model.to_json()
        trials = MongoTrials('mongo://localhost:27017/testdb/jobs',
                             exp_key='userid.trainingid',
                             workdir='/home/paezand/pusher/bottle_air')
        best = fmin(fn=run_model_fn,
                    space=fspace,
                    trials=trials,
                    algo=tpe.suggest,
                    max_evals=train_epochs)
        # Run workers with
        # hyperopt-mongo-worker --mongo=$mongodbURL/testdb --poll-interval=0.1 --workdir=$bottle_air_dir
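        # With MongoTrials, fmin only enqueues candidate points in the 'jobs' collection of testdb;
        # the hyperopt-mongo-worker processes started with the command above pull those jobs,
        # evaluate run_model_fn, and write the results back, so several workers can search in parallel.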
    else:
        trials = Trials()  #NEQP (Did you check out the option of doing a parallel search with MongoDB?)
        best = fmin(fn=air_model.run_model(),
                    space=fspace,
                    algo=tpe.suggest,
                    max_evals=train_epochs,
                    trials=trials)

    print 'best:', space_eval(fspace, best)

    print 'trials:'
    for trial in trials.trials[:2]:
        print trial

    model_fn = air_model.run_model(persist=True)
    model_fn(space_eval(fspace, best))  # Train and persist best model.

    print 'Training finished'
    air_model.status = ModelStatus.TRAINED
    air_model.best_model = best
    save_model(air_model)
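The fmin / space_eval flow above can be hard to follow without a concrete objective, so here is a minimal, self-contained sketch; the search space, the toy_objective function, and its made-up loss formula are illustrative assumptions, not part of the real training code.

from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK

# Toy search space, mirroring the dict-of-hp.choice structure used in train().
toy_space = {
    'optimizer': hp.choice('optimizer', ['rmsprop', 'adagrad']),
    'width': hp.choice('width', [16, 32, 64]),
}

def toy_objective(params):
    # Stand-in for run_model_fn: pretend narrower nets with rmsprop do best.
    loss = params['width'] / 64.0 + (0.0 if params['optimizer'] == 'rmsprop' else 0.5)
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=toy_objective, space=toy_space, algo=tpe.suggest,
            max_evals=10, trials=trials)
# fmin returns the chosen indices per label; space_eval maps them back to actual values.
print('best: {}'.format(space_eval(toy_space, best)))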
Example #3
        def run_model_fn(hp):
            """ Definition to be evaluated by the black box optimizer.
        Params: hyperparameter dictionary.
      """
            print str(hp)
            output_headers = [
                outputs for outputs in self.data.iterkeys()
                if outputs.startswith('output_')
            ]
            if not output_headers:
                raise ValueError('No outputs defined!')

            # Process string features.
            if not self.string_features:
                self.string_features = []
                for header, typ in self.types.iteritems():
                    if typ != 'str':
                        continue
                    # Every string feature is treated as a list of words.
                    word_list = [x.split() for x in self.data[header]]
                    dict_, _ = self.process_text_feature(word_list)
                    assert len(dict_) > 0, 'Dict is empty.'
                    self.embedding_dicts[
                        header] = dict_  #NEQP (If you build a new dict for each string column, aren't there then indices that repeat for different words?)
                    lengths = [len(words) for words in word_list]
                    lengths.sort()
                    # Use roughly the 95th percentile of the word counts as the fixed input length.
                    input_size = lengths[int(
                        np.round((len(lengths) - 1) *
                                 0.95))]  #NEQP (What is this calculation for?)
                    if input_size == 0:
                        print 'WARNING: input_size is 0 for ' + header
                        input_size = 1
                    for idx, words in enumerate(word_list):
                        # Strings to integers. Pad sequences with zeros so that all of them have the same size.
                        word_list[idx] = pad_sequences(
                            [[dict_[word] for word in words]],
                            maxlen=input_size,
                            padding='post',
                            truncating='post')[0].tolist(
                            )  #NEQP (And what is going on here?)
                    self.string_features.append((header, word_list))

            # Build models.
            # Merge all inputs into one model.
            def init_model(self):
                feature_models = []
                total_input_size = 0
                i = 0
                for tup in self.string_features:
                    header = tup[0]
                    word_list = tup[1]
                    sequence_length = len(word_list[0])
                    embedding_size = int(
                        np.round(np.log10(len(self.embedding_dicts[header]))))
                    embedding_size = embedding_size if embedding_size > 0 else 1
                    model = Sequential(name='str_model_' +
                                       str(len(feature_models)))
                    model.add(
                        Embedding(len(self.embedding_dicts[header].keys()),
                                  embedding_size,
                                  input_length=sequence_length,
                                  name='embedding_model_' +
                                  str(len(feature_models))))
                    model.add(
                        Flatten(name='flatten_model_' +
                                str(len(feature_models))))
                    total_input_size += embedding_size * len(
                        word_list[0]
                    )  #NEQP (If there is one embedding per word: can the embeddings really produce a meaningful vector? And doesn't this give the strings much more weight than the integers?)
                    feature_models.append(model)

                numeric_inputs = len(self.data) - len(
                    self.string_features) - len(output_headers)
                if numeric_inputs:
                    num_model = Sequential(name='num_model_' +
                                           str(len(feature_models)))
                    num_model.add(
                        Dense(numeric_inputs,
                              input_shape=(numeric_inputs, ),
                              name='dense_model_' + str(len(feature_models))))
                    total_input_size += numeric_inputs
                    feature_models.append(num_model)

                merged_model = Sequential()
                if len(feature_models) < 0:  #NEQP (Shouldn't it be < 1?)
                    raise ValueError('No models built, no inputs?')
                elif len(feature_models) == 1:
                    merged_model = feature_models[0]
                else:
                    merged_model.add(
                        Merge(feature_models, mode='concat', concat_axis=1))
                return merged_model, total_input_size

            # We will build in total DEEP_RANGE*WIDE_RANGE models.
            optimizer = hp['optimizer']
            layers = hp['layers']
            dropout = 0.2  # hp['dropout'] #NEQP (Wouldn't it be nice if hyperopt also optimized these hyperparameters?)
            batch_size = 128  # hp['batch_size']

            model, input_size = init_model(self)

            # We will add 'depth' layers with 'net_width' neurons.
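            # Note: each entry of layers[1] appears to be a (name, (width, activation)) pair produced by
            # layer_choice (not shown in this excerpt), which is why the width is read as layers[1][i][1][0]
            # and the activation as layers[1][i][1][1] below.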
            depth = len(layers[1])
            for i in range(depth):
                layer_activation = layers[1][i][1][1]
                layer_width = layers[1][i][1][
                    0]  #NEQP (I think it is good practice to vary the width of the layers. Usually variations like 10-20-40-20-10 are used.)
                if i == 0 and depth != 1:
                    model.add(
                        Dense(layer_width,
                              input_shape=(input_size, ),
                              name='layer_model_' + str(i)))
                    model.add(Activation(layer_activation))
                    model.add(Dropout(dropout))
                elif i == depth - 1:
                    model.add(
                        Dense(len(output_headers),
                              input_shape=(len(layers[1][i - 1][1]), ),
                              name='layer_model_' + str(i)))
                else:
                    model.add(
                        Dense(layer_width,
                              input_shape=(len(layers[1][i - 1][1]), ),
                              name='layer_model_' + str(i)))
                    model.add(Activation(layer_activation))
                    model.add(Dropout(dropout))

            if not depth:
                model.add(
                    Dense(len(output_headers),
                          input_shape=(input_size, ),
                          name='layer_model_0'))
            # No Activation in the end for now... Assuming regression always.
            model.compile(loss='mse',
                          optimizer=optimizer,
                          metrics=['accuracy'])
            nb_epoch = 100
            if persist:
                nb_epoch = 300

            model_name = str(hp).replace('{', '').replace('}', '')
            if persist:
                X_train, Y_train = self.get_data_sets(sample=False)
            else:
                X_train, Y_train = self.get_data_sets(
                    sample=True)  # Only use a small sample during the search.

            VAL_SPLIT = 0.1  # Split of data to use as validation.
            print 'Sizes: ' + str(len(X_train)) + ', ' + str(
                X_train[0].shape) + ' ' + str(len(Y_train))
            with tf.Session() as sess:
                history = model.fit(
                    X_train,
                    Y_train,
                    batch_size=batch_size,  #NEQP (So how is X_train organized? What is its shape?)
                    nb_epoch=nb_epoch,
                    shuffle=True,
                    validation_split=VAL_SPLIT)
                if persist:
                    # Save the model for inference purposes.
                    from db import persist_keras_model
                    persist_keras_model(self.get_handle(), model)
                else:
                    # Save metrics of this run.
                    if model_name not in self.val_losses:
                        self.val_losses[model_name] = {}
                    for key, val in history.history.iteritems():
                        if key in self.val_losses[model_name]:
                            self.val_losses[model_name][key].extend(val)
                        else:
                            self.val_losses[model_name][key] = val
                    from db import save_model
                    save_model(self)

            # Weighted average of the final validation and training losses.
            total_dataset_loss = (VAL_SPLIT * history.history['val_loss'][-1] +
                                  (1 - VAL_SPLIT) * history.history['loss'][-1])
            return {'loss': total_dataset_loss, 'status': STATUS_OK}
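As a reference for the string-feature questions above, here is a small, self-contained sketch of the transformation this code appears to perform on one text column: build a word-to-index dict, then pad or truncate every row to a fixed length. The sample rows and the inline vocabulary builder are illustrative stand-ins; process_text_feature itself is not shown in the excerpt.

from keras.preprocessing.sequence import pad_sequences

# Illustrative stand-in for one string column already split into words.
word_list = [['red', 'sedan'], ['blue', 'pickup', 'truck'], ['sedan']]

# Stand-in for process_text_feature: give each word a positive index (0 is kept for padding).
dict_ = {}
for words in word_list:
    for word in words:
        dict_.setdefault(word, len(dict_) + 1)

input_size = 2  # In the real code this is roughly the 95th percentile of row lengths.
encoded = [pad_sequences([[dict_[w] for w in words]],
                         maxlen=input_size, padding='post', truncating='post')[0].tolist()
           for words in word_list]
print(encoded)  # [[1, 2], [3, 4], [2, 0]]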
Example #4
import db
import scraping

page_num = 619
total_entries = 0
while True:
    print(page_num)
    details = db.get_listings_with_vins(page_num)
    if len(details) == 0:  # Proceed until there are no more to load
        break

    for detail in details:
        total_entries += 1
        print("Loading details for listing {}".format(detail['listing_id']))
        car_attributes = scraping.get_attributes_from_vin(detail['vin'])

        if car_attributes.get('Model'):
            new_model = car_attributes['Model']
            print("  Old model: {} New Model: {}".format(
                detail['model'], new_model))
            db.save_model(detail['listing_id'], new_model)

        if car_attributes.get('Trim'):
            new_trim = car_attributes['Trim']
            print("  Old trim: {} New trim: {}".format(detail['trim'],
                                                       new_trim))
            db.save_trim(detail['listing_id'], new_trim)

    page_num += 1

print("Total entries: {}".format(total_entries))