Example #1
import numpy as np
import tensorflow.contrib.layers as tflayers
import tensorflow.contrib.learn as tflearn

# get_features() and create_embed() are helpers defined elsewhere in the
# source module.
def wide_and_deep(output_dir,
                  nbuckets=5,
                  hidden_units='64,16,4',
                  learning_rate=0.01):  # learning_rate is unused in this variant
    real, sparse = get_features()

    # parse '64,16,4' into [64, 16, 4]
    hidden_units = list(map(int, hidden_units.split(',')))
    print('hidden_units:', hidden_units)

    # bucketise/discretise lat and lon to nbuckets
    latbuckets = np.linspace(20.0, 50.0, nbuckets).tolist()  # USA
    lonbuckets = np.linspace(-120.0, -70.0, nbuckets).tolist()  # USA

    disc = {}
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], latbuckets)
        for key in ['dep_lat', 'arr_lat']
    })
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], lonbuckets)
        for key in ['dep_lon', 'arr_lon']
    })

    # cross columns for new features
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], nbuckets * nbuckets)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], nbuckets * nbuckets)

    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets**4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)
    # create embeddings of all the sparse columns
    embed = {
        colname: create_embed(col)
        for colname, col in sparse.items()
    }
    real.update(embed)

    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=hidden_units)

    # FIXME: hack; pokes a private attribute to raise the decision threshold
    estimator.params["head"]._thresholds = [0.7]

    return estimator
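
A minimal usage sketch for the estimator returned above; make_input_fn(), train_df, and eval_df are hypothetical stand-ins for the caller's input pipeline and are not part of the original example.

estimator = wide_and_deep(output_dir='/tmp/flights_model')
estimator.fit(input_fn=make_input_fn(train_df), steps=1000)   # make_input_fn is assumed
print(estimator.evaluate(input_fn=make_input_fn(eval_df), steps=1))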
Example #2
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import learn


def build_estimator(model_dir, model_type):
    """Build a wide, deep, or wide-and-deep estimator."""

    # base sparse feature process
    gender = layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male'])
    education = layers.sparse_column_with_hash_bucket(column_name='education', hash_bucket_size=1000)
    relationship = layers.sparse_column_with_hash_bucket(column_name='relationship', hash_bucket_size=100)
    workclass = layers.sparse_column_with_hash_bucket(column_name='workclass', hash_bucket_size=100)
    occupation = layers.sparse_column_with_hash_bucket(column_name='occupation', hash_bucket_size=1000)
    native_country = layers.sparse_column_with_hash_bucket(column_name='native_country', hash_bucket_size=1000)

    # base continuous feature
    age = layers.real_valued_column(column_name='age')
    education_num = layers.real_valued_column(column_name='education_num')
    capital_gain = layers.real_valued_column(column_name='capital_gain')
    capital_loss = layers.real_valued_column(column_name='capital_loss')
    hours_per_week = layers.real_valued_column(column_name='hours_per_week')

    # Transformation: bucketization converts a continuous variable into
    # categorical labels, which can improve accuracy.
    age_bucket = layers.bucketized_column(source_column=age,
                                          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # wide columns and deep columns:
    # the features used by the deep model vs. those used by the wide model.
    # The wide model uses only the categorical features.
    wide_columns = [gender, native_country, education, relationship, workclass, occupation, age_bucket,
                    layers.crossed_column(columns=[education, occupation], hash_bucket_size=int(1e4)),
                    layers.crossed_column(columns=[age_bucket, education, occupation], hash_bucket_size=int(1e6)),
                    layers.crossed_column(columns=[native_country, occupation], hash_bucket_size=int(1e4))]

    deep_columns = [layers.embedding_column(workclass, dimension=8),
                    layers.embedding_column(education, dimension=8),
                    layers.embedding_column(gender, dimension=8),
                    layers.embedding_column(relationship, dimension=8),
                    layers.embedding_column(native_country, dimension=8),
                    layers.embedding_column(occupation, dimension=8),
                    age, education_num, capital_gain, capital_loss, hours_per_week]

    if model_type == "wide":
        m = learn.LinearClassifier(feature_columns=wide_columns, model_dir=model_dir)
    elif model_type == "deep":
        m = learn.DNNClassifier(feature_columns=deep_columns, model_dir=model_dir,
                                hidden_units=[100, 50])
    else:
        m = learn.DNNLinearCombinedClassifier(model_dir=model_dir,
                                              linear_feature_columns=wide_columns,
                                              dnn_feature_columns=deep_columns,
                                              dnn_hidden_units=[256, 128, 64],
                                              dnn_activation_fn=tf.nn.relu)
    return m
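
The estimator above expects an input_fn mapping column names to tensors. A minimal sketch along the lines of the classic census tutorial, assuming a pandas DataFrame with the columns named above plus an added 0/1 label column (LABEL_COLUMN and train_df are assumptions):

CATEGORICAL_COLUMNS = ['gender', 'education', 'relationship', 'workclass',
                       'occupation', 'native_country']
CONTINUOUS_COLUMNS = ['age', 'education_num', 'capital_gain', 'capital_loss',
                      'hours_per_week']
LABEL_COLUMN = 'label'  # assumption: 0/1 income target

def input_fn(df):
    # continuous features become dense constant tensors
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    # categorical features become sparse string tensors
    categorical_cols = {
        k: tf.SparseTensor(indices=[[i, 0] for i in range(df[k].size)],
                           values=df[k].values,
                           dense_shape=[df[k].size, 1])
        for k in CATEGORICAL_COLUMNS}
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    return feature_cols, tf.constant(df[LABEL_COLUMN].values)

m = build_estimator('/tmp/census_model', 'wide_n_deep')
m.fit(input_fn=lambda: input_fn(train_df), steps=200)  # train_df is assumed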
Example #3
# (imports as in Example #1; get_features(), create_embed(), and
# parse_hidden_units() are helpers from the surrounding module)
def wide_and_deep_model(output_dir, nbuckets=5, hidden_units='64,32', learning_rate=0.01):
    real, sparse = get_features()

    # the lat/lon columns can be discretized to yield "air traffic corridors"
    latbuckets = np.linspace(20.0, 50.0, nbuckets).tolist()  # USA
    lonbuckets = np.linspace(-120.0, -70.0, nbuckets).tolist() # USA
    disc = {}
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], latbuckets)
        for key in ['dep_lat', 'arr_lat']
    })
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], lonbuckets)
        for key in ['dep_lon', 'arr_lon']
    })

    # cross columns that make sense in combination
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], nbuckets * nbuckets)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], nbuckets * nbuckets)
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets ** 4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)
    
    # create embeddings of all the sparse columns
    embed = {
        colname: create_embed(col)
        for colname, col in sparse.items()
    }
    real.update(embed)
 
    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units))
    # linear_optimizer=tf.train.FtrlOptimizer(learning_rate=learning_rate),
    # dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate * 0.25)
    estimator.params["head"]._thresholds = [0.7]  # FIXME: hack
    return estimator
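
parse_hidden_units() is not shown in this example; a plausible sketch, mirroring the inline parsing in Example #1:

def parse_hidden_units(s):
    # '64,32' -> [64, 32]
    return [int(item) for item in s.split(',')]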
Example #4
import sys

import tensorflow as tf
import tensorflow.contrib.learn as lrn

# (fragment: the excerpt opens mid-way through numbering the classes;
# `classes`, `snps`, `strn`, `stst`, `ttrn`, and `ttst` come from earlier,
# unshown code)
class2id = {}
cid = 0
for C in classes:
    class2id[C] = cid
    print("   ", cid, C, file=sys.stderr)
    cid += 1

ttrn = [class2id[C] for C in ttrn]
ttst = [class2id[C] for C in ttst]

#classifier = lrn.DNNClassifier(feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])],
#                                                    hidden_units=[10, 20, 10],
#                                                    n_classes=n_classes)

#classifier = lrn.DNNLinearCombinedClassifier(dnn_feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])],
#                                                 dnn_hidden_units=[10,20,10],
#                                                 n_classes=n_classes)
classifier = lrn.DNNLinearCombinedClassifier(
    linear_feature_columns=[
        tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])
    ],
    #dnn_hidden_units=[10,20,10],
    n_classes=n_classes)
#classifier = lrn.LinearClassifier(feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])], n_classes=n_classes)
classifier.fit(strn, ttrn, steps=50)

pred = classifier.predict(stst)

# set up counters
count = 0
correct = 0
classCount = dict()
classCorrect = dict()
for C in classes:
    classCount[C] = 0
    classCorrect[C] = 0
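
The excerpt stops after initializing the counters; a sketch of the per-class tally they imply (id2class, inverting class2id, is an assumption):

id2class = {v: k for k, v in class2id.items()}
for p, t in zip(pred, ttst):
    C = id2class[t]
    count += 1
    classCount[C] += 1
    if p == t:
        correct += 1
        classCorrect[C] += 1
print("overall accuracy: %.3f" % (float(correct) / count))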
Example #5
import tensorflow as tf
from tensorflow.contrib import learn as Learn


def build_estimator(model_dir, classifier):
  """Build a wide, deep, or combined estimator for the Titanic dataset."""

  # Categorical columns
  sex = tf.contrib.layers.sparse_column_with_keys(column_name="Sex",
                                                     keys=["female", "male"])
  family = tf.contrib.layers.sparse_column_with_keys(column_name="Family",
                                                       keys=["Large", "Nuclear", "Solo"])
  child = tf.contrib.layers.sparse_column_with_keys(column_name="Child",
                                                       keys=["0", "1"])
  ageknown = tf.contrib.layers.sparse_column_with_keys(column_name="AgeKnown",
                                                       keys=["0", "1"])
  embarked = tf.contrib.layers.sparse_column_with_keys(column_name="Embarked",
                                                       keys=["C", "S", "Q"])
  young = tf.contrib.layers.sparse_column_with_keys(column_name="Young",
                                                       keys=["0", "1"])
  malebadticket = tf.contrib.layers.sparse_column_with_keys(column_name="MaleBadTicket",
                                                       keys=["0", "1"])
  cab = tf.contrib.layers.sparse_column_with_hash_bucket(
      "Cab", hash_bucket_size=10)
  namet = tf.contrib.layers.sparse_column_with_hash_bucket(
      "NameT", hash_bucket_size=20)

  # Continuous columns
  age = tf.contrib.layers.real_valued_column("Age")
  namelength = tf.contrib.layers.real_valued_column("NameLength")
  fare = tf.contrib.layers.real_valued_column("Fare")
  p_class = tf.contrib.layers.real_valued_column("Pclass")

  # Transformations.
  fare_buckets = tf.contrib.layers.bucketized_column(
      fare, boundaries=[5, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550])
  age_buckets = tf.contrib.layers.bucketized_column(
      age, boundaries=[5, 18, 25, 30, 35, 40, 45, 50, 55, 65])
  pclass_buckets = tf.contrib.layers.bucketized_column(
      p_class, boundaries=[1, 2, 3])

  # Wide columns and deep columns.
  wide_columns = [sex, cab, namet, child, ageknown, embarked, young, family,
                  tf.contrib.layers.crossed_column(
                      [age_buckets, sex], hash_bucket_size=int(1e3)),
                  tf.contrib.layers.crossed_column(
                      [pclass_buckets, sex], hash_bucket_size=int(1e3)),
                  tf.contrib.layers.crossed_column(
                      [fare_buckets, pclass_buckets], hash_bucket_size=int(1e3)),
                  tf.contrib.layers.crossed_column(
                      [embarked, pclass_buckets], hash_bucket_size=int(1e3)),
                  tf.contrib.layers.crossed_column(
                      [embarked, sex], hash_bucket_size=int(1e3))]


  deep_columns = [
      namelength,
      fare,
      p_class,
      tf.contrib.layers.embedding_column(sex, dimension=8),
      tf.contrib.layers.embedding_column(child, dimension=8),
      tf.contrib.layers.embedding_column(family, dimension=8),
      tf.contrib.layers.embedding_column(cab, dimension=8),
      tf.contrib.layers.embedding_column(namet, dimension=8),
      tf.contrib.layers.embedding_column(ageknown, dimension=8),
      tf.contrib.layers.embedding_column(embarked, dimension=8),
      tf.contrib.layers.embedding_column(young, dimension=8),
      tf.contrib.layers.embedding_column(malebadticket, dimension=8)
  ]

  if classifier == "deep":
    return Learn.DNNClassifier(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=[32, 16],
        optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate=0.1,
            l2_regularization_strength=0.001))
  elif classifier == "wide":
    return Learn.LinearClassifier(
        feature_columns=wide_columns,
        optimizer=tf.train.FtrlOptimizer(
            learning_rate=5,
            l1_regularization_strength=1000.0,
            l2_regularization_strength=1000.0),
        model_dir=model_dir)
  else:
    return Learn.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[32, 16],
        model_dir=model_dir,
        linear_optimizer=tf.train.FtrlOptimizer(
            learning_rate=10,
            l1_regularization_strength=100.0,
            l2_regularization_strength=100.0),
        dnn_optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate=0.1,
            l2_regularization_strength=0.001))
Example #6
from sklearn.ensemble import RandomForestClassifier
from tensorflow.contrib import layers, learn

# (fragment: the excerpt opens mid-way through a series of pipeline
# definitions; `pipelines` and the training/validation DataFrames come from
# earlier, unshown preprocessing)
pipelines.append((RandomForestClassifier(), training_set_pca_sc_scaled,
                  validation_set_pca_sc_scaled,
                  "Standard-Scaled->PCA->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_imputed,
                  validation_set_imputed, "Imputed->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_mn_scaled_imputed,
                  validation_set_mn_scaled_imputed,
                  "Imputed->MinMax-Scaled->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_sc_scaled_imputed,
                  validation_set_sc_scaled_imputed,
                  "Imputed->Standard-Scaled->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_pca_sc_scaled_imputed,
                  validation_set_pca_sc_scaled_imputed,
                  "Imputed->Standard-Scaled->PCA->Random Forest"))

fc = [layers.real_valued_column("", dimension=len(training_set.columns))]
classifier_lc = learn.LinearClassifier(feature_columns=fc, n_classes=2)
classifier_dlc = learn.DNNLinearCombinedClassifier(linear_feature_columns=fc,
                                                   n_classes=2)
classifier_dc = learn.DNNClassifier(feature_columns=fc,
                                    n_classes=2,
                                    hidden_units=[1000, 300, 200])

fc_imputed = [
    layers.real_valued_column("", dimension=len(training_set_imputed.columns))
]
classifier_lc_imputed = learn.LinearClassifier(feature_columns=fc_imputed,
                                               n_classes=2)
classifier_dlc_imputed = learn.DNNLinearCombinedClassifier(
    linear_feature_columns=fc_imputed, n_classes=2)
classifier_dc_imputed = learn.DNNClassifier(feature_columns=fc_imputed,
                                            n_classes=2,
                                            hidden_units=[1000, 300, 200])
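
With the empty-string real_valued_column pattern above, the contrib.learn classifiers accept plain numpy matrices; a minimal sketch (the 0/1 target vector `labels` is not shown in the excerpt and is assumed here):

import numpy as np

X = training_set.values.astype(np.float32)
y = labels.astype(np.int64)  # `labels` is an assumption
classifier_dc.fit(x=X, y=y, steps=200)
print(classifier_dc.evaluate(x=X, y=y, steps=1))  # smoke-test on the training data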