import class_defs
import utils
import utils_temp

def main():
    top_obj = class_defs.top()
    # Build a document object for every (input, key) file pair.
    with open("input_file_list.txt") as input_list_fp, \
         open("key_file_list.txt") as key_list_fp:
        for ifile, kfile in zip(input_list_fp, key_list_fp):
            ifile = ifile.strip('\n')
            kfile = kfile.strip('\n')
            print("Now Processing {}, {}".format(ifile, kfile))
            top_obj.docs[ifile] = class_defs.document(top_obj, ifile, kfile)

    # Select a subset of the generated negative data at random.
    utils_temp.select_neg_data(top_obj, 2)

    print("Pos Create Ana Encountered : ", top_obj.pos_create_ana_encountered)
    print("Number of Positive and Negative Samples Generated")
    print("Positive : {} Negative {} Selected Negative {}".format(
        len(top_obj.pos_list), len(top_obj.neg_list),
        len(top_obj.selected_neg_list)))

    # Debug prints:
    # for key, dobj in top_obj.docs.items():
    #     utils.compare_total_antecedents(dobj)

    utils.create_features(top_obj)
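For context, a plausible sketch of what utils_temp.select_neg_data(top_obj, 2) might do; the implementation below is an assumption, not the project's actual helper, reading the second argument as the ratio of negatives to keep per positive sample:

import random

def select_neg_data(top_obj, neg_ratio):
    # Hypothetical: keep at most neg_ratio negatives per positive sample,
    # chosen uniformly at random without replacement.
    k = min(len(top_obj.neg_list), neg_ratio * len(top_obj.pos_list))
    top_obj.selected_neg_list = random.sample(top_obj.neg_list, k)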
def train(self, X_train, Y_train):
    y_train = torch.from_numpy(Y_train.astype(int)).type(torch.LongTensor)
    tot_loss = 0.0
    all_preds = []
    for t in range(self.epochs):
        epoch_loss = 0.0
        self.model.train()
        # Full-graph forward pass; A (the adjacency matrix) and graph come
        # from module scope here. The loss is computed on training nodes only.
        y_pred = self.model(A, utils.create_features(graph))
        all_preds.append(y_pred)
        loss = self.loss_function(y_pred[X_train], y_train)
        self.optimizer.zero_grad()
        # Accumulate plain floats, not tensors, so earlier iterations'
        # autograd graphs can be freed.
        epoch_loss += loss.item()
        tot_loss += loss.item()
        loss.backward()
        self.optimizer.step()
        print(str(t), 'epoch_loss: ' + str(epoch_loss),
              'total loss: ' + str(tot_loss))
    self.all_preds = all_preds
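One design note: the loop recomputes utils.create_features(graph) on every epoch even though the graph never changes, and it reaches into module scope for A and graph. A minimal sketch of a refactor that hoists the static features and passes both objects in explicitly (this signature is an assumption, not the project's API):

def train(self, A, graph, X_train, Y_train):
    y_train = torch.from_numpy(Y_train.astype(int)).type(torch.LongTensor)
    features = utils.create_features(graph)  # static graph: compute once
    for t in range(self.epochs):
        y_pred = self.model(A, features)
        loss = self.loss_function(y_pred[X_train], y_train)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()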
def process_graph(self, graph_path, batch_loss):
    """
    Reading a graph and doing a forward pass on a graph with a time budget.
    :param graph_path: Location of the graph to process.
    :param batch_loss: Loss on the graphs processed so far in the batch.
    :return batch_loss: Incremented loss on the current batch being processed.
    """
    data = json.load(open(graph_path))
    graph, features = create_features(data, self.model.identifiers)
    # Start the attention walk from a uniformly random node.
    node = random.choice(list(graph.nodes()))
    attention_loss = 0
    for t in range(self.args.time):
        predictions, node, attention_score = self.model(data, graph, features, node)
        target, prediction_loss = calculate_predictive_loss(data, predictions)
        batch_loss = batch_loss + prediction_loss
        if t < self.args.time - 2:
            # Discounted log-attention term, weighted more for early steps.
            attention_loss += (self.args.gamma ** (self.args.time - t)) * torch.log(attention_score)
    reward = calculate_reward(target, predictions)
    batch_loss = batch_loss - reward * attention_loss
    self.model.reset_attention()
    return batch_loss
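For orientation, a hypothetical sketch of the two helpers used above; the project's actual calculate_predictive_loss and calculate_reward may differ:

import torch
import torch.nn.functional as F

def calculate_predictive_loss(data, predictions):
    # Hypothetical: the graph's stored class label is the target, and
    # predictions are log-probabilities over classes, shape (1, n_classes).
    target = torch.tensor([data["target"]])
    return target, F.nll_loss(predictions, target)

def calculate_reward(target, predictions):
    # Hypothetical: +1 when the argmax prediction is correct, -1 otherwise,
    # so the attention term is reinforced only for correct guesses.
    correct = (predictions.argmax(dim=1) == target).float()
    return 2.0 * correct - 1.0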
def score(self):
    """
    Scoring the test set graphs.
    """
    print("\n")
    print("\nScoring the test set.\n")
    self.model.eval()
    self.predictions = []
    for data in tqdm(self.test_graphs):
        data = json.load(open(data))
        graph, features = create_features(data, self.model.identifiers)
        node_predictions = []
        for _ in range(self.args.repetitions):
            # Each repetition restarts the walk from a random node.
            node = random.choice(list(graph.nodes()))
            for _ in range(self.args.time):
                prediction, node, _ = self.model(data, graph, features, node)
            node_predictions.append(np.argmax(prediction.detach()))
            self.model.reset_attention()
        # Majority vote over the repeated walks.
        prediction = max(set(node_predictions), key=node_predictions.count)
        self.score_graph(data, prediction)
    self.accuracy = float(np.mean(self.predictions))
    print("\nThe test set accuracy is: " + str(round(self.accuracy, 4)) + ".\n")
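The per-graph label is a majority vote over the repeated walks. An equivalent formulation with collections.Counter reads more directly:

from collections import Counter

node_predictions = [1, 0, 1, 1, 2]  # stand-in for the sampled walk outputs
prediction = Counter(node_predictions).most_common(1)[0][0]
print(prediction)  # 1: the most frequent predicted class wins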
data_dict = pickle.load(data_file)

# Coerce data features to numeric values or a NaN.
df = pd.DataFrame(data_dict).transpose().apply(pd.to_numeric, errors="coerce")

### Task 2: Remove outliers
# Remove the TOTAL key because it's an outlier that represents an aggregate
# value, and 'THE TRAVEL AGENCY IN THE PARK' doesn't represent a person,
# but rather an entity.
df = df.drop(["TOTAL", "THE TRAVEL AGENCY IN THE PARK"], errors="ignore", axis=0)
df = df.fillna(0)

### Task 3: Create new feature(s)
df["pct_poi_messages"] = df.apply(calculate_pct_poi_msgs, axis=1)

### Store to my_dataset for easy export below.
features, labels, my_dataset = create_features(df, features_list)

# Construct a PCA to use as a pipeline step.
pca = PCA()

### Task 4: Try a variety of classifiers
# models = trial_models
# The models below were tuned; comment out this variable and comment in the
# `models = trial_models` line above to run the trial models instead.
models = [
    {
        "title": "DecisionTreeClassifier (RobustScaler + PCA) -- Tuned",
        "pipeline": Pipeline(
            steps=[
                ("scaler", RobustScaler()),
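For context, a minimal sketch of what calculate_pct_poi_msgs might compute; the field names are the standard Enron dataset message counts, but the implementation itself is an assumption, not the project's actual helper:

def calculate_pct_poi_msgs(row):
    # Hypothetical: fraction of a person's messages exchanged with a POI.
    total = row["from_messages"] + row["to_messages"]
    poi = row["from_this_person_to_poi"] + row["from_poi_to_this_person"]
    return poi / total if total else 0.0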
import json
import time
from datetime import datetime
from pathlib import Path
from random import shuffle
from time import sleep

import numpy as np
import pandas as pd
import spacy as sp
from keras import regularizers
from keras.layers import Dropout
from sklearn.model_selection import train_test_split

import utils

nlp = sp.load('en_core_web_lg')

with open(Path('../data/models/features/data_3.json'), 'r') as f:
    datalist = json.loads(f.read())

# fulldata[:300] are the chunk vectors.
data, labels, ngrams = utils.create_features(datalist)

# Average the title, abstract and text vectors into one document vector.
docvec = [(np.fromstring(instance['title_vec'].strip('[]'), sep=',')
           + np.fromstring(instance['abstract_vec'].strip('[]'), sep=',')
           + np.fromstring(instance['text_vec'].strip('[]'), sep=',')) / 3
          for instance in datalist]
docvec = np.array(docvec)

positive_examples = sum(labels)
negative_ratio = 1  # sample an equal number of negative and positive labels
# Indices of negative examples.
neg_idx = [i for i in range(labels.shape[0]) if labels[i] == 0]
neg_idx = np.random.choice(np.array(neg_idx),
                           positive_examples * negative_ratio, replace=False)
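A side note: np.fromstring's binary mode is deprecated in recent NumPy and its text-mode parser is limited, so an explicit parse of the same comma-separated vector strings may age better (a sketch; vec_str stands in for the stored fields):

import numpy as np

vec_str = "[0.1, 0.2, 0.3]"  # stand-in for instance['title_vec']
vec = np.array(vec_str.strip('[]').split(','), dtype=float)
print(vec)  # [0.1 0.2 0.3]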
7 times faster for = 750.

CONCLUSION

The self loop feedback gating mechanism of recurrent networks has been
derived from first principles via a postulate of invariance to time
warpings.
'''

model = keras.models.load_model('../data/models/keras/model4.h5')
doc_data = utils.simple_preprocess(title=title, abstract=abstract, text=text)
fulldata, ngrams = utils.create_features(doc_data, labels=False)

# Average the title, abstract and text vectors into one document vector.
docvec = [(np.fromstring(instance['title_vec'].strip('[]'), sep=',')
           + np.fromstring(instance['abstract_vec'].strip('[]'), sep=',')
           + np.fromstring(instance['text_vec'].strip('[]'), sep=',')) / 3
          for instance in doc_data]
docvec = np.array(docvec)

# data = fulldata[:, :300] - docvec
data = fulldata[:, 300:]  # 11 features
data = data[:, [7]]       # keep only the single feature at index 7
predictions = model.predict(data)

df_ = np.array([
    ngrams.reshape((-1, )),
import json
import time
from datetime import datetime
from pathlib import Path
from random import shuffle
from time import sleep

import numpy as np
import pandas as pd
import spacy as sp
from sklearn.model_selection import train_test_split

import utils

nlp = sp.load('en_core_web_lg')

with open(Path('../data/models/features/data_3.json'), 'r') as f:
    datalist = json.loads(f.read())

fulldata, labels, ngrams = utils.create_features(datalist)

positive_examples = sum(labels)
negative_ratio = 2  # sample twice as many negative as positive labels

# Indices of negative examples.
neg_idx = [i for i in range(labels.shape[0]) if labels[i] == 0]
neg_idx = np.random.choice(np.array(neg_idx),
                           positive_examples * negative_ratio, replace=False)
# Indices of positive examples.
pos_idx = [i for i in range(labels.shape[0]) if labels[i] == 1]
pos_idx = np.random.choice(np.array(pos_idx), positive_examples, replace=False)

idx = np.hstack((pos_idx, neg_idx))
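A plausible continuation, given the train_test_split import: select the balanced rows and split them. Both the shuffle and the 80/20 ratio are assumptions, not the project's recorded choices:

np.random.shuffle(idx)  # interleave positives and negatives before splitting
X_train, X_test, y_train, y_test = train_test_split(
    fulldata[idx], labels[idx], test_size=0.2)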
        self.softmax = nn.Softmax(dim=1)

    def forward(self, Adj_matrix, input_features):
        x = self.layer1(Adj_matrix, input_features)
        x = self.activation(x)
        x = self.layer2(Adj_matrix, x)
        # Return raw logits: F.cross_entropy (used by the trainer below)
        # applies log-softmax internally, so calling self.softmax here
        # would squash the scores twice. Use self.softmax(x) at inference.
        return x

# In[6]:

model = GCN(inputs_shape=utils.create_features(graph).shape[1],
            outputs_shape=4,
            n_classes=2,
            activation='Tanh')

# In[7]:

trainer = train.Trainer(model,
                        optimizer=optim.Adam(model.parameters(), lr=0.01),
                        loss_function=F.cross_entropy,
                        epochs=250)

# In[8]:

trainer.train(X_train, Y_train)
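For completeness, a minimal sketch of the graph-convolution layer this model presumably composes; a hypothetical Kipf & Welling-style propagation A·X·W, not necessarily the project's layer1/layer2:

import torch
import torch.nn as nn

class GCNLayer(nn.Module):
    """Hypothetical propagation step: aggregate neighbour features through
    the (normalized) adjacency matrix, then apply a learned linear map."""
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(in_dim, out_dim)

    def forward(self, Adj_matrix, x):
        return self.linear(Adj_matrix @ x)  # A · X · W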