from sklearn import feature_extraction

data = [{"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)
vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names())
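A minimal usage sketch of what the snippet above produces, assuming the fitted `vectorizer`: string values are expanded into indicator features ('sex=female', 'sex=male'), numeric and boolean values pass through unchanged, and columns follow the sorted feature names. The new record here is a made-up example; `transform()` silently drops values unseen during `fit()`.

# Feature order after fit: ['sex=female', 'sex=male', 'student', 'weight']
new_vec = vectorizer.transform([{"weight": 70.0, "sex": 'female', "student": False}])
# new_vec -> [[ 1.  0.  0. 70.]]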
import pandas as pd
from pyteomics import electrochem, mass, parser
from sklearn import feature_extraction


def handcrafted_features(data, tags):
    # DOI 10.1007/s00251-017-1023-5
    # Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    # Modified to apply handcrafted features twice, once to the alpha chain and again to the beta chain
    # Modified to handle split for training, validation, and test cohorts
    # Modified for multinomial classification

    # physicochemical amino acid properties
    basicity = {'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
                'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
                'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
                'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
                'X': 210.2, 'Y': 213.1, 'Z': 214.9}
    hydrophobicity = {'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
                      'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
                      'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
                      'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
                      'X': 4.59, 'Y': 2.00, 'Z': -2.13}
    helicity = {'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
                'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
                'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
                'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
                'X': 1.29, 'Y': 1.11, 'Z': 0.91}
    mutation_stability = {'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32,
                          'G': 27, 'H': 15, 'I': 10, 'K': 24, 'L': 34,
                          'M': 6, 'N': 6, 'P': 20, 'Q': 10, 'R': 17,
                          'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31}

    # feature conversion and generation
    features_list = []
    for chain in ['tra', 'trb']:
        # one-hot encode the V and J gene segments
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        features_list.append(
            pd.DataFrame(onehot_encoder.fit_transform(
                data[[chain + '_vgene', chain + '_jgene']].to_dict(orient='records')),
                columns=onehot_encoder.feature_names_))

        # sequence length
        features_list.append(data[chain + '_cdr3'].apply(
            lambda sequence: parser.length(sequence)).to_frame().rename(
                columns={chain + '_cdr3': 'length'}))

        # number of occurrences of each amino acid
        aa_counts = pd.DataFrame.from_records([
            parser.amino_acid_composition(sequence)
            for sequence in data[chain + '_cdr3']
        ]).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average) hydrophobicity,
        # (average) helicity, pI, (average) mutation stability
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: sum([basicity[aa] for aa in seq]) / parser.length(seq)
        ).to_frame().rename(columns={chain + '_cdr3': 'avg_basicity'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: sum([hydrophobicity[aa] for aa in seq]) / parser.length(seq)
        ).to_frame().rename(columns={chain + '_cdr3': 'avg_hydrophobicity'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: sum([helicity[aa] for aa in seq]) / parser.length(seq)
        ).to_frame().rename(columns={chain + '_cdr3': 'avg_helicity'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: electrochem.pI(seq)).to_frame().rename(
                columns={chain + '_cdr3': 'pI'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: sum([mutation_stability[aa] for aa in seq]) / parser.length(seq)
        ).to_frame().rename(columns={chain + '_cdr3': 'avg_mutation_stability'}))

        # peptide mass
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: mass.fast_mass(seq)).to_frame().rename(
                columns={chain + '_cdr3': 'mass'}))

        # positional features:
        # amino acid occurrence and physicochemical properties at a given position from the center
        pos_aa, pos_basicity, pos_hydro, pos_helicity, pos_pI, pos_mutation = (
            [] for _ in range(6))
        for sequence in data[chain + '_cdr3']:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            pos_range = list(range(start_pos, start_pos + length)) if length % 2 == 1 else \
                list(range(start_pos, 0)) + list(range(1, start_pos + length + 1))
            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in zip(pos_range, sequence)
            })
            pos_basicity.append({
                chain + '_pos_{}_basicity'.format(pos): basicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_hydro.append({
                chain + '_pos_{}_hydrophobicity'.format(pos): hydrophobicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_helicity.append({
                chain + '_pos_{}_helicity'.format(pos): helicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_pI.append({
                chain + '_pos_{}_pI'.format(pos): electrochem.pI(aa)
                for pos, aa in zip(pos_range, sequence)
            })
            pos_mutation.append({
                chain + '_pos_{}_mutation_stability'.format(pos): mutation_stability[aa]
                for pos, aa in zip(pos_range, sequence)
            })
        features_list.append(pd.DataFrame.from_records(pos_aa).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_basicity).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_hydro).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_helicity).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_pI).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_mutation).fillna(0))

    features_list.append(data['weights'])
    for tag in tags:
        features_list.append(data['labels_' + tag])
    features_list.append(data['split'])

    # combine all features
    data_processed = pd.concat(features_list, axis=1)

    return data_processed
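A minimal sketch of the per-residue averaging behind the physicochemical features above, with plain `len()` standing in for pyteomics' `parser.length()` (an assumption; the full code uses pyteomics, which also handles modified-sequence notation):

# Average basicity of a toy CDR3 fragment, using a subset of the table above.
basicity_subset = {'C': 206.2, 'A': 206.4, 'S': 207.6}
seq = 'CAS'
avg_basicity = sum(basicity_subset[aa] for aa in seq) / len(seq)  # ~206.73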
import os
import pickle

from scipy.sparse import vstack
from sklearn import feature_extraction, model_selection, preprocessing
from tqdm import tqdm


def extract_features(feature_sets, instances, instance_labels,
                     identity_categories, test_instances=None,
                     test_instance_labels=None, remove_zeros=False,
                     initialization=None, test_initialization=None,
                     categories=['all'], model_name=None, output_dirpath=None,
                     save=False, extras=[], force=False):
    """ Main feature extraction function that selects utility feature extractors.

        Args:
            remove_zeros: whether or not to remove instances where the follower
                and all of their followees do not give the category
            force: If True, force extraction of new features. Otherwise, load
                saved features if available.
    """
    # Try loading saved data
    if output_dirpath and model_name and not force:
        features_fpath = os.path.join(
            output_dirpath, 'output', 'features',
            f'{model_name.replace("/", "_").replace(" ", "_")}_features.pkl')
        vectorizer_fpath = os.path.join(
            output_dirpath, 'output', 'feature_vectorizers',
            f'{model_name.replace("/", "_").replace(" ", "_")}_feature_vec.pkl')
        if os.path.exists(features_fpath):
            with open(features_fpath, 'rb') as f:
                X_train, y_train, X_test, y_test = pickle.load(f)
            with open(vectorizer_fpath, 'rb') as f:
                features_vectorizer = pickle.load(f)
            return X_train, y_train, X_test, y_test, vstack(
                [X_train, X_test]), features_vectorizer

    feature_set_extractors = {
        'post_baseline': extract_features_post_baseline,
        'experiment1': extract_features_experiment_1,
        'experiment2': extract_features_experiment_2,
        'experiment3': extract_features_experiment_3,
    }

    X = []
    y = []
    if categories == ['all']:
        categories = identity_categories

    if remove_zeros:
        # Build a map of followers that have zero presence of the category
        # (and whose followees all do, too) for each category
        category_user_remove = variance_analysis(instances, identity_categories)
        remove_ids = set.intersection(
            *[set(category_user_remove[c]) for c in categories])

    def _extract_features(feature_sets, reblog_candidate, nonreblog_candidate,
                          label, initial_features={}, categories=categories,
                          extras=extras):
        instance_features = initial_features
        for feature_set in feature_sets:
            instance_features.update(feature_set_extractors[feature_set](
                reblog_candidate, nonreblog_candidate, label,
                categories=categories, extras=extras))
        return instance_features

    if initialization:
        initial_features = initialization
    else:
        initial_features = [{} for _ in range(len(instances))]
    if test_instances:
        initial_features_test = test_initialization

    keep_indices = []
    # Extract features for individual reblog/nonreblog pairings
    for i, ((reblog_candidate, nonreblog_candidate), label, initial) in enumerate(
            tqdm(zip(instances, instance_labels, initial_features),
                 total=len(instances), ncols=50)):
        if remove_zeros:
            follower_id = reblog_candidate['tumblog_id_follower']
            if follower_id not in remove_ids:
                X.append(_extract_features(feature_sets, reblog_candidate,
                                           nonreblog_candidate, label,
                                           initial_features=initial,
                                           categories=categories,
                                           extras=extras))
                y.append(label)
                keep_indices.append(i)
        else:
            X.append(_extract_features(feature_sets, reblog_candidate,
                                       nonreblog_candidate, label,
                                       initial_features=initial,
                                       categories=categories, extras=extras))
            y.append(label)

    features_vectorizer = feature_extraction.DictVectorizer()
    features_scaler = preprocessing.StandardScaler(with_mean=False)  # sparse-safe scaler

    if test_instances:
        X_test = []
        y_test = []
        # Extract features for individual reblog/nonreblog pairings
        for i, ((reblog_candidate, nonreblog_candidate), label, initial) in enumerate(
                tqdm(zip(test_instances, test_instance_labels,
                         initial_features_test),
                     total=len(test_instances), ncols=50)):
            X_test.append(_extract_features(feature_sets, reblog_candidate,
                                            nonreblog_candidate, label,
                                            initial_features=initial,
                                            categories=categories,
                                            extras=extras))
            y_test.append(label)
        X_train = X
        y_train = y
    else:
        # split into train/test
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.1, random_state=12345)

    X_train = features_vectorizer.fit_transform(X_train)
    X_train = features_scaler.fit_transform(X_train)
    X_test = features_vectorizer.transform(X_test)
    X_test = features_scaler.transform(X_test)

    # Save feature vectorizer for error analysis
    if output_dirpath and model_name:
        outpath = os.path.join(
            output_dirpath, 'output', 'feature_vectorizers',
            f'{model_name.replace("/", "_").replace(" ", "_")}_feature_vec.pkl')
        if not os.path.exists(
                os.path.join(output_dirpath, 'output', 'feature_vectorizers')):
            os.mkdir(os.path.join(output_dirpath, 'output', 'feature_vectorizers'))
        with open(outpath, 'wb') as f:
            pickle.dump(features_vectorizer, f)

    # Save features
    if save and output_dirpath and model_name:
        dirpath = os.path.join(output_dirpath, 'output', 'features')
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        outpath = os.path.join(
            dirpath,
            f'{model_name.replace("/", "_").replace(" ", "_")}_features.pkl')
        with open(outpath, 'wb') as f:
            pickle.dump((X_train, y_train, X_test, y_test), f)

    # Save row indices of instances kept
    #if data_dirpath and model_name:
    #    outpath = os.path.join(data_dirpath, 'output', f'{model_name.replace("/", "_").replace(" ", "_")}_instances_kept.txt')
    #    with open(outpath, 'w') as f:
    #        for i in keep_indices:
    #            f.write(f"{i}\n")

    return X_train, y_train, X_test, y_test, X, features_vectorizer
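A minimal sketch of the vectorize-then-scale step at the heart of the function above, on hypothetical toy feature dicts; `StandardScaler(with_mean=False)` is used because centering would densify the sparse matrix that `DictVectorizer` produces:

from sklearn import feature_extraction, preprocessing

toy_X = [{'n_followees': 3, 'lang=en': 1}, {'n_followees': 10}]
vec = feature_extraction.DictVectorizer()                # sparse output by default
scaler = preprocessing.StandardScaler(with_mean=False)   # safe on sparse input
X_train = scaler.fit_transform(vec.fit_transform(toy_X))
# at test time: scaler.transform(vec.transform(test_dicts))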
def __init__(self):
    """Initialization of the dataset: configure predefined columns."""
    # Local copies of the KDD Cup '99 data files
    # (the original opened these paths with Python 2's urllib.urlopen)
    self.train_data_from_text = open('kddcup.data_10_percent_corrected')
    self.test_data_from_text = open('corrected')

    kdd_columns = [
        'Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes',
        'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'Count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'Class'
    ]

    # Train and test data read into frames
    self.class_train = pd.read_csv(self.train_data_from_text, quotechar=',',
                                   skipinitialspace=True, names=kdd_columns)
    self.class_test = pd.read_csv(self.test_data_from_text, quotechar=',',
                                  skipinitialspace=True, names=kdd_columns)

    # Relabel train classes as normal or attack
    self.class_train.loc[(self.class_train['Class'] != 'normal.'), 'Class'] = 'attack'
    self.class_train.loc[(self.class_train['Class'] == 'normal.'), 'Class'] = 'normal'

    # Relabel test classes as normal or attack
    self.class_test.loc[(self.class_test['Class'] != 'normal.'), 'Class'] = 'attack'
    self.class_test.loc[(self.class_test['Class'] == 'normal.'), 'Class'] = 'normal'

    # Trainset encoding/decoding
    self.attribute_encoder = feature_extraction.DictVectorizer(sparse=False)
    self.label_encoder = preprocessing.LabelEncoder()
    self.neighbors = 5

    self.train_data_dataframe = self.attribute_encoder.fit_transform(
        self.class_train.iloc[:, :-1].T.to_dict().values())
    self.train_target_dataframe = self.label_encoder.fit_transform(
        self.class_train.iloc[:, -1])
    self.train_data_decoded = pd.DataFrame(self.train_data_dataframe)
    self.train_target_decoded = pd.DataFrame(self.train_target_dataframe)

    self.test_data_dataframe = self.attribute_encoder.transform(
        self.class_test.iloc[:, :-1].T.to_dict().values())
    self.test_target_dataframe = self.label_encoder.transform(
        self.class_test.iloc[:, -1])
    self.test_data_decoded = pd.DataFrame(self.test_data_dataframe)
    self.test_target_decoded = pd.DataFrame(self.test_target_dataframe)

    self.usedThresholds = {}
    self.Tree = np.ones((1000, 1))
    self.Thresholds = np.ones((1000, 1))
    self.decisions = {}
    self.Tree = -1 * self.Tree
    for i in range(0, 29):
        self.usedThresholds[i] = set()

    print("************************************************")
    print("Train Data Dimensions Without Feature Selections")
    print(self.train_data_decoded.shape)
    print("Test Data Dimensions Without Feature Selections")
    print(self.test_data_decoded.shape)
    print("************************************************")
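A minimal sketch of the row-dict encoding trick used above, on a hypothetical two-row frame: `df.T.to_dict().values()` yields one `{column: value}` dict per row, which `DictVectorizer` can one-hot encode; `df.to_dict('records')` is the more direct spelling of the same thing:

import pandas as pd
from sklearn import feature_extraction

df = pd.DataFrame({'protocol_type': ['tcp', 'udp'], 'Duration': [0, 5]})
enc = feature_extraction.DictVectorizer(sparse=False)
X = enc.fit_transform(df.T.to_dict().values())
# equivalent: enc.fit_transform(df.to_dict('records'))
# columns: ['Duration', 'protocol_type=tcp', 'protocol_type=udp']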
# --- Convert User Features ---
print("Converting user features")

# Extract user data and user_ids
user_ids = list(users_in_graph)
user_attributes = [
    {k: v for k, v in user_data_raw[uid].items() if k in user_feature_names}
    for uid in user_ids
]

# Preprocess user features using scikit-learn.
# Note we use a nonlinear transform because the user features are mostly
# counts, which are highly non-normal.
uf_extract = feature_extraction.DictVectorizer(sparse=use_sparse)
uf_transform = preprocessing.FunctionTransformer(np.log1p, np.expm1)
uf_encoder = pipeline.Pipeline([("extract", uf_extract),
                                ("scale", uf_transform)])
user_features = uf_encoder.fit_transform(user_attributes)

# Create a Pandas dataframe to store features
user_features = pd.DataFrame(user_features, index=user_ids)
del user_attributes

# Get user targets:
# 'elite' attribute is a comma separated list of years that they are elite
# target_data = [
#     {k: 1 for k in user_data_raw[uid][user_target_name].split(", ")}
#     for uid in user_ids
# ]
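A minimal self-contained sketch of the same extract-then-log pipeline on hypothetical count dicts; `log1p` tames the heavy right tail of count features, and `expm1` is registered as its inverse:

import numpy as np
from sklearn import feature_extraction, pipeline, preprocessing

counts = [{'review_count': 3, 'fans': 0}, {'review_count': 1200, 'fans': 55}]
enc = pipeline.Pipeline([
    ("extract", feature_extraction.DictVectorizer(sparse=False)),
    ("scale", preprocessing.FunctionTransformer(np.log1p, np.expm1)),
])
X = enc.fit_transform(counts)  # log1p-scaled dense matrix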
def __init__(self, config, rouge):
    super(Regression, self).__init__(config, rouge)
    # normalize= was deprecated in scikit-learn 1.0 and removed in 1.2
    self.model = linear_model.LinearRegression(normalize=True)
    self.vectorizer = feature_extraction.DictVectorizer()
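A minimal usage sketch of this vectorizer/model pair, with hypothetical sentence-feature dicts and ROUGE-like regression targets:

from sklearn import feature_extraction, linear_model

vectorizer = feature_extraction.DictVectorizer()
model = linear_model.LinearRegression()
feats = [{'length': 12, 'tf_idf_sum': 0.8}, {'length': 7, 'tf_idf_sum': 2.1}]
model.fit(vectorizer.fit_transform(feats), [0.31, 0.64])  # targets, e.g. ROUGE scores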
# (the statements building FullDescription / FullDescription_test from the raw
# text columns are truncated in the source; the surviving fragment is
# `... regex=True).values`)
data['LocationNormalized'].fillna('nan', inplace=True)  # fillna(inplace=True) returns None;
data['ContractTime'].fillna('nan', inplace=True)        # don't assign its result
SalaryNormalized = data['SalaryNormalized'].values

TFD = feature_extraction.text.TfidfVectorizer(min_df=5)
for x in range(len(FullDescription)):
    FullDescription[x] = FullDescription[x].lower()
for x in range(len(FullDescription_test)):
    FullDescription_test[x] = FullDescription_test[x].lower()
TFD_FullDescription = TFD.fit_transform(FullDescription)
TFD_FullDescription_test = TFD.transform(FullDescription_test)

DV = feature_extraction.DictVectorizer()
data_categ = DV.fit_transform(
    data[['LocationNormalized', 'ContractTime']].to_dict('records'))
test_categ = DV.transform(
    test[['LocationNormalized', 'ContractTime']].to_dict('records'))

Xtrain = scipy.sparse.hstack([TFD_FullDescription, data_categ])
Xtest = scipy.sparse.hstack([TFD_FullDescription_test, test_categ])
#new_data = scipy.sparse.hstack(TFD_FullDescription, data_categ)

R = Ridge(alpha=1, random_state=241)  #fit_intercept=False, solver='lsqr')
R.fit(Xtrain, SalaryNormalized)
print(np.round(R.predict(Xtest), 2))
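A minimal sketch of the combine step above: `scipy.sparse.hstack` joins the TF-IDF block and the one-hot block into one sparse design matrix. The two-row toy corpus is a made-up example:

import scipy.sparse
from sklearn import feature_extraction

texts = ['junior data engineer', 'senior sales manager']
cats = [{'LocationNormalized': 'London', 'ContractTime': 'permanent'},
        {'LocationNormalized': 'nan', 'ContractTime': 'contract'}]
tfidf_block = feature_extraction.text.TfidfVectorizer().fit_transform(texts)
onehot_block = feature_extraction.DictVectorizer().fit_transform(cats)
X = scipy.sparse.hstack([tfidf_block, onehot_block])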
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it,
    and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of the number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        es_patience: Early-stopping patience
        dropout: The dropout (0->1)
        target_name: Name of the target column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model
    # will use as input. The CORA dataset contains attributes 'w_x' that correspond
    # to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=1000, random_state=523214
    )

    # Create generators that feed data from the graph to the model
    generator = FullBatchNodeGenerator(G, method="gat")
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model()

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )
    print(model.summary())

    # Train model: callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_acc", patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        inputs_train, y_train = train_gen[0]
        # Get the validation data
        inputs_val, y_val = val_gen[0]
        history = model.fit(
            x=inputs_train,
            y=y_train,
            shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=(inputs_val, y_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(x=inputs_val, y=y_val)
    else:
        val_metrics = model.evaluate_generator(val_gen)
    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if args.interface == "fit":
        inputs_test, y_test = generator.flow(test_nodes, test_targets)[0]
        test_metrics = model.evaluate(x=inputs_test, y=y_test)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets)
        )
    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Remove singleton batch dimension
    all_predictions = np.squeeze(all_predictions)

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=list(G.nodes())
    )
    accuracy = np.mean(
        [
            "subject=" + gt_subject == p
            for gt_subject, p in zip(
                node_data["subject"], node_predictions.idxmax(axis=1)
            )
        ]
    )
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout, learning_rate
    )
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
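A minimal sketch of the target round-trip used above: `DictVectorizer(sparse=False)` one-hot encodes the subject labels, and `inverse_transform` maps prediction vectors back to 'subject=...'-keyed dicts, which is why the accuracy check compares against `'subject=' + gt_subject`:

from sklearn import feature_extraction

enc = feature_extraction.DictVectorizer(sparse=False)
onehot = enc.fit_transform([{'subject': 'AI'}, {'subject': 'ML'}])
# columns: ['subject=AI', 'subject=ML']
enc.inverse_transform(onehot)  # [{'subject=AI': 1.0}, {'subject=ML': 1.0}]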
def _train_model(self, gnx, train_data, test_data, all_features, target_feature_name):
    subject_groups_train = Counter(train_data[target_feature_name])
    subject_groups_test = Counter(test_data[target_feature_name])

    graph = sg.StellarGraph(gnx, node_features=all_features)

    output_results = {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'subject_groups_train': subject_groups_train,
        'subject_groups_test': subject_groups_test,
        'graph_info': graph.info()
    }

    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

    # One-hot encode the targets
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[[target_feature_name]].to_dict('records'))

    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data[target_feature_name].to_list()),
        y=train_data[target_feature_name].to_list())
    class_weights = dict(enumerate(class_weights))

    test_targets = target_encoding.transform(
        test_data[[target_feature_name]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graph_sage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    print('building model...')
    x_inp, x_out = graph_sage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print('compiling model...')
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=['acc', metrics.categorical_accuracy],
    )

    print('training the model...')
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit(
        train_gen,
        epochs=self.num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate(test_gen)
    print('Test Set Metrics:')
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        'Predicted': results,
        'True': test_data[target_feature_name]
    })
    clean_result_labels = df['Predicted'].map(
        lambda x: x.replace('subject=', ''))

    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values, clean_result_labels.values,
        average=None, labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })

    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    output_results['history'] = {
        'epochs': history.epoch,
        'training_log': history.history,
        'training_params': history.params
    }

    return generator, model, x_inp, x_out, history, target_encoding, output_results
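A minimal sketch of the balanced class-weight computation used above, on a hypothetical skewed label list:

import numpy as np
from sklearn.utils import class_weight

y = ['A', 'A', 'A', 'B']
weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y)
# n_samples / (n_classes * count): A -> 4/(2*3) ~= 0.67, B -> 4/(2*1) = 2.0
class_weights = dict(enumerate(weights))  # index-keyed, as Keras fit() expects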
def build_all_2():
    print('For each class, we build all the trees and save them in CSVs')
    path_to_save = '../data/test/try'
    """
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative')
    write_tree_in_csv(arg_trees)
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative')
    write_tree_in_csv(inf_trees)
    des_trees = []

    # Careful: contains pairs of (tree, tree_ID) where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0: 'narrative', 1: 'argumentative', 2: 'informative', 3: 'descriptive'}
    T = [t[0] for t in all_trees]
    pickle.dump(T, open(path_to_save + 'trees.pkl', 'wb'))
    """
    T = pickle.load(open('../data/trees_with_labels.pkl', 'rb'))
    T = [t[0] for t in T]
    """
    y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array(y_nar + y_arg + y_inf + y_des)
    pickle.dump(y, open(path_to_save + 'labels.pkl', 'wb'))
    """
    index = ['bin', 'count', 'norm', 'height', 'tfid']

    print('Dicts')
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    D_all = {'bin': D_bin, 'count': D_count, 'norm': D_norm,
             'height': D_height, 'tfid': D_tfid}
    pickle.dump(D_all, open(path_to_save + 'dicts.pkl', 'wb'))

    print('Vects')
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)
    V_all = {'bin': V_bin, 'count': V_count, 'norm': V_norm,
             'height': V_height, 'tfid': V_tfid}
    pickle.dump(V_all, open(path_to_save + 'vects.pkl', 'wb'))
    #Y = vectorizer.inverse_transform(V_bin)

    print('Kernels')
    ## tree kernels
    #max_depth = 15
    #T_p = [ctree.prune(t, max_depth) for t in T]
    #K_tree = kernels.compute_gram(T_p, T_p, kernels.tree_kernel)
    #pickle.dump(K_tree, open(path_to_save + 'tree_kernel.pkl'))

    print('vector kernels')
    print('linear')
    K_bin_lin = pairwise.linear_kernel(V_bin)
    K_count_lin = pairwise.linear_kernel(V_count)
    K_norm_lin = pairwise.linear_kernel(V_norm)
    K_height_lin = pairwise.linear_kernel(V_height)
    K_tfid_lin = pairwise.linear_kernel(V_tfid)
    K_all_lin = {'bin': K_bin_lin, 'count': K_count_lin, 'norm': K_norm_lin,
                 'height': K_height_lin, 'tfid': K_tfid_lin}

    print('rbf')
    K_bin_rbf = pairwise.rbf_kernel(V_bin)
    K_count_rbf = pairwise.rbf_kernel(V_count)
    K_norm_rbf = pairwise.rbf_kernel(V_norm)
    K_height_rbf = pairwise.rbf_kernel(V_height)
    K_tfid_rbf = pairwise.rbf_kernel(V_tfid)
    K_all_rbf = {'bin': K_bin_rbf, 'count': K_count_rbf, 'norm': K_norm_rbf,
                 'height': K_height_rbf, 'tfid': K_tfid_rbf}

    print('cosine sim')
    K_bin_cos_sim = pairwise.cosine_similarity(V_bin)
    K_count_cos_sim = pairwise.cosine_similarity(V_count)
    K_norm_cos_sim = pairwise.cosine_similarity(V_norm)
    K_height_cos_sim = pairwise.cosine_similarity(V_height)
    K_tfid_cos_sim = pairwise.cosine_similarity(V_tfid)
    K_all_cos_sim = {'bin': K_bin_cos_sim, 'count': K_count_cos_sim,
                     'norm': K_norm_cos_sim, 'height': K_height_cos_sim,
                     'tfid': K_tfid_cos_sim}

    print('euclidean distance')
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin, metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count, metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm, metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height, metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid, metric='euclidean')
    K_all_eucl_dist = {'bin': K_bin_eucl_dist, 'count': K_count_eucl_dist,
                       'norm': K_norm_eucl_dist, 'height': K_height_eucl_dist,
                       'tfid': K_tfid_eucl_dist}

    print('minkowski distance')
    K_bin_mink_dist = pairwise.pairwise_distances(V_bin, metric='minkowski')
    K_count_mink_dist = pairwise.pairwise_distances(V_count, metric='minkowski')
    K_norm_mink_dist = pairwise.pairwise_distances(V_norm, metric='minkowski')
    K_height_mink_dist = pairwise.pairwise_distances(V_height, metric='minkowski')
    K_tfid_mink_dist = pairwise.pairwise_distances(V_tfid, metric='minkowski')
    K_all_mink_dist = {'bin': K_bin_mink_dist, 'count': K_count_mink_dist,
                       'norm': K_norm_mink_dist, 'height': K_height_mink_dist,
                       'tfid': K_tfid_mink_dist}

    K_all = {'lin': K_all_lin, 'rbf': K_all_rbf, 'cos_sim': K_all_cos_sim,
             'eucl_dist': K_all_eucl_dist, 'mink_dist': K_all_mink_dist}
    pickle.dump(K_all, open(path_to_save + 'vect_kernels.pkl', 'wb'))

    print("done")
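A minimal sketch of the kernel computations above on a tiny dense matrix; the toy production-count dicts are made up:

from sklearn import feature_extraction
from sklearn.metrics import pairwise

V = feature_extraction.DictVectorizer(sparse=False).fit_transform(
    [{'NP': 2, 'VP': 1}, {'NP': 1, 'PP': 3}])
K_lin = pairwise.linear_kernel(V)   # Gram matrix of dot products
K_rbf = pairwise.rbf_kernel(V)      # exp(-gamma * ||x - y||^2)
D_euc = pairwise.pairwise_distances(V, metric='euclidean')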
def build_all():
    # For each class, we build all the trees and save them in CSVs
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative/')
    write_tree_in_csv(arg_trees)
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    write_tree_in_csv(inf_trees)
    des_trees = []
    #des_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    #write_tree_in_csv(des_trees)

    # Careful: contains pairs of (tree, tree_ID) where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0: 'narrative', 1: 'argumentative', 2: 'informative', 3: 'descriptive'}

    path_to_save = '~/Documents/s2/tal/discourseAnalysis/data/'
    y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array(y_nar + y_arg + y_inf + y_des)
    pickle.dump(y, open(path_to_save + 'labels_test.pkl', 'wb'))

    T = [t[0] for t in all_trees]
    pickle.dump(T, open(path_to_save + 'trees_test.pkl', 'wb'))

    index = ['bin', 'count', 'norm', 'height', 'tfid']

    # Dicts
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    D_df = pd.DataFrame([D_bin, D_count, D_norm, D_height, D_tfid], index=index)
    D_df = D_df.transpose()
    D_df.to_pickle(path_to_save + 'dicts_test.pkl')

    # Vects
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)
    V_all = np.array([V_bin, V_count, V_norm, V_height, V_tfid])
    V_df = []
    for i in range(V_all.shape[1]):
        d = {}
        for j, v in enumerate(V_all[:, i]):
            d[index[j]] = v
        V_df.append(d)
    V_df = pd.DataFrame(V_df)
    V_df.to_pickle(path_to_save + 'vects_test.pkl')

    # euclidean distance
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin, metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count, metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm, metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height, metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid, metric='euclidean')
    K_all_eucl_dist = [K_bin_eucl_dist, K_count_eucl_dist, K_norm_eucl_dist,
                       K_height_eucl_dist, K_tfid_eucl_dist]
    K_all = {'eucl_dist': K_all_eucl_dist}
    pickle.dump(K_all, open(path_to_save + 'kernels_test.pkl', 'wb'))
def onehot(csv):
    """Fit a DictVectorizer on the configured categorical columns."""
    records = csv[gconfig['categorial_features']].to_dict(orient='records')
    dv = feature_extraction.DictVectorizer(separator='_', sparse=False)
    dv.fit(records)
    return dv
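A usage sketch under the module's own assumptions (`gconfig` and a dataframe with the configured columns exist; `train_csv` is a hypothetical name): fit once, then transform any split with the same columns. Note `separator='_'` yields feature names like 'city_London' rather than the default 'city=London'.

dv = onehot(train_csv)
X_cat = dv.transform(
    train_csv[gconfig['categorial_features']].to_dict(orient='records'))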