Example #1
from sklearn import feature_extraction

data = [{
    "weight": 60.,
    "sex": 'female',
    "student": True
}, {
    "weight": 80.1,
    "sex": 'male',
    "student": False
}, {
    "weight": 65.3,
    "sex": 'male',
    "student": True
}, {
    "weight": 58.5,
    "sex": 'female',
    "student": False
}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)

vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names())
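For reference, a minimal sketch (not part of the original snippet) of reusing the fitted vectorizer on new records: feature/value pairs unseen during fit are silently ignored by transform().

new_records = [{
    "weight": 70.0,
    "sex": 'female',
    "student": False
}, {
    "weight": 72.4,
    "sex": 'other',  # category not seen during fit
    "student": True
}]

# transform() maps onto the columns learned by fit_transform();
# the unseen category 'sex=other' is simply dropped.
print(vectorizer.transform(new_records))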
Example #2
# Assumed imports for this snippet: pandas, scikit-learn, and the pyteomics
# sequence utilities (parser, electrochem, mass) used below.
import pandas as pd
from sklearn import feature_extraction
from pyteomics import parser, electrochem, mass


def handcrafted_features(data, tags):

    #
    # DOI 10.1007/s00251-017-1023-5
    # Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    # Modified to apply handcrafted features twice, once to the alpha chain and again to the beta chain
    # Modified to handle split for training, validation, and test cohorts
    # Modified for multinomial classification
    #

    # physicochemical amino acid properties
    basicity = {
        'A': 206.4,
        'B': 210.7,
        'C': 206.2,
        'D': 208.6,
        'E': 215.6,
        'F': 212.1,
        'G': 202.7,
        'H': 223.7,
        'I': 210.8,
        'K': 221.8,
        'L': 209.6,
        'M': 213.3,
        'N': 212.8,
        'P': 214.4,
        'Q': 214.2,
        'R': 237.0,
        'S': 207.6,
        'T': 211.7,
        'V': 208.7,
        'W': 216.1,
        'X': 210.2,
        'Y': 213.1,
        'Z': 214.9
    }

    hydrophobicity = {
        'A': 0.16,
        'B': -3.14,
        'C': 2.50,
        'D': -2.49,
        'E': -1.50,
        'F': 5.00,
        'G': -3.31,
        'H': -4.63,
        'I': 4.41,
        'K': -5.00,
        'L': 4.76,
        'M': 3.23,
        'N': -3.79,
        'P': -4.92,
        'Q': -2.76,
        'R': -2.77,
        'S': -2.85,
        'T': -1.08,
        'V': 3.02,
        'W': 4.88,
        'X': 4.59,
        'Y': 2.00,
        'Z': -2.13
    }

    helicity = {
        'A': 1.24,
        'B': 0.92,
        'C': 0.79,
        'D': 0.89,
        'E': 0.85,
        'F': 1.26,
        'G': 1.15,
        'H': 0.97,
        'I': 1.29,
        'K': 0.88,
        'L': 1.28,
        'M': 1.22,
        'N': 0.94,
        'P': 0.57,
        'Q': 0.96,
        'R': 0.95,
        'S': 1.00,
        'T': 1.09,
        'V': 1.27,
        'W': 1.07,
        'X': 1.29,
        'Y': 1.11,
        'Z': 0.91
    }

    mutation_stability = {
        'A': 13,
        'C': 52,
        'D': 11,
        'E': 12,
        'F': 32,
        'G': 27,
        'H': 15,
        'I': 10,
        'K': 24,
        'L': 34,
        'M': 6,
        'N': 6,
        'P': 20,
        'Q': 10,
        'R': 17,
        'S': 10,
        'T': 11,
        'V': 17,
        'W': 55,
        'Y': 31
    }

    # feature conversion and generation
    features_list = []

    for chain in ['tra', 'trb']:

        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        features_list.append(
            pd.DataFrame(onehot_encoder.fit_transform(
                data[[chain + '_vgene',
                      chain + '_jgene']].to_dict(orient='records')),
                         columns=onehot_encoder.feature_names_))

        # sequence length
        features_list.append(data[chain + '_cdr3'].apply(
            lambda sequence: parser.length(sequence)).to_frame().rename(
                columns={chain + '_cdr3': 'length'}))

        # number of occurrences of each amino acid
        aa_counts = pd.DataFrame.from_records([
            parser.amino_acid_composition(sequence)
            for sequence in data[chain + '_cdr3']
        ]).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average) hydrophobicity,
        #                             (average) helicity, pI, (average) mutation stability
        features_list.append(
            data[chain +
                 '_cdr3'].apply(lambda seq: sum([basicity[aa] for aa in seq]) /
                                parser.length(seq)).to_frame().rename(
                                    columns={chain + '_cdr3': 'avg_basicity'}))
        features_list.append(data[chain + '_cdr3'].apply(lambda seq: sum(
            [hydrophobicity[aa] for aa in seq]) / parser.length(seq)).to_frame(
            ).rename(columns={chain + '_cdr3': 'avg_hydrophobicity'}))
        features_list.append(
            data[chain +
                 '_cdr3'].apply(lambda seq: sum([helicity[aa] for aa in seq]) /
                                parser.length(seq)).to_frame().rename(
                                    columns={chain + '_cdr3': 'avg_helicity'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: electrochem.pI(seq)).to_frame().rename(
                columns={chain + '_cdr3': 'pI'}))
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: sum([mutation_stability[aa] for aa in seq]) / parser.
            length(seq)).to_frame().rename(
                columns={chain + '_cdr3': 'avg_mutation_stability'}))

        # peptide mass
        features_list.append(data[chain + '_cdr3'].apply(
            lambda seq: mass.fast_mass(seq)).to_frame().rename(
                columns={chain + '_cdr3': 'mass'}))

        # positional features
        # amino acid occurrence and physicochemical properties at a given position from the center
        pos_aa, pos_basicity, pos_hydro, pos_helicity, pos_pI, pos_mutation = [
            [] for _ in range(6)
        ]
        for sequence in data[chain + '_cdr3']:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            pos_range = list(range(start_pos, start_pos + length)) if length % 2 == 1 else\
              list(range(start_pos, 0)) + list(range(1, start_pos + length + 1))

            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in zip(pos_range, sequence)
            })
            pos_basicity.append({
                chain + '_pos_{}_basicity'.format(pos): basicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_hydro.append({
                chain + '_pos_{}_hydrophobicity'.format(pos):
                hydrophobicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_helicity.append({
                chain + '_pos_{}_helicity'.format(pos): helicity[aa]
                for pos, aa in zip(pos_range, sequence)
            })
            pos_pI.append({
                chain + '_pos_{}_pI'.format(pos): electrochem.pI(aa)
                for pos, aa in zip(pos_range, sequence)
            })
            pos_mutation.append({
                chain + '_pos_{}_mutation_stability'.format(pos):
                mutation_stability[aa]
                for pos, aa in zip(pos_range, sequence)
            })

        features_list.append(pd.DataFrame.from_records(pos_aa).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_basicity).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_hydro).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_helicity).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_pI).fillna(0))
        features_list.append(pd.DataFrame.from_records(pos_mutation).fillna(0))

    features_list.append(data['weights'])
    for tag in tags:
        features_list.append(data['labels_' + tag])
    features_list.append(data['split'])

    # combine all features
    data_processed = pd.concat(features_list, axis=1)

    return data_processed
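A small standalone check, added for illustration, of the centered position indexing used for the positional features above: odd-length sequences get a position 0, while even-length sequences skip it.

def center_positions(length):
    # mirrors the pos_range computation inside handcrafted_features()
    start_pos = -1 * (length // 2)
    return list(range(start_pos, start_pos + length)) if length % 2 == 1 else \
        list(range(start_pos, 0)) + list(range(1, start_pos + length + 1))

print(center_positions(5))  # [-2, -1, 0, 1, 2]
print(center_positions(4))  # [-2, -1, 1, 2]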
Example #3
def extract_features(feature_sets,
                     instances,
                     instance_labels,
                     identity_categories,
                     test_instances=None,
                     test_instance_labels=None,
                     remove_zeros=False,
                     initialization=None,
                     test_initialization=None,
                     categories=['all'],
                     model_name=None,
                     output_dirpath=None,
                     save=False,
                     extras=[],
                     force=False):
    """
        Main feature extraction function that selects utility feature extractors.

        Args:
            remove_zeros: whether to remove instances where the follower and all of their followees have zero presence of the category
            force: If True, force extraction of new features. Otherwise, will load saved features if available.
    """
    # Try loading data
    if output_dirpath and model_name and not force:
        features_fpath = os.path.join(
            output_dirpath, 'output', 'features',
            f'{model_name.replace("/", "_").replace(" ", "_")}_features.pkl')
        vectorizer_fpath = os.path.join(
            output_dirpath, 'output', 'feature_vectorizers',
            f'{model_name.replace("/", "_").replace(" ", "_")}_feature_vec.pkl'
        )

        if os.path.exists(features_fpath):
            with open(features_fpath, 'rb') as f:
                X_train, y_train, X_test, y_test = pickle.load(f)
            with open(vectorizer_fpath, 'rb') as f:
                features_vectorizer = pickle.load(f)

            return X_train, y_train, X_test, y_test, vstack(
                [X_train, X_test]), features_vectorizer

    feature_set_extractors = {
        'post_baseline': extract_features_post_baseline,
        'experiment1': extract_features_experiment_1,
        'experiment2': extract_features_experiment_2,
        'experiment3': extract_features_experiment_3,
    }

    X = []
    y = []

    if categories == ['all']:
        categories = identity_categories

    if remove_zeros:
        # Build hashmap of followers that have zero presence of the category and all their followees do, too, for each category
        category_user_remove = variance_analysis(instances,
                                                 identity_categories)
        remove_ids = set.intersection(
            *[set(category_user_remove[c]) for c in categories])

    def _extract_features(feature_sets,
                          reblog_candidate,
                          nonreblog_candidate,
                          label,
                          initial_features={},
                          categories=categories,
                          extras=extras):
        instance_features = initial_features
        for feature_set in feature_sets:
            instance_features.update(feature_set_extractors[feature_set](
                reblog_candidate,
                nonreblog_candidate,
                label,
                categories=categories,
                extras=extras))

        return instance_features

    if initialization:
        initial_features = initialization
    else:
        initial_features = [{} for _ in range(len(instances))]

    if test_instances:
        initial_features_test = test_initialization

    keep_indices = []

    # Extract features for individual reblog/nonreblog pairings
    for i, ((reblog_candidate, nonreblog_candidate), label,
            initial) in enumerate(
                tqdm(zip(instances, instance_labels, initial_features),
                     total=len(instances),
                     ncols=50)):

        if remove_zeros:
            follower_id = reblog_candidate['tumblog_id_follower']
            if follower_id not in remove_ids:
                X.append(
                    _extract_features(feature_sets,
                                      reblog_candidate,
                                      nonreblog_candidate,
                                      label,
                                      initial_features=initial,
                                      categories=categories,
                                      extras=extras))
                y.append(label)
                keep_indices.append(i)

        else:
            X.append(
                _extract_features(feature_sets,
                                  reblog_candidate,
                                  nonreblog_candidate,
                                  label,
                                  initial_features=initial,
                                  categories=categories,
                                  extras=extras))
            y.append(label)

    features_vectorizer = feature_extraction.DictVectorizer()
    features_scaler = preprocessing.StandardScaler(
        with_mean=False)  # normalization standard scaler

    if test_instances:
        X_test = []
        y_test = []

        # Extract features for individual reblog/nonreblog pairings
        for i, ((reblog_candidate, nonreblog_candidate), label,
                initial) in enumerate(
                    tqdm(zip(test_instances, test_instance_labels,
                             initial_features_test),
                         total=len(test_instances),
                         ncols=50)):

            X_test.append(
                _extract_features(feature_sets,
                                  reblog_candidate,
                                  nonreblog_candidate,
                                  label,
                                  initial_features=initial,
                                  categories=categories,
                                  extras=extras))
            y_test.append(label)

        X_train = X
        y_train = y

    else:
        # split into train/test
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.1, random_state=12345)

    X_train = features_vectorizer.fit_transform(X_train)
    X_train = features_scaler.fit_transform(X_train)
    X_test = features_vectorizer.transform(X_test)
    X_test = features_scaler.transform(X_test)

    # Save feature vectorizer for error analysis
    if output_dirpath and model_name:
        outpath = os.path.join(
            output_dirpath, 'output', 'feature_vectorizers',
            f'{model_name.replace("/", "_").replace(" ", "_")}_feature_vec.pkl'
        )
        if not os.path.exists(
                os.path.join(output_dirpath, 'output', 'feature_vectorizers')):
            os.mkdir(
                os.path.join(output_dirpath, 'output', 'feature_vectorizers'))
        with open(outpath, 'wb') as f:
            pickle.dump(features_vectorizer, f)

    # Save features
    if save and output_dirpath and model_name:
        dirpath = os.path.join(output_dirpath, 'output', 'features')
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        outpath = os.path.join(
            dirpath,
            f'{model_name.replace("/", "_").replace(" ", "_")}_features.pkl')
        with open(outpath, 'wb') as f:
            pickle.dump((X_train, y_train, X_test, y_test), f)

    # Save row indices of instances kept
    #if data_dirpath and model_name:
    #    outpath = os.path.join(data_dirpath, 'output', f'{model_name.replace("/", "_").replace(" ", "_")}_instances_kept.txt')
    #    with open(outpath, 'w') as f:
    #        for i in keep_indices:
    #            f.write(f"{i}\n")

    return X_train, y_train, X_test, y_test, X, features_vectorizer
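One detail worth calling out from the snippet above: StandardScaler(with_mean=False) is used because DictVectorizer() defaults to sparse output, and mean-centering a sparse matrix would densify it (scikit-learn refuses with_mean=True on sparse input). A minimal sketch of the same vectorize-then-scale pattern on toy dicts, for illustration only:

from sklearn import feature_extraction, preprocessing

toy = [{'country': 'US', 'age': 23}, {'country': 'FR', 'age': 31}]
vec = feature_extraction.DictVectorizer()               # sparse output by default
scaler = preprocessing.StandardScaler(with_mean=False)  # with_mean=True would raise on sparse input

X = scaler.fit_transform(vec.fit_transform(toy))
print(vec.feature_names_)
print(X.toarray())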
Example #4
    def __init__(self):
        """ Initalization of dataset configure predefined columns
		"""
        self.train_data_from_text = urllib.urlopen(
            'kddcup.data_10_percent_corrected')
        self.test_data_from_text = urllib.urlopen('corrected')
        """ Train data read from frame """
        self.class_train = pd.read_csv(
            self.train_data_from_text,
            quotechar=',',
            skipinitialspace=True,
            names=[
                'Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes',
                'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot',
                'num_failed_logins', 'logged_in', 'num_compromised',
                'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
                'num_shells', 'num_access_files', 'num_outbound_cmds',
                'is_host_login', 'is_guest_login', 'Count', 'srv_count',
                'serror_rate', 'srv_serror_rate', 'rerror_rate',
                'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
                'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
                'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Class'
            ])
        self.class_test = pd.read_csv(
            self.test_data_from_text,
            quotechar=',',
            skipinitialspace=True,
            names=[
                'Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes',
                'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot',
                'num_failed_logins', 'logged_in', 'num_compromised',
                'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
                'num_shells', 'num_access_files', 'num_outbound_cmds',
                'is_host_login', 'is_guest_login', 'Count', 'srv_count',
                'serror_rate', 'srv_serror_rate', 'rerror_rate',
                'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
                'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
                'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Class'
            ])
        """ Change of train classes by tagging normal or attack """
        self.class_train.loc[(self.class_train['Class'] != 'normal.'),
                             'Class'] = 'attack'
        self.class_train.loc[(self.class_train['Class'] == 'normal.'),
                             'Class'] = 'normal'
        """ Change of test classes of tagging normal or attack """
        self.class_test.loc[(self.class_test['Class'] != 'normal.'),
                            'Class'] = 'attack'
        self.class_test.loc[(self.class_test['Class'] == 'normal.'),
                            'Class'] = 'normal'
        """ Trainset encoding- decoding """
        self.attribute_encoder = feature_extraction.DictVectorizer(
            sparse=False)
        self.label_encoder = preprocessing.LabelEncoder()

        self.neighbors = 5

        self.train_data_dataframe = self.attribute_encoder.fit_transform(
            self.class_train.iloc[:, :-1].T.to_dict().values())
        self.train_target_dataframe = self.label_encoder.fit_transform(
            self.class_train.iloc[:, -1])

        self.train_data_decoded = pd.DataFrame(self.train_data_dataframe)
        self.train_target_decoded = pd.DataFrame(self.train_target_dataframe)

        self.test_data_dataframe = self.attribute_encoder.transform(
            self.class_test.iloc[:, :-1].T.to_dict().values())
        self.test_target_dataframe = self.label_encoder.transform(
            self.class_test.iloc[:, -1])

        self.test_data_decoded = pd.DataFrame(self.test_data_dataframe)
        self.test_target_decoded = pd.DataFrame(self.test_target_dataframe)
        self.usedThresholds = {}
        self.Tree = np.ones((1000, 1))
        self.Thresholds = np.ones((1000, 1))
        self.decisions = {}
        self.Tree = -1 * self.Tree
        for i in range(0, 29):
            self.usedThresholds[i] = set()

        print("************************************************")
        print("Train Data Dimensions Without Feature Selections")
        print(self.train_data_decoded.shape)
        print("Test Data Dimensions Without Feature Selections")
        print(self.test_data_decoded.shape)
        print("************************************************")
Example #5
    )

    # --- Convert User Features ---
    print("Converting user features")

    # Extract user data and user_ids
    user_ids = list(users_in_graph)
    user_attributes = [
        {k: v for k, v in user_data_raw[uid].items() if k in user_feature_names}
        for uid in user_ids
    ]

    # Preprocess user features using Scikit-Learn
    # Note we use a nonlinear transform as the user features are mostly counts,
    # which are highly non-normal.
    uf_extract = feature_extraction.DictVectorizer(sparse=use_sparse)
    uf_transform = preprocessing.FunctionTransformer(np.log1p, np.expm1)
    uf_encoder = pipeline.Pipeline([("extract", uf_extract), ("scale", uf_transform)])
    user_features = uf_encoder.fit_transform(user_attributes)

    # Create a Pandas dataframe to store features
    user_features = pd.DataFrame(user_features, index=user_ids)
    del user_attributes

    # Get user targets:

    # 'elite' attribute is a comma separated list of years that they are elite
    # target_data = [
    #     {k: 1 for k in user_data_raw[uid][user_target_name].split(", ")}
    #     for uid in user_ids
    # ]
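The pattern above, a DictVectorizer feeding a FunctionTransformer(np.log1p, ...) inside a Pipeline, is a compact way to encode dict features and then compress heavy-tailed counts. A self-contained sketch of the same idea on toy counts, for illustration:

import numpy as np
from sklearn import feature_extraction, pipeline, preprocessing

toy_counts = [{'review_count': 10, 'fans': 0}, {'review_count': 4500, 'fans': 120}]
encoder = pipeline.Pipeline([
    ("extract", feature_extraction.DictVectorizer(sparse=False)),
    ("scale", preprocessing.FunctionTransformer(np.log1p, np.expm1)),  # log1p tames large counts
])
print(encoder.fit_transform(toy_counts))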
Example #6
    def __init__(self, config, rouge):
        super(Regression, self).__init__(config, rouge)

        self.model = linear_model.LinearRegression(normalize=True)
        self.vectorizer = feature_extraction.DictVectorizer()
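A compatibility note, not part of the original: linear_model.LinearRegression(normalize=True) only works on older scikit-learn releases; the parameter was deprecated in 1.0 and removed in 1.2. On current versions the usual replacement is a scaler in front of the regressor (not a bit-for-bit equivalent of the old normalize behaviour), roughly:

from sklearn import linear_model, pipeline, preprocessing

model = pipeline.make_pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())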
Example #7
                                                       regex=True).values

# fillna(..., inplace=True) returns None, so assign the filled columns back instead
data['LocationNormalized'] = data['LocationNormalized'].fillna('nan')
data['ContractTime'] = data['ContractTime'].fillna('nan')
SalaryNormalized = data['SalaryNormalized'].values

TFD = feature_extraction.text.TfidfVectorizer(min_df=5)

for x in range(len(FullDescription)):
    FullDescription[x] = FullDescription[x].lower()
for x in range(len(FullDescription_test)):
    FullDescription_test[x] = FullDescription_test[x].lower()

TFD_FullDescription = TFD.fit_transform(FullDescription)
TFD_FullDescription_test = TFD.transform(FullDescription_test)

DV = feature_extraction.DictVectorizer()
data_categ = DV.fit_transform(data[['LocationNormalized',
                                    'ContractTime']].to_dict('records'))
test_categ = DV.transform(test[['LocationNormalized',
                                'ContractTime']].to_dict('records'))

Xtrain = scipy.sparse.hstack([TFD_FullDescription, data_categ])
Xtest = scipy.sparse.hstack([TFD_FullDescription_test, test_categ])
#new_data = scipy.sparse.hstack(TFD_FullDescription, data_categ)

R = Ridge(alpha=1, random_state=241)  #fit_intercept=False, solver='lsqr')
R.fit(Xtrain, SalaryNormalized)
print(np.round(R.predict(Xtest), 2))
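What makes the hstack above line up is that DV.transform() on the test frame reuses the column space learned from the training data; unseen categorical values simply contribute zeros in the learned columns. A small illustration (not from the original script):

train_records = [{'LocationNormalized': 'London', 'ContractTime': 'permanent'}]
test_records = [{'LocationNormalized': 'Leeds', 'ContractTime': 'permanent'}]  # 'Leeds' unseen at fit time

demo_dv = feature_extraction.DictVectorizer()
demo_dv.fit(train_records)
print(demo_dv.feature_names_)                     # columns learned from the training records
print(demo_dv.transform(test_records).toarray())  # 'LocationNormalized=Leeds' has no column, so it is dropped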
Example #8
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the graph built from the given edgelist, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout rate (0 to 1)
        es_patience: Early-stopping patience in epochs
        target_name: Name of the node attribute to use as the prediction target
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=1000, random_state=523214
    )

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = FullBatchNodeGenerator(G, method="gat")
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model()

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )
    print(model.summary())

    # Train model
    # Callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_acc", patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        inputs_train, y_train = train_gen[0]

        # Get the validation data
        inputs_val, y_val = val_gen[0]

        history = model.fit(
            x=inputs_train,
            y=y_train,
            shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=(inputs_val, y_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(x=inputs_val, y=y_val)
    else:
        val_metrics = model.evaluate_generator(val_gen)

    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if args.interface == "fit":
        inputs_test, y_test = generator.flow(test_nodes, test_targets)[0]
        test_metrics = model.evaluate(x=inputs_test, y=y_test)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets)
        )

    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Remove singleton batch dimension
    all_predictions = np.squeeze(all_predictions)

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=list(G.nodes())
    )
    accuracy = np.mean(
        [
            "subject=" + gt_subject == p
            for gt_subject, p in zip(
                node_data["subject"], node_predictions.idxmax(axis=1)
            )
        ]
    )
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout, learning_rate
    )
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
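The snippet above leans on DictVectorizer twice: fit_transform() turns the target column into one-hot vectors, and inverse_transform() maps prediction rows back to 'subject=...' scores. A tiny round-trip illustration (not from the original):

import numpy as np
from sklearn import feature_extraction

enc = feature_extraction.DictVectorizer(sparse=False)
onehot = enc.fit_transform([{"subject": "AI"}, {"subject": "ML"}, {"subject": "AI"}])
print(enc.feature_names_)                  # ['subject=AI', 'subject=ML']
fake_probs = np.array([[0.9, 0.1]])        # pretend softmax output for one node
print(enc.inverse_transform(fake_probs))   # -> [{'subject=AI': 0.9, 'subject=ML': 0.1}]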
Example #9
    def _train_model(self, gnx, train_data, test_data, all_features,
                     target_feature_name):
        subject_groups_train = Counter(train_data[target_feature_name])
        subject_groups_test = Counter(test_data[target_feature_name])

        graph = sg.StellarGraph(gnx, node_features=all_features)

        output_results = {
            'train_size': len(train_data),
            'test_size': len(test_data),
            'subject_groups_train': subject_groups_train,
            'subject_groups_test': subject_groups_test,
            'graph_info': graph.info()
        }

        num_samples = [10, 5]
        generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        train_targets = target_encoding.fit_transform(
            train_data[[target_feature_name]].to_dict('records'))
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(train_data[target_feature_name].to_list()),
            y=train_data[target_feature_name].to_list())
        class_weights = dict(enumerate(class_weights))
        test_targets = target_encoding.transform(
            test_data[[target_feature_name]].to_dict('records'))
        train_gen = generator.flow(train_data.index,
                                   train_targets,
                                   shuffle=True)
        graph_sage_model = GraphSAGE(
            layer_sizes=[80, 80],
            generator=generator,  # train_gen,
            bias=True,
            dropout=0.5,
        )
        print('building model...')

        x_inp, x_out = graph_sage_model.build()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        print('compiling model...')
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.005),
            loss=losses.categorical_crossentropy,
            metrics=['acc', metrics.categorical_accuracy],
        )
        print('training the model...')
        test_gen = generator.flow(test_data.index, test_targets)
        history = model.fit(
            train_gen,
            epochs=self.num_epochs,
            validation_data=test_gen,
            verbose=2,
            shuffle=True,
            class_weight=class_weights,
        )
        # save test metrics
        test_metrics = model.evaluate(test_gen)
        print('Test Set Metrics:')
        output_results['test_metrics'] = []
        for name, val in zip(model.metrics_names, test_metrics):
            output_results['test_metrics'].append({'name': name, 'val': val})
            print("\t{}: {:0.4f}".format(name, val))

        test_nodes = test_data.index
        test_mapper = generator.flow(test_nodes)
        test_predictions = model.predict(test_mapper)
        node_predictions = target_encoding.inverse_transform(test_predictions)
        results = pd.DataFrame(node_predictions,
                               index=test_nodes).idxmax(axis=1)
        df = pd.DataFrame({
            'Predicted': results,
            'True': test_data[target_feature_name]
        })
        clean_result_labels = df['Predicted'].map(
            lambda x: x.replace('subject=', ''))

        # save predicted labels
        pred_labels = np.unique(clean_result_labels.values)
        precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
            df['True'].values,
            clean_result_labels.values,
            average=None,
            labels=pred_labels)
        output_results['classifier'] = []
        for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
            output_results['classifier'].append({
                'label': lbl,
                'precision': prec,
                'recall': rec,
                'fscore': fm
            })

        print(output_results['classifier'])
        print(pred_labels)
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(f1))

        output_results['history'] = {
            'epochs': history.epoch,
            'training_log': history.history,
            'training_params': history.params
        }

        return generator, model, x_inp, x_out, history, target_encoding, output_results
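For reference, a self-contained illustration (not from the original) of the class-weight computation used above: with class_weight='balanced', each class gets n_samples / (n_classes * count), and dict(enumerate(...)) keys the weights by np.unique()'s sorted label order.

import numpy as np
from sklearn.utils import class_weight

y = ['AI', 'AI', 'AI', 'ML']
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y), y=y)
print(dict(zip(np.unique(y), weights)))    # {'AI': 0.666..., 'ML': 2.0}
print(dict(enumerate(weights)))            # what gets passed to model.fit(class_weight=...)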
Example #10
def build_all_2():
    print('For each class, we build all the trees and save them in CSVs')
    path_to_save = '../data/test/try'
    """
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)    
    
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative')
    write_tree_in_csv(arg_trees) 
     
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative')
    write_tree_in_csv(inf_trees) 
    
    des_trees = []

    # Note: contains pairs of (tree, tree_ID) where tree_ID is the name of the file.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0:'narrative', 1:'argumentative', 2:'informative',3:'descriptive'}

    T = [t[0] for t in all_trees]
    pickle.dump(T,open(path_to_save+'trees.pkl','wb'))"""
    T = pickle.load(open('../data/trees_with_labels.pkl','rb'))
    T = [t[0] for t in T]

    """y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array( y_nar + y_arg + y_inf + y_des )
    pickle.dump(y,open(path_to_save+'labels.pkl','wb'))"""
    
    index = ['bin','count','norm','height','tfid']

    print('Dicts')
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    
    D_all = {'bin':D_bin ,'count': D_count,'norm': D_norm,'height': D_height,'tfid': D_tfid}
    pickle.dump(D_all,open(path_to_save+'dicts.pkl','wb'))
    

    print('Vects')
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)

    V_all = {'bin':V_bin ,'count': V_count,'norm': V_norm,'height': V_height,'tfid': V_tfid}
    pickle.dump(V_all,open(path_to_save+'vects.pkl','wb'))
    
    #Y = vectorizer.inverse_transform(V_bin)



    print('Kernels')

    ## tree kernels
    #max_depth = 15
    #T_p = [ctree.prune(t,max_depth) for t in T]
    #K_tree = kernels.compute_gram(T_p,T_p,kernels.tree_kernel)
    #pickle.dump(K_tree,open(path_to_save+'tree_kernel.pkl'))

    print('vector kernels')
    print('linear')
    K_bin_lin = pairwise.linear_kernel(V_bin)
    K_count_lin = pairwise.linear_kernel(V_count)
    K_norm_lin = pairwise.linear_kernel(V_norm)
    K_height_lin = pairwise.linear_kernel(V_height)
    K_tfid_lin = pairwise.linear_kernel(V_tfid)
    K_all_lin = {'bin':K_bin_lin, 'count':K_count_lin, 'norm':K_norm_lin, 'height':K_height_lin, 'tfid':K_tfid_lin}
    print('rbf')
    K_bin_rbf = pairwise.rbf_kernel(V_bin)
    K_count_rbf = pairwise.rbf_kernel(V_count)
    K_norm_rbf = pairwise.rbf_kernel(V_norm)
    K_height_rbf = pairwise.rbf_kernel(V_height)
    K_tfid_rbf = pairwise.rbf_kernel(V_tfid)
    K_all_rbf = {'bin':K_bin_rbf, 'count':K_count_rbf, 'norm':K_norm_rbf, 'height':K_height_rbf, 'tfid':K_tfid_rbf}
    print('cosine sim')
    K_bin_cos_sim = pairwise.cosine_similarity(V_bin)
    K_count_cos_sim = pairwise.cosine_similarity(V_count)
    K_norm_cos_sim = pairwise.cosine_similarity(V_norm)
    K_height_cos_sim = pairwise.cosine_similarity(V_height)
    K_tfid_cos_sim = pairwise.cosine_similarity(V_tfid)
    K_all_cos_sim = {'bin':K_bin_cos_sim, 'count':K_count_cos_sim, 'norm':K_norm_cos_sim, 'height':K_height_cos_sim, 'tfid':K_tfid_cos_sim}
    print('euclidean distance')
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin,metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,metric='euclidean')
    K_all_eucl_dist = {'bin':K_bin_eucl_dist, 'count':K_count_eucl_dist, 'norm':K_norm_eucl_dist, 'height':K_height_eucl_dist, 'tfid':K_tfid_eucl_dist}
    print('minkowski distance')
    K_bin_mink_dist = pairwise.pairwise_distances(V_bin,metric='minkowski')
    K_count_mink_dist = pairwise.pairwise_distances(V_count,metric='minkowski')
    K_norm_mink_dist = pairwise.pairwise_distances(V_norm,metric='minkowski')
    K_height_mink_dist = pairwise.pairwise_distances(V_height,metric='minkowski')
    K_tfid_mink_dist = pairwise.pairwise_distances(V_tfid,metric='minkowski')
    K_all_mink_dist = {'bin':K_bin_mink_dist, 'count':K_count_mink_dist, 'norm':K_norm_mink_dist, 'height':K_height_mink_dist, 'tfid':K_tfid_mink_dist}


    K_all = {'lin':K_all_lin, 'rbf':K_all_rbf, 'cos_sim':K_all_cos_sim,'eucl_dist':K_all_eucl_dist,'mink_dist':K_all_mink_dist}
    pickle.dump(K_all,open(path_to_save+'vect_kernels.pkl','wb'))
    print "done"
Example #11
def build_all():
    # For each class, we build all the trees and save them in CSVs
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)    
    
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative/')
    write_tree_in_csv(arg_trees) 
     
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    write_tree_in_csv(inf_trees) 
    
    des_trees = []
    #des_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    #write_tree_in_csv(des_trees) 
    
    
    # Note: contains pairs of (tree, tree_ID) where tree_ID is the name of the file.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0:'narrative', 1:'argumentative', 2:'informative',3:'descriptive'}

    path_to_save = '~/Documents/s2/tal/discourseAnalysis/data/'
    y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array( y_nar + y_arg + y_inf + y_des )
    pickle.dump(y,open(path_to_save+'labels_test.pkl','wb'))

    T = [t[0] for t in all_trees]
    pickle.dump(T,open(path_to_save+'trees_test.pkl','wb'))
    
    index = ['bin','count','norm','height','tfid']

    #Dicts
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    
    D_df = pd.DataFrame([D_bin,D_count,D_norm,D_height,D_tfid],index=index)
    D_df = D_df.transpose()
    D_df.to_pickle(path_to_save+'dicts_test.pkl')
    

    #Vects
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)

    V_all = np.zeros((len(index),V_bin.shape[0],V_bin.shape[1]))
    V_all = np.array([V_bin,V_count,V_norm,V_height,V_tfid])
    V_df = []
    for i in range(V_all.shape[1]):
        d = {}
        for j,v in enumerate(V_all[:,i]):
            d[index[j]]=v
        V_df.append(d)
    V_df = pd.DataFrame(V_df)
    V_df.to_pickle(path_to_save+'vects_test.pkl')
    
    #euclidean distance
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin,metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,metric='euclidean')
    K_all_eucl_dist = [K_bin_eucl_dist, K_count_eucl_dist, K_norm_eucl_dist, K_height_eucl_dist, K_tfid_eucl_dist]
    
    K_all = {'eucl_dist':K_all_eucl_dist}
    pickle.dump(K_all,open(path_to_save+'kernels_test.pkl','wb'))
Example #12
def onehot(csv):
    records = csv[gconfig['categorial_features']].to_dict(orient='records')
    dv = feature_extraction.DictVectorizer(separator='_', sparse=False)
    dv.fit(records)
    return dv
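Usage sketch, for illustration only (gconfig and the csv frame come from the surrounding module): with separator='_', string features are encoded as '<column>_<value>' instead of the default '<column>=<value>'.

from sklearn import feature_extraction

demo = feature_extraction.DictVectorizer(separator='_', sparse=False)
demo.fit([{'device': 'mobile'}, {'device': 'web'}])
print(demo.feature_names_)   # ['device_mobile', 'device_web']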