def run(train_csv, train_tensor, test_csv, test_tensor, metric='euclidean'):
    """Train a distance-based VectorModel on GLC19 occurrences and print scores.

    Parameters
    ----------
    train_csv : str
        Path to the training occurrences CSV (';'-separated, quoted fields).
    train_tensor : str
        Directory containing the environmental patch tensors for training points.
    test_csv : str
        Path to the test occurrences CSV (same layout as ``train_csv``).
    test_tensor : str
        Directory containing the environmental patch tensors for test points.
    metric : str, default 'euclidean'
        Distance metric forwarded to ``VectorModel``.

    Side effects
    ------------
    Prints the Top-30 score, the MRR score, and the classifier parameters.
    """
    print("K means\n")

    def _load_dataset(csv_path, patches_dir):
        # Read one occurrences CSV and return (X, y): environmental feature
        # matrix and the species-id label vector (505 distinct labels).
        df = pd.read_csv(csv_path, sep=';', header='infer',
                         quotechar='"', low_memory=True)
        df = df[['Longitude', 'Latitude', 'glc19SpId', 'scName']]\
            .dropna(axis=0, how='all')\
            .astype({'glc19SpId': 'int64'})
        # Target: species identifiers.
        target_df = df['glc19SpId']
        # Environmental features extracted from the patch tensors.
        env_df = build_environmental_data(df[['Latitude', 'Longitude']],
                                          patches_dir=patches_dir)
        return env_df.values, target_df.values

    # The train and test sets are built identically — one helper, two calls.
    X_train, y_train = _load_dataset(train_csv, train_tensor)
    X_test, y_test = _load_dataset(test_csv, test_tensor)

    # Standardize features to zero mean / unit variance, fitted on TRAIN only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # BUG FIX: the original predicted on raw, unscaled X_test while the model
    # was trained on standardized features; test data must be transformed
    # with the scaler fitted on the training set.
    X_test = scaler.transform(X_test)

    classifier = VectorModel(metric=metric)
    classifier.fit(X_train, y_train)

    # Evaluation.
    y_predicted = classifier.predict(X_test)
    print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
    print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
    print('Params:', classifier.get_params())
# NOTE(review): this fragment begins mid-expression — the head of the first
# statement (presumably `df = pd.read_csv(...)` followed by a column
# selection, as in run() elsewhere in this file) is missing from the visible
# source. Confirm against version control before running. Code tokens below
# are unchanged; only formatting and comments were restored.
.dropna(axis=0, how='all')\
    .astype({'glc19SpId': 'int64'})
# target pandas series of the species identifiers (there are 505 labels)
target_df = df['glc19SpId']
# correspondence table between ids and the species taxonomic names
# (Taxref names with year of discoverie)
taxonomic_names = pd.read_csv('../data/occurrences/taxaName_glc19SpId.csv',
                              sep=';', header='infer', quotechar='"',
                              low_memory=True)
# building the environmental data
env_df = build_environmental_data(df[['Latitude', 'Longitude']],
                                  patches_dir='example_envtensors')
X = env_df.values
y = target_df.values
# Standardize the features by removing the mean and scaling to unit variance
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split below — that leaks test statistics into training;
# consider fitting on X_train only. Left unchanged here.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Evaluate as the average accuracy on one train/split random sample:
print("Test nearest centroid model")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
classifier = NearestCentroidModel(metric='euclidean')
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
print('Params:', classifier.get_params())