def splitData(dataFile, test_size):
    """Load a tab-separated ratings file and split it into train/test parts.

    The file is read with ``pd.read_csv`` using a tab separator and the
    column names ``user_id``, ``item_id``, ``rating`` and ``timestamp``
    (in that order). The number of distinct users and items is printed,
    followed by the sizes of the two resulting partitions.

    Args:
        dataFile: Input handed straight to ``pd.read_csv``.
        test_size: Forwarded unchanged as ``test_size`` to
            ``cv.train_test_split``.

    Returns:
        A 5-tuple ``(df, n_users, n_items, train_data, test_data)``: the
        full frame, the count of distinct ``user_id`` values, the count of
        distinct ``item_id`` values, and the two parts produced by
        ``cv.train_test_split``.
    """
    # Load the data set.
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings = pd.read_csv(dataFile, sep='\t', names=column_names)

    # Count distinct ids; ``unique()`` keeps one entry per distinct value.
    user_count = len(ratings['user_id'].unique())
    item_count = len(ratings['item_id'].unique())
    print('Number of users = {} | Number of movies = {}'.format(
        user_count, item_count))

    # NOTE(review): ``cv`` is a module alias imported elsewhere in the file;
    # presumably scikit-learn's splitting module -- confirm against imports.
    train_part, test_part = cv.train_test_split(ratings, test_size=test_size)
    print("数据量:", len(train_part), len(test_part))

    return ratings, user_count, item_count, train_part, test_part
X_encoded[:, i] = X[:, i] else: label_encoder.append(preprocessing.LabelEncoder()) X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i]) X = X_encoded[:, :-1].astype(int) y = X_encoded[:, -1].astype(int) # Create SVM classifier classifier = OneVsOneClassifier(LinearSVC(random_state=0)) # Train the classifier classifier.fit(X, y) # Cross validation X_train, X_test, y_train, y_test = train_test_split.train_test_split(X, y, test_size=0.2, random_state=5) classifier = OneVsOneClassifier(LinearSVC(random_state=0)) classifier.fit(X_train, y_train) y_test_pred = classifier.predict(X_test) # Compute the F1 score of the SVM classifier f1 = train_test_split.cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3) print("F1 score: " + str(round(100*f1.mean(), 2)) + "%") # Predict output for a test datapoint input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married', 'Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40', 'United-States'] # Encode test datapoint input_data_encoded = [-1] * len(input_data) count = 0 for i, item in enumerate(input_data):