# ---- CLI --------------------------------------------------------------
parser.add_argument(
    '--model-dir', type=str, default='../models', metavar='str',
    help='path where fitted model will be saved. Dataset name is added',
)
opt = parser.parse_args()

# A dataset name is mandatory; abort with a usage message otherwise.
if not opt.dataset:
    parser.error('Missing dataset name')
dataset_name = opt.dataset

# ---- Data -------------------------------------------------------------
dataset = Dataset.load(
    dataset_name=dataset_name,
    pickle_path=os.path.join(opt.pickle_dir, f'{dataset_name}.pickle'),
)
dataset.show()

# ---- Model family selection ------------------------------------------
# Single-label and multi-label classification use parallel APIs; choose the
# matching (args class, model class, confusion-matrix function) triple.
singlelabel = dataset.classification_type == 'singlelabel'
if singlelabel:
    ModelArgsClass, ModelClass = ClassificationArgs, ClassificationModel
    confusion_function = confusion_matrix
else:
    ModelArgsClass, ModelClass = MultiLabelClassificationArgs, MultiLabelClassificationModel
    confusion_function = multilabel_confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, make_scorer


def classify_and_test(X, y, Xte, yte, average=1):
    """Fit a linear SVM on (X, y), predict Xte, and print the F1 score.

    When ``average > 1``, each consecutive group of ``average`` predictions
    is averaged and thresholded at 0.5 (a majority vote over the group)
    before scoring against ``yte``.
    """
    clf = LinearSVC()
    clf.fit(X, y)
    preds = clf.predict(Xte)
    if average > 1:
        # Collapse every group of `average` predictions into a single vote.
        preds = preds.reshape(-1, average).mean(axis=1) > 0.5
    score = f1_score(yte, preds)
    print(f'f1={score:.3f}')


# ---- Load the Reuters-21578 train/test split --------------------------
dataset = Dataset.load('reuters21578', pickle_path='./reuters.pickle')
Xtr, Xte = dataset.devel_raw, dataset.test_raw
ytr, yte = dataset.devel_target, dataset.test_target

# Generate the co-occurrence (term-count) matrices.
counter = CountVectorizer(min_df=5)
Xtr = counter.fit_transform(Xtr)
Xte = counter.transform(Xte)
train_nwords = Xtr.sum(axis=1).getA().flatten()
test_nwords = Xte.sum(axis=1).getA().flatten()

# Generate the tf-idf matrices.
tfidf = TfidfTransformer()
Xtr = tfidf.fit_transform(Xtr)
Xte = tfidf.transform(Xte)