예제 #1
0
def test_multilabelclassifier():
    """Check kindred.MultiLabelClassifier on two synthetic binary labels.

    1000 two-dimensional points are drawn with a fixed seed. Label ``k`` of a
    point is 1 when its ``k``-th coordinate is below 0.5 and 0 otherwise. The
    first half of the points is used for fitting and the second half for
    evaluation. Both ``predict`` and ``predict_proba`` must return arrays of
    the same shape as the label matrix, and their root-mean-square errors
    against the true labels are pinned to three decimals.
    """
    np.random.seed(1)

    sample_count = 1000
    half = sample_count // 2
    features = np.random.rand(sample_count, 2)

    # One 0/1 column per coordinate: 1 where the coordinate is below 0.5
    first_label = (features[:, 0] < 0.5).astype(int)
    second_label = (features[:, 1] < 0.5).astype(int)
    labels = np.column_stack((first_label, second_label))

    # First half trains the model, second half is held out for evaluation
    train_features, test_features = features[:half, :], features[half:, :]
    train_labels, test_labels = labels[:half, :], labels[half:, :]

    # Remaining keyword arguments are given alongside the estimator class
    model = kindred.MultiLabelClassifier(
        sklearn.linear_model.LogisticRegression,
        random_state=1,
        solver='lbfgs')
    model.fit(train_features, train_labels)

    # Hard predictions
    hard_output = model.predict(test_features)
    assert hard_output.shape == test_labels.shape
    hard_error = np.sqrt(np.mean((hard_output - test_labels)**2))
    assert round(hard_error, 3) == 0.071

    # Probability outputs
    soft_output = model.predict_proba(test_features)
    assert soft_output.shape == test_labels.shape
    soft_error = np.sqrt(np.mean((soft_output - test_labels)**2))
    assert round(soft_error, 3) == 0.202
예제 #2
0
	def train(self,corpus):
		"""
		Trains the classifier using this corpus. All relations in the corpus will be used for training.

		The corpus is parsed first if it is not marked as parsed. Candidate relations are then built from it, a 0/1 label matrix is created with one column per distinct (relation type, argument names...) key seen on the candidates, the candidates are vectorized and the classifier selected by classifierType (and threshold) is fitted. On success isTrained is set to True.

		:param corpus: Corpus to use for training
		:type corpus: kindred.Corpus
		:raises RuntimeError: If no candidate relations are found in the corpus, or if classifierType is not 'SVM' or 'LogisticRegression'
		:raises AssertionError: If corpus is not a kindred.Corpus, if no known relation in the corpus has entityCount entities, or if the label matrix contains no positive or no negative entries
		"""
		assert isinstance(corpus,kindred.Corpus)

		if not corpus.parsed:
			parser = kindred.Parser(model=self.model)
			parser.parse(corpus)
		
		self.candidateBuilder = CandidateBuilder(entityCount=self.entityCount,acceptedEntityTypes=self.acceptedEntityTypes)
		candidateRelations = self.candidateBuilder.build(corpus)

		if len(candidateRelations) == 0:
			raise RuntimeError("No candidate relations found in corpus for training. Does the corpus contain text and entity annotations with at least one sentence containing %d entities." % (self.entityCount))

		# Compute the relation keys of every candidate once, so that the set of columns
		# and the label matrix below are both derived from the same values
		keysPerCandidate = []
		for cr in candidateRelations:
			assert isinstance(cr,kindred.CandidateRelation)
			keysPerCandidate.append([ tuple([knownType] + knownArgNames) for knownType,knownArgNames in cr.knownTypesAndArgNames ])

		candidateRelationKeys = set()
		for relKeys in keysPerCandidate:
			candidateRelationKeys.update(relKeys)
		
		# Create mappings from the class index to a relation type and back again
		self.colToRelType = sorted(candidateRelationKeys)
		self.relTypeToCol = { relationType:i for i,relationType in enumerate(self.colToRelType) }
		
		# Y[i,col] is 1 when candidate i is annotated with the relation key of column col
		Y = np.zeros((len(candidateRelations),len(self.colToRelType)),np.int32)
		for i,relKeys in enumerate(keysPerCandidate):
			for relKey in relKeys:
				Y[i,self.relTypeToCol[relKey]] = 1

		# The known relations must include at least one with the expected number of entities
		entityCountsInRelations = sorted({ len(r.entities) for r in corpus.getRelations() })
		assert self.entityCount in entityCountsInRelations, "Relation classifier is expecting to train on relations with %d entities (entityCount=%d). But the known relations in the corpus contain relations with the following entity counts: %s. Perhaps the entityCount parameter should be changed or there is a problem with the training corpus." % (self.entityCount,self.entityCount,str(entityCountsInRelations))

		# Record, for every relation key, the entity type tuples it was annotated with
		self.relTypeToValidEntityTypes = defaultdict(set)

		for d in corpus.documents:
			for r in d.relations:
				validEntityTypes = tuple([ e.entityType for e in r.entities ])
				
				relKey = tuple([r.relationType] + r.argNames)
				self.relTypeToValidEntityTypes[relKey].add(validEntityTypes)

		self.vectorizer = Vectorizer(entityCount=self.entityCount,featureChoice=self.chosenFeatures,tfidf=self.tfidf)
		trainVectors = self.vectorizer.fit_transform(candidateRelations)
	
		assert trainVectors.shape[0] == Y.shape[0]

		# Counts are taken over every cell of the label matrix, not per candidate
		posCount = Y.sum()
		negCount = Y.shape[0]*Y.shape[1] - posCount

		assert negCount > 0, "Must have at least one negative candidate relation in set for training"
		assert posCount > 0, "Must have at least one positive candidate relation in set for training"

		self.clf = None
		if self.classifierType == 'SVM':
			self.clf = kindred.MultiLabelClassifier(svm.LinearSVC,class_weight='balanced',random_state=1,max_iter=10000)
		elif self.classifierType == 'LogisticRegression':
			if self.threshold is None:
				self.clf = kindred.MultiLabelClassifier(LogisticRegression,class_weight='balanced',random_state=1,solver='liblinear',multi_class='ovr')
			else:
				self.clf = kindred.MultiLabelClassifier(kindred.LogisticRegressionWithThreshold,threshold=self.threshold)
		else:
			# Fail with a clear message instead of an AttributeError on None below
			raise RuntimeError("Unknown classifierType (%s). Expected 'SVM' or 'LogisticRegression'" % str(self.classifierType))
		
		self.clf.fit(trainVectors,Y)
		
		self.isTrained = True