class Pipeline(object):
    """Train a classifier on retrieved snippets and score it on validation data.

    Constructing an instance loads both JSON files and immediately runs the
    whole pipeline through ``question_answering``.
    """

    # Upper bound on the number of training questions that are used.
    MAX_TRAIN_QUESTIONS = 6000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstance, classifierInstance, predFilePath):
        """Store the collaborators, load the data and run the pipeline.

        Args:
            trainFilePath: path of the JSON file holding the training data.
            valFilePath: path of the JSON file holding the validation data.
            retrievalInstance: object providing ``getLongSnippets`` and
                ``getShortSnippets``.
            featurizerInstance: object providing
                ``getFeatureRepresentation``.
            classifierInstance: object providing ``buildClassifier``.
            predFilePath: path of the file the predictions are written to.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.predFile = predFilePath
        self.trainData = self._load_json(trainFilePath)
        self.valData = self._load_json(valFilePath)
        self.question_answering()

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON content of ``path``."""
        # The context manager closes the handle even when parsing fails.
        with open(path, 'r') as handle:
            return json.load(handle)

    def makeXY(self, dataQuestions):
        """Build the inputs and labels for a sequence of questions.

        Returns:
            A pair ``(X, Y)``: ``X`` holds the short snippets retrieved for
            each question, ``Y`` the first entry of each question's
            ``'answers'``.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used; the call is kept in case the
            # retrieval object relies on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def _write_predictions(self, Y_true, Y_pred):
        """Write one ``<flag> <predicted> <true>`` line per validation item.

        ``flag`` is ``1`` when the prediction equals the true answer and
        ``0`` otherwise.
        """
        # ``with`` guarantees the file is flushed and closed; it used to be
        # left open.
        with open(self.predFile, 'w') as predfile:
            for truth, guess in zip(Y_true, Y_pred):
                flag = "1" if truth == guess else "0"
                predfile.write(
                    flag + " " + str(guess) + " " + str(truth) + '\n')

    def question_answering(self):
        """Featurize, train, predict, write the predictions, print metrics."""
        # Not used below; the lookups are kept so that a training file
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']

        train_questions = self.trainData['questions']
        X_train, Y_train = self.makeXY(
            train_questions[0:self.MAX_TRAIN_QUESTIONS])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization
        X_features_train, X_features_val = (
            self.featurizerInstance.getFeatureRepresentation(X_train, X_val))
        self.clf = self.classifierInstance.buildClassifier(
            X_features_train, Y_train)

        # Prediction
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)

        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(self.predFile)
        self._write_predictions(Y_val_true, Y_val_pred)

        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))
class Pipeline(object):
    """Train a classifier on retrieved snippets and score it on validation data.

    Constructing an instance loads both JSON files and immediately runs the
    whole pipeline through ``question_answering``.
    """

    def __init__(self, trainFilePath, valFilePath, saveFilePath,
                 retrievalInstance, featurizerInstance, classifierInstance):
        """Store the collaborators, load the data and run the pipeline.

        Args:
            trainFilePath: path of the JSON file holding the training data.
            valFilePath: path of the JSON file holding the validation data.
            saveFilePath: path of the CSV file the (true, predicted) pairs
                are written to.
            retrievalInstance: object providing ``getLongSnippets`` and
                ``getShortSnippets``.
            featurizerInstance: object providing
                ``getFeatureRepresentation``.
            classifierInstance: object providing ``buildClassifier``.
        """
        self.saveFilePath = saveFilePath
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.trainData = self._load_json(trainFilePath)
        self.valData = self._load_json(valFilePath)
        self.question_answering()

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON content of ``path``."""
        # The context manager closes the handle even when parsing fails.
        with open(path, 'r') as handle:
            return json.load(handle)

    def makeXY(self, dataQuestions):
        """Build the inputs and labels for a sequence of questions.

        Returns:
            A pair ``(X, Y)``: ``X`` holds the short snippets retrieved for
            each question, ``Y`` the first entry of each question's
            ``'answers'``.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used; the call is kept in case the
            # retrieval object relies on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Featurize, train, predict, print metrics and save the predictions.

        Returns:
            The predictions produced by the classifier for the validation
            questions.
        """
        # Not used below; the lookups are kept so that a training file
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']

        X_train, Y_train = self.makeXY(self.trainData['questions'])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization
        X_features_train, X_features_val = (
            self.featurizerInstance.getFeatureRepresentation(X_train, X_val))
        self.clf = self.classifierInstance.buildClassifier(
            X_features_train, Y_train)

        # Prediction
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

        # NOTE(review): under Python 3 the csv documentation asks for the
        # file to be opened with newline=''; left unchanged because that
        # keyword is not accepted by Python 2's built-in open().
        with open(self.saveFilePath, 'w') as fout:
            # One row per validation question: true answer, prediction.
            csv.writer(fout).writerows(zip(Y_val_true, Y_val_pred))
        return Y_val_pred
class Pipeline(object):
    """Evaluate every featurizer/classifier combination on validation data.

    Constructing an instance loads both JSON files and immediately runs all
    combinations through ``question_answering``.
    """

    # Upper bound on the number of training questions that are used.
    MAX_TRAIN_QUESTIONS = 1000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstances, classifierInstances):
        """Store the collaborators, load the data and run the pipeline.

        Args:
            trainFilePath: path of the JSON file holding the training data.
            valFilePath: path of the JSON file holding the validation data.
            retrievalInstance: object providing ``getLongSnippets`` and
                ``getShortSnippets``.
            featurizerInstances: iterable of objects providing
                ``getFeatureRepresentation``.
            classifierInstances: iterable of objects providing
                ``buildClassifier``.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstances = featurizerInstances
        self.classifierInstances = classifierInstances
        self.trainData = self._load_json(trainFilePath)
        self.valData = self._load_json(valFilePath)
        self.question_answering()

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON content of ``path``."""
        # The context manager closes the handle even when parsing fails.
        with open(path, 'r') as handle:
            return json.load(handle)

    @staticmethod
    def _print_metrics(a, p, r, f):
        """Print accuracy, precision, recall and F-measure, one per line."""
        # Each line reports its own metric; previously all four lines
        # printed the accuracy.
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

    def makeXY(self, dataQuestions):
        """Build the inputs and labels for a sequence of questions.

        Returns:
            A pair ``(X, Y)``: ``X`` holds the short snippets retrieved for
            each question, ``Y`` the first entry of each question's
            ``'answers'``.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used; the call is kept in case the
            # retrieval object relies on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train and evaluate each featurizer/classifier pair in turn."""
        # Not used below; the lookups are kept so that a training file
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']

        train_questions = self.trainData['questions']
        X_train, Y_train = self.makeXY(
            train_questions[0:self.MAX_TRAIN_QUESTIONS])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        for featurizer in self.featurizerInstances:
            for classifier in self.classifierInstances:
                # Joined into one string so the line is identical whether
                # print is a statement (Python 2) or a function (Python 3).
                print(" ".join([
                    "Running pipeline with featurizer: ", str(featurizer),
                    " and classifier ", str(classifier)]))

                # Featurization
                X_features_train, X_features_val = (
                    featurizer.getFeatureRepresentation(X_train, X_val))
                self.clf = classifier.buildClassifier(
                    X_features_train, Y_train)

                # Prediction
                Y_val_pred = self.clf.predict(X_features_val)

                # Evaluation
                self.evaluatorInstance = Evaluator()
                a = self.evaluatorInstance.getAccuracy(
                    Y_val_true, Y_val_pred)
                p, r, f = self.evaluatorInstance.getPRF(
                    Y_val_true, Y_val_pred)
                self._print_metrics(a, p, r, f)
                print('\n')
class Pipeline(object):
    """Staged question-answering pipeline.

    Constructing an instance loads both JSON files, builds the train and
    validation sets and featurizes them. Training, prediction and scoring
    happen when ``qa`` is called.
    """

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstance, classifierInstance):
        """Store the collaborators, load the data and prepare the features.

        Args:
            trainFilePath: path of the JSON file holding the training data.
            valFilePath: path of the JSON file holding the validation data.
            retrievalInstance: object providing ``getLongSnippets`` and
                ``getShortSnippets``.
            featurizerInstance: object providing
                ``getFeatureRepresentation``.
            classifierInstance: object providing ``buildClassifier``.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.evaluatorInstance = Evaluator()

        self.trainData = self._load_json(trainFilePath)
        # Keep only the first N training questions; ``N`` is not defined in
        # this class and is expected to exist at module level.
        self.trainData['questions'] = self.trainData['questions'][0:N]
        self.valData = self._load_json(valFilePath)

        self.prepare_data()
        self.prepare_features()

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON content of ``path``."""
        # The context manager closes the handle even when parsing fails.
        with open(path, 'r') as handle:
            return json.load(handle)

    def makeXY(self, dataQuestions):
        """Build the inputs and labels for a sequence of questions.

        Returns:
            A pair ``(X, Y)``: ``X`` holds the short snippets retrieved for
            each question, ``Y`` the first entry of each question's
            ``'answers'``.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used; the call is kept in case the
            # retrieval object relies on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def get_data(self):
        """Return ``makeXY`` applied to the training questions."""
        # Not used below; the lookups are kept so that training data
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']
        return self.makeXY(self.trainData['questions'])

    def prepare_data(self):
        """Populate ``X_train``/``Y_train`` and ``X_val``/``Y_val_true``."""
        # Not used below; the lookups are kept so that training data
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']
        self.X_train, self.Y_train = self.makeXY(self.trainData['questions'])
        self.X_val, self.Y_val_true = self.makeXY(self.valData['questions'])

    def prepare_features(self):
        """Featurize the prepared train and validation inputs."""
        self.X_features_train, self.X_features_val = (
            self.featurizerInstance.getFeatureRepresentation(
                self.X_train, self.X_val))

    def qa(self):
        """Train the classifier, predict on validation data, print metrics."""
        self.clf = self.classifierInstance.buildClassifier(
            self.X_features_train, self.Y_train)

        # Prediction
        Y_val_pred = self.clf.predict(self.X_features_val)

        a = self.evaluatorInstance.getAccuracy(self.Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(self.Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))
class Pipeline(object):
    """Train a classifier on retrieved snippets and save a JSON result report.

    Constructing an instance loads both JSON files and immediately runs the
    whole pipeline through ``question_answering``.
    """

    # Upper bound on the number of training questions that are used
    # (original note: 31049 questions).
    MAX_TRAIN_QUESTIONS = 30000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstance, classifierInstance, resultsPATH):
        """Store the collaborators, load the data and run the pipeline.

        Args:
            trainFilePath: path of the JSON file holding the training data.
            valFilePath: path of the JSON file holding the validation data.
            retrievalInstance: object providing ``getLongSnippets`` and
                ``getShortSnippets``.
            featurizerInstance: object providing
                ``getFeatureRepresentation``.
            classifierInstance: object providing ``buildClassifier``.
            resultsPATH: directory the result file is written into.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.trainData = self._load_json(trainFilePath)
        self.valData = self._load_json(valFilePath)
        self.PATH = resultsPATH
        self.question_answering()

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON content of ``path``."""
        # The context manager closes the handle even when parsing fails.
        with open(path, 'r') as handle:
            return json.load(handle)

    def makeXY(self, dataQuestions):
        """Build the inputs and labels for a sequence of questions.

        Returns:
            A pair ``(X, Y)``: ``X`` holds the short snippets retrieved for
            each question, ``Y`` the first entry of each question's
            ``'answers'``.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used; the call is kept in case the
            # retrieval object relies on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def _save_results(self, results):
        """Dump ``results`` as JSON into the results directory.

        The file is named after the featurizer class followed by the
        classifier class, with no separator and no extension.
        """
        target = os.path.join(
            self.PATH,
            self.featurizerInstance.__class__.__name__
            + self.classifierInstance.__class__.__name__)
        # ``with`` guarantees the file is flushed and closed; it used to be
        # left open.
        with open(target, 'w', encoding='utf-8') as out:
            json.dump(results, out, ensure_ascii=False)

    def question_answering(self):
        """Featurize, train, predict, print metrics and save the results."""
        print('Loading data...')
        # Not used below; the lookups are kept so that a training file
        # lacking these keys still fails with KeyError.
        dataset_type = self.trainData['origin']
        candidate_answers = self.trainData['candidates']

        train_questions = self.trainData['questions']
        X_train, Y_train = self.makeXY(
            train_questions[0:self.MAX_TRAIN_QUESTIONS])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization
        print('Feature Extraction...')
        X_features_train, X_features_val = (
            self.featurizerInstance.getFeatureRepresentation(X_train, X_val))
        self.clf = self.classifierInstance.buildClassifier(
            X_features_train, Y_train)

        # Prediction
        print('Prediction...')
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

        # Save predictions in json
        self._save_results({
            'feature': self.featurizerInstance.__class__.__name__,
            'classifier': self.classifierInstance.__class__.__name__,
            'training size': len(X_train),
            'accuracy': a,
            'precision': p,
            'recall': r,
            'F-measure': f,
            # ``predict`` must return an object with ``tolist()``
            # (presumably a numpy array -- TODO confirm).
            'predictions': Y_val_pred.tolist(),
        })