def main(outDir):
    """Pair events with non-events to build realis training data.

    Vectorizes the train/dev/test splits with a 15-token window of word,
    entity, and trigger-distance embeddings, writes each split under
    ``outDir`` and pickles a summary map to ``info.p``.
    """
    print("Building Converters")

    entTrain = "data/entities_training.csv"
    entDev = "data/entities_dev.csv"
    entTest = "data/entities_testing.csv"

    # entities from all three splits are needed to build the embedding lookup
    entities = (readEntities(entTrain) + readEntities(entDev)
                + readEntities(entTest))

    # pre-built token/entity -> index maps; use `with` so the handles are
    # closed (the original leaked them via load(open(...)))
    with open("data/word_index.p") as indexFile:
        wordIndex = load(indexFile)
    with open("data/entity_map.p") as entityFile:
        entityIndex = load(entityFile)

    # 15-token window of word, entity, and trigger-distance embeddings
    leftConverter = v.WindowFeats([
        v.WordEmbeddingFeats(wordIndex),
        v.EntityEmbeddingFeats(entityIndex, entities),
        v.DistanceEmbeddingFeats()
    ], 15)

    # this experiment uses no context ("right side") features
    rightConverters = []

    mkdir(outDir)

    # vectorize each split and write it to disk
    print("Read training")
    trainingEvents = writeWindow(c.dataPath, c.trainingFile, leftConverter,
        join(outDir, "training_{}.p"))
    print("Read dev")
    devEvents = writeWindow(c.dataPath, c.devFile, leftConverter,
        join(outDir, "dev_{}.p"))
    print("Read testing")
    testEvents = writeWindow(c.dataPath, c.testFile, leftConverter,
        join(outDir, "test_{}.p"))

    # summary map: event objects per split plus a description of the
    # converters used, for provenance
    data = {
        "train_events": trainingEvents,
        "dev_events": devEvents,
        "test_events": testEvents,
        "info": "\n".join(map(str, [leftConverter] + rightConverters))
    }
    with open(join(outDir, "info.p"), "w") as out:
        dump(data, out)
def crossTrain(data, labels, outDir, modelsPer, validator, params):
    """Cross-validation based training.

    For each fold produced by ``validator`` trains up to ``modelsPer``
    models (via trainOnFold) and keeps the best model per fold.  Folds
    whose output directory already exists are skipped, which allows a
    partial run to be resumed.

    Returns a list of the best model from each completed fold; stops
    early after ``params.limit`` folds when that is set.
    """
    completed = 0
    results = []

    # context manager ensures the log is closed even on the early
    # params.limit return below (the original leaked the handle there)
    with open(join(outDir, "log.txt"), "a") as out:
        # for each fold split the data and train models keeping the best
        for i, partition in enumerate(
                validator.partition(data, params.eventMap.matrixToIndex(labels))):
            # stop once the configured number of folds is done
            if params.limit and completed == params.limit:
                return results

            print("Fold {}".format(i))

            # create the output directory for this fold
            foldDir = join(outDir, "fold{}".format(i))
            if not os.access(foldDir, os.F_OK):
                mkdir(foldDir)

                print("Training Size {}, Dev Size {}".format(
                    len(partition[0]), len(partition[1])))

                # train models on this fold
                bestScore, rnd, epoch, best = trainOnFold(data, labels, foldDir,
                    modelsPer, partition, params)

                print("Best Score {}".format(bestScore))
                out.write("Round {}, Epoch {}, Score {}\n".format(
                    rnd, epoch, bestScore))

                # flush through the OS so a crash doesn't lose the log line
                out.flush()
                os.fsync(out)

                results.append(best)
                completed += 1
            else:
                # resuming a partial run: skip folds that already have output
                print("Fold {} already exists".format(i))

    return results
def main(outDir):
    """Create a tensor of sequences for sequence-tagging experiments.

    Vectorizes the train/dev/test splits as tagged sequences using
    word2vec plus doc/sentence embeddings, writes each split under
    ``outDir`` and pickles a summary map to ``info.p``.
    """
    # joint event/entity tag inventory (the original also assigned
    # EVENT_MAP and a duplicated ENTITY_MAP that were never used)
    EVENT_ENTITY_MAP = "data/entity_event_map.p"

    d2vPath = "data/vectors/doc2vec/ace/doc_embeddings.txt"
    s2vPath = "data/vectors/doc2vec/ace/sent_embeddings.txt"
    w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"

    mkdir(outDir)

    # word-level converter backed by the word2vec model
    w2vModel = v.loadW2V(w2vPath)
    wordConv = v.Word2VecFeats(w2vModel, 0)

    # document/sentence level context converters
    converters = [v.Doc2VecFeats(d2vPath), v.Sentence2VecFeats(s2vPath)]

    # make the event map; `with` closes the handle (original leaked it)
    with open(EVENT_ENTITY_MAP) as mapFile:
        eventMap = load(mapFile)

    # vectorize each split and write it to disk
    print("Read training")
    trainingEvents, trainingTags = writeSequences(c.dataPath, c.trainingFile,
        c.trainingEnts, converters, wordConv, eventMap,
        join(outDir, "training_{}.p"))
    print("Read dev")
    devEvents, devTags = writeSequences(c.dataPath, c.devFile, c.devEnts,
        converters, wordConv, eventMap, join(outDir, "dev_{}.p"))
    print("Read testing")
    testEvents, testTags = writeSequences(c.dataPath, c.testFile, c.testEnts,
        converters, wordConv, eventMap, join(outDir, "test_{}.p"))

    # summary map with events, tags, and converter descriptions
    data = {
        "train_events": trainingEvents,
        "dev_events": devEvents,
        "test_events": testEvents,
        "info": "\n".join(map(str, converters)),
        "train_tags": trainingTags,
        "dev_tags": devTags,
        "test_tags": testTags
    }
    with open(join(outDir, "info.p"), "w") as infoOut:
        dump(data, infoOut)
def trainOnFold(data, labels, outDir, numModels, partition, params):
    """Train several models on one fold and return the best.

    Returns the (score, model index, epoch index, model) tuple with the
    highest dev-partition F1 among the ``numModels`` candidates.
    """
    trainPart, devPart = partition

    # slice out this fold's training and dev subsets
    trainX, trainY = partitionData(data, labels, trainPart)
    devX, devY = partitionData(data, labels, devPart)

    candidates = []

    for modelId in range(numModels):
        # each candidate model gets its own output directory and logger
        modelDir = join(outDir, str(modelId))
        mkdir(modelDir)
        foldLogger = makeLogger(modelDir, params.eventMap)

        # train one candidate
        trained, epochIdx = trainModel(trainX, trainY, devX, devY,
                                       foldLogger, params)

        # score it by F1 on the dev partition
        devPred = predictClasses(trained, devX, params.batchSize)
        devNames = params.eventMap.matrixToNames(devY)
        f1 = evaluatePredictions(devPred, devNames, params.eventMap, False)

        candidates.append((f1, modelId, epochIdx, trained))

        # release backend graph state before building the next model
        b.clear_session()

    # tuple comparison: highest score wins, model id breaks ties
    return max(candidates)
def __init__(self, path, expName, expComment, params, redirect=True, groupId=None, noLog=False):
    """Set up an experiment record.

    Stores the run's metadata, and unless ``noLog`` is set: registers the
    experiment (with a fresh UUID) in the SQLite db ``expmeta.db`` under
    ``path``, logs the params, creates the output directory, writes a
    metadata file, and optionally redirects stdout/stderr.

    path       -- root directory holding expmeta.db and output dirs
    expName    -- human-readable experiment name
    expComment -- free-form comment stored with the run
    params     -- parameter object recorded via logParams
    redirect   -- when True, redirect the output streams (see redirectStreams)
    groupId    -- optional id grouping related experiments
    noLog      -- when True, skip all database/file logging
    """
    self.path = path
    self.name = expName
    self.comment = expComment
    self.params = params
    # remember the real streams so they can be restored later
    self.oldStreams = (sys.stdout, sys.stderr)
    self.loggingStreams = None
    self.timeStart = None
    self.timeEnd = None
    self.groupId = groupId
    self.noLogging = noLog
    if not noLog:
        # make the connection
        self.conn = sql.connect(join(path, "expmeta.db"))
        # make sure the schema is setup
        initSchema(self.conn)
        # generate a UUID
        self.expId = str(uuid4())
        # make an entry for the experiment
        self.createExp(expName, expComment)
        # log the params
        self.logParams(params)
        # make sure the output dirs exist
        mkdir(self.directory())
        # write the meta data to a file
        self.writeMeta(expName, expComment, params)
        # redirect the streams
        # NOTE(review): placed inside the noLog guard since redirection
        # presumably targets the experiment's output dir — confirm intent
        if redirect:
            self.redirectStreams()
def main(args):
    """Run and evaluate the event-detection model.

    Loads the vectorized data, builds a CNN (embedding or dense-input
    variant depending on flags), trains with class weighting, and prints
    train/dev (and optionally test) evaluation scores.
    """
    # fixed seed for reproducibility of this run configuration
    n.random.seed(16)

    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    useBothHalves = args.full
    useEmb = args.emb or args.full

    if args.o:
        mkdir(args.o)

    # unpack the data; embedding models take index sequences instead of
    # dense vectors
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    # make the event map; `with` closes the handle (original leaked it)
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)
    testingLabels = eventMap.namesToMatrix(rawTestingLabels)

    # sequence input shape differs between embedding and dense inputs
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, dim) = trainData[0].shape
    print(trainData[0].shape)

    # figure out the context ("right side") feature width
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    else:
        if useBothHalves:
            rightSamples = trainData[0].shape[0]
            (_, contextDim) = trainData[-1].shape
        else:
            (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))
    print("labels shape {}".format(trainingLabels.shape))

    print("Building the model")

    # get the model
    if useEmb:
        w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
        indexPath = "data/word_index.p"

        # load the realis data and append it as an extra input
        if args.realis:
            realisData = loadRealisData(args.realis)
            trainData += [realisData[0]]
            devData += [realisData[1]]
            testData += [realisData[2]]
            (_, contextDim) = realisData[0].shape

        # load the pretrained word vectors
        w2v = loadW2V(w2vPath)
        # load the word index (`with` closes the handle)
        with open(indexPath) as indexFile:
            wordIndex = load(indexFile)
        # make the initial embedding-layer weights
        initWeights = makeEmbeddingWeights(w2v, wordIndex)

        if args.full or args.realis:
            model = buildMultiEmbModel(len(eventMap), seqLen, contextDim,
                initWeights, eventMap)
        else:
            model = buildCNNEmbModel(len(eventMap), seqLen, contextDim,
                initWeights, eventMap)
    else:
        model = buildCNNModel(len(eventMap), seqLen, dim, contextDim, eventMap)

    # train the model
    print("Training the model")

    # hard-coded class weights: non-nil classes are rare so they are
    # upweighted relative to the nil class
    weights = defaultdict(lambda: 5.5)
    weights[eventMap.nilIndex()] = 1.0

    # make the logger that tracks the best epoch
    logger = makeLogger(args.o, eventMap)

    model.fit(trainData, trainingLabels, nb_epoch=args.e, batch_size=args.b,
        validation_data=(devData, devLabels), callbacks=[logger],
        class_weight=weights)

    # get the best model seen during training
    best = logger.best()
    print("Best Model round: {} val: {}".format(logger.bestModel,
        logger.bestScore))

    print("Make Predictions")
    # make predictions on train and dev
    trainPred = predictClasses(best, trainData, args.b)
    devPred = predictClasses(best, devData, args.b)

    print("\nEvalutation")
    # evaluate the model
    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)
    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    # only score the test split when explicitly requested
    if args.t:
        testPred = predictClasses(best, testData, args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)
def main(args):
    """Make predictions using a previously trained model.

    Loads the best saved model from ``args.a``, builds an output model
    that exposes the penultimate (embedding) layer, and writes both the
    raw event embeddings and the class-probability embeddings for the
    train/dev/test splits as CSV files under ``args.a``.
    """
    useBothHalves = args.full
    useEmb = args.emb or args.full

    print("Reading Data")
    dataDict = loadData(args.f, args.s)

    # unpack the data; embedding models take index sequences instead of
    # dense vectors
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    # sequence input shape differs between embedding and dense inputs
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, dim) = trainData[0].shape
    print(trainData[0].shape)

    # figure out the context ("right side") feature width
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    else:
        if useBothHalves:
            rightSamples = trainData[0].shape[0]
            (_, contextDim) = trainData[-1].shape
        else:
            (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))

    # load the realis data and append it as an extra input
    if args.realis:
        realisData = loadRealisData(args.realis)
        trainData += [realisData[0]]
        devData += [realisData[1]]
        testData += [realisData[2]]
        (_, contextDim) = realisData[0].shape

    # load the event map; `with` closes the handle (original leaked it)
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    # load the best saved model
    model = loadBest([args.a], eventMap)[0]
    model.summary()

    eventOut = "eventOut"
    eventProbOut = "eventProbOut"
    mkdir(join(args.a, eventOut))
    mkdir(join(args.a, eventProbOut))

    # per-split output file names under the given subdirectory
    makeNames = lambda p: [
        join(args.a, p, i)
        for i in ["training_pred.csv", "dev_pred.csv", "test_pred.csv"]
    ]

    # model exposing the embedding layer rather than the softmax output
    outModel = buildCNNEmbOutput(len(eventMap), seqLen, contextDim, eventMap,
        model, len(args.realis) > 0)

    eventEmb = predictEventEmb(outModel, [trainData, devData, testData])
    eventProbEmb = predictEventEmb(model, [trainData, devData, testData])

    # load event info and write both embedding variants
    eventInfo = loadEvents(args.f)
    writeEventEmb(eventEmb, makeNames(eventOut), eventInfo)
    writeEventEmb(eventProbEmb, makeNames(eventProbOut), eventInfo)
def main(args):
    """Run and evaluate the realis model.

    Trains a binary realis classifier over embedded inputs, prints
    train/dev (and optionally test) scores, and can write the embedded
    layer's activations for downstream use.
    """
    # fixed seed for reproducibility of this run configuration
    n.random.seed(16)

    print("Reading the data")
    dataDict = loadData(args.f)

    if args.o:
        mkdir(args.o)

    # unpack the data (always embedding-style inputs for this model)
    trainData = setupEmbeddings(dataDict["train_x"])
    devData = setupEmbeddings(dataDict["dev_x"])
    testData = setupEmbeddings(dataDict["test_x"])

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    # make the event map; `with` closes the handle (original leaked it)
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)
    testingLabels = eventMap.namesToMatrix(rawTestingLabels)

    (samples, seqLen) = trainData[0].shape
    print(trainData[0].shape)

    (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))
    print("labels shape {}".format(trainingLabels.shape))

    print("Building the model")

    w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
    indexPath = "data/word_index.p"

    # load the pretrained word vectors
    w2v = loadW2V(w2vPath)
    # load the word index (`with` closes the handle)
    with open(indexPath) as indexFile:
        wordIndex = load(indexFile)
    # make the initial embedding-layer weights
    initWeights = makeEmbeddingWeights(w2v, wordIndex)

    if args.full:
        model = buildMultiEmbModel(len(eventMap), seqLen, contextDim,
            initWeights, eventMap)
    else:
        model = buildCNNEmbModel(len(eventMap), seqLen, contextDim,
            initWeights, eventMap)

    # train the model
    print("Training the model")

    # hard-coded class weights for the binary realis task: upweight the
    # positive class
    weights = {0: 1.0, 1: 6.0}

    # make the logger that tracks the best epoch
    logger = makeLogger(args.o, eventMap)

    model.fit(trainData, trainingLabels, nb_epoch=args.e, batch_size=args.b,
        validation_data=(devData, devLabels), callbacks=[logger],
        class_weight=weights)

    # get the best model seen during training
    best = logger.best()
    print("Best Model round: {} val: {}".format(logger.bestModel,
        logger.bestScore))

    print("Make Predictions")
    # make predictions on train and dev
    trainPred = predictClasses(best, trainData, args.b)
    devPred = predictClasses(best, devData, args.b)

    print("\nEvalutation")
    # evaluate the model
    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)
    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    # only score the test split when explicitly requested
    if args.t:
        testPred = predictClasses(best, testData, args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)

    # output the embedded layer (not supported for the full model)
    if args.out and not args.full:
        realisOut = "realisOut"
        realisProbOut = "realisProbOut"

        # per-split output file names under the given subdirectory
        makeNames = lambda p: [
            join(args.o, p, i)
            for i in ["training_pred", "dev_pred", "test_pred"]
        ]

        # model exposing the embedding layer rather than the softmax output
        outModel = buildCNNEmbOutput(len(eventMap), seqLen, contextDim,
            eventMap, best)

        # realis-layer predictions
        realis = predictRealis(outModel, [trainData, devData, testData])
        realisPaths = makeNames(realisOut)
        mkdir(join(args.o, realisOut))

        # realis probability predictions
        realisProb = predictRealis(best, [trainData, devData, testData])
        realisProbPaths = makeNames(realisProbOut)
        mkdir(join(args.o, realisProbOut))

        writeRealis(padRealis(args.eventPath, args.f, realis), realisPaths)
        writeRealis(padRealis(args.eventPath, args.f, realisProb),
            realisProbPaths)
def main(args):
    """ Runs and evaluates the model """
    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    useBothHalves = args.full
    useEmb = args.emb or args.full
    # unpack the data; embedding models take index sequences instead of
    # dense vectors
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]
    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]
    # make the event map
    eventMap = load(open(args.m))
    params = Parameters(eventMap)
    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)
    #testingLabels = eventMap.namesToMatrix(rawTestingLabels)
    # optionally fold the dev split into the training data
    if args.dev:
        data, labels = joinDev(trainData, trainingLabels, devData, devLabels)
    else:
        data = trainData
        labels = trainingLabels
    # record the run configuration on the params object
    params.emb = args.emb
    params.useBothHalves = useBothHalves
    params.samples = data[0].shape[0]
    params.windowSize = data[0].shape[1]
    params.batchSize = args.b
    params.epochs = args.e
    params.split = args.s
    params.limit = args.limit
    if useEmb:
        w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
        indexPath = "data/word_index.p"
        # load the weights
        w2v = loadW2V(w2vPath)
        # load the index
        wordIndex = load(open(indexPath))
        # make the initial weights
        params.wordWeights = makeEmbeddingWeights(w2v, wordIndex)
    else:
        params.wordSize = data[0].shape[2]
    # context ("right side") feature width depends on the input layout
    if args.s:
        params.contextSize = data[2].shape[1]
    else:
        if useBothHalves:
            params.contextSize = data[-1].shape[1]
        else:
            params.contextSize = data[1].shape[1]
    print("Training")
    # pick the cross-validation strategy from the flags
    if args.std:
        print("Standard Cross Validation")
        validator = StandardSplitter(args.c)
    elif args.strat:
        print("Stratified Cross Validation")
        validator = StratifiedSplitter(args.c)
    else:
        print("Random Cross Validation")
        validator = RandomSplitter(args.p, args.c)
    mkdir(args.o)
    models = crossTrain(data, labels, args.o, args.k, validator, params)
    print("Make Predictions")
    # NOTE(review): the triple quote below opens a string that comments out
    # the remainder of this script; its closing quote lies beyond this view
    """
def main(outDir):
    """Prepare window-vectorized data per the config and pickle it.

    Vectorizes the train/dev/test splits with a 15-token window of word,
    entity, and trigger-distance embeddings plus doc/sentence context
    converters, writes each split under ``outDir`` and pickles a summary
    map to ``info.p``.
    """
    print("Building Converters")

    d2vPath = "data/vectors/doc2vec/ace/doc_embeddings.txt"
    s2vPath = "data/vectors/doc2vec/ace/sent_embeddings.txt"

    entTrain = "data/entities_training.csv"
    entDev = "data/entities_dev.csv"
    entTest = "data/entities_testing.csv"

    # entities from all three splits are needed to build the embedding lookup
    entities = (readEntities(entTrain) + readEntities(entDev)
                + readEntities(entTest))

    # pre-built token/entity -> index maps; use `with` so the handles are
    # closed (the original leaked them via load(open(...)))
    with open("data/word_index.p") as indexFile:
        wordIndex = load(indexFile)
    with open("data/entity_map.p") as entityFile:
        entityIndex = load(entityFile)

    # 15-token window of word, entity, and trigger-distance embeddings
    leftConverter = v.WindowFeats([
        v.WordEmbeddingFeats(wordIndex),
        v.EntityEmbeddingFeats(entityIndex, entities),
        v.DistanceEmbeddingFeats()
    ], 15)

    # document/sentence level context ("right side") converters
    rightConverters = [v.Doc2VecFeats(d2vPath), v.Sentence2VecFeats(s2vPath)]

    mkdir(outDir)

    # vectorize each split and write it to disk
    print("Read training")
    trainingEvents = writeWindow(c.dataPath, c.trainingFile, rightConverters,
        leftConverter, join(outDir, "training_{}.p"))
    print("Read dev")
    devEvents = writeWindow(c.dataPath, c.devFile, rightConverters,
        leftConverter, join(outDir, "dev_{}.p"))
    print("Read testing")
    testEvents = writeWindow(c.dataPath, c.testFile, rightConverters,
        leftConverter, join(outDir, "test_{}.p"))

    # summary map: event objects per split plus a description of the
    # converters used, for provenance
    data = {
        "train_events": trainingEvents,
        "dev_events": devEvents,
        "test_events": testEvents,
        "info": "\n".join(map(str, [leftConverter] + rightConverters))
    }
    with open(join(outDir, "info.p"), "w") as out:
        dump(data, out)