b = Progbar(len(dataset)) for i, data in enumerate(dataset): tokens, casing, char, labels = data tokens = np.asarray([tokens]) casing = np.asarray([casing]) char = np.asarray([char]) pred = model.predict([tokens, casing, char], verbose=False)[0] pred = pred.argmax(axis=-1) #Predict the classes correctLabels.append(labels) predLabels.append(pred) b.update(i) b.update(i + 1) return predLabels, correctLabels trainSentences = readfile("data/train.txt") devSentences = readfile("data/valid.txt") testSentences = readfile("data/test.txt") trainSentences = addCharInformatioin(trainSentences) devSentences = addCharInformatioin(devSentences) testSentences = addCharInformatioin(testSentences) labelSet = set() words = {} for dataset in [trainSentences, devSentences, testSentences]: for sentence in dataset: for token, char, label in sentence: labelSet.add(label) words[token.lower()] = True
import os.path

from extract_all_words import extract_words
from candidate_retriever import generate_training_data

epochs = 100
training_data_path = "../data/ner_training_data.txt"
all_words_path = "../data/words.txt"
word_embedding_path = "../data/glove.6B.100d.txt"

# Only run the generators when their output files are missing.
# NOTE(review): presumably extract_words() writes all_words_path and
# generate_training_data() writes training_data_path -- confirm.
if not os.path.isfile(all_words_path):
    extract_words()
if not os.path.isfile(training_data_path):
    generate_training_data()

trainSentences = readfile(training_data_path)
trainSentences = addCharInformatioin(trainSentences)

## LOAD all words from train, test and dev
# Keys must be the word strings themselves. Iterating over
# enumerate(content) used (index, "word\n") tuples as keys, so no word
# lookup against `words` could ever succeed.
# NOTE(review): assumes one word per line in words.txt; case is left
# untouched -- confirm whether lookups expect lower-cased keys.
words = {}
with open(all_words_path, encoding="utf-8") as f:
    content = f.readlines()
for line in content:
    w = line.strip()
    if w:  # skip blank lines
        words[w] = True

# :: Create a mapping for the labels ::
label2Idx = {"I": 1, "O": 0}

# :: Read in word embeddings ::
b.update(i) b.update(i + 1) return predLabels, correctLabels # trainSentences = readfile("data/train.txt") # devSentences = readfile("data/valid.txt") # testSentences = readfile("data/test.txt") # trainSentences = readfile("data/ref_train.txt") # devSentences = readfile("data/ref_dev.txt") # testSentences = readfile("data/ref_test.txt") # trainSentences = readfile("data/WkAnno_output_train_data.txt") # devSentences = readfile("data/WkAnno_output_dev_data.txt") # testSentences = readfile("data/WkAnno_output_test_data.txt") trainSentences = readfile("training_data/train.txt") devSentences = readfile("training_data/dev.txt") testSentences = readfile("training_data/test.txt") trainSentences = addCharInformatioin(trainSentences) devSentences = addCharInformatioin(devSentences) testSentences = addCharInformatioin(testSentences) labelSet = set() words = {} for dataset in [trainSentences, devSentences, testSentences]: for sentence in dataset: for token, char, label in sentence: labelSet.add(label) words[token.lower()] = True
def loadData(self):
    """Read the train, dev and test files into instance attributes.

    Each file is passed to ``readfile`` and the result is stored on
    ``self.trainSentences``, ``self.devSentences`` and
    ``self.testSentences`` respectively.
    """
    # Attribute name -> file path, in the order they are loaded.
    splits = (
        ("trainSentences", "data/train.txt"),
        ("devSentences", "data/dev.txt"),
        ("testSentences", "data/test.txt"),
    )
    for attribute, path in splits:
        setattr(self, attribute, readfile(path))
# Command line: <script> <datasetName> <samplingMethod>
samplingMethod = sys.argv[2]  # e.g. "entropySampling"

# Name of the model... See models.py for details
modelName = "LSTM_word_char"

datasetName = sys.argv[1]  # e.g. "Cadec"

print(datasetName + " " + samplingMethod)

# Loading the dataset
if datasetName == "Twitter":
    trainSentences = readfileTwitter("twitter/TwitterTrainBIO.tsv")
    # The first tenth of the file stays in trainSentences; the remaining
    # nine tenths become learnSentences.
    splitPoint = len(trainSentences) // 10
    learnSentences = trainSentences[splitPoint:]
    trainSentences = trainSentences[:splitPoint]
    # NOTE(review): the test file is read with readfile(), not
    # readfileTwitter(), unlike every other .tsv here -- confirm intended.
    testSentences = readfile("twitter/TwitterTestBIO.tsv")
elif datasetName == "Medline":
    trainSentences = readfileTwitter("twitter/MedlineBIO.tsv")
    learnSentences = []
    testSentences = []
elif datasetName == "Cadec":
    trainSentences = readfileTwitter("twitter/CadecBIO.tsv")
    learnSentences = []
    testSentences = []
else:
    # Fail here with a clear message; otherwise the script dies a few
    # lines later with a NameError on the undefined trainSentences.
    raise ValueError(
        "Unknown dataset name %r; expected 'Twitter', 'Medline' or 'Cadec'"
        % (datasetName,)
    )

trainSentences = addCharInformatioin(trainSentences)
learnSentences = addCharInformatioin(learnSentences)
testSentences = addCharInformatioin(testSentences)
def make_dataset(file_name):
    """Return the sentences of *file_name* after character augmentation.

    The file is read with ``readfile`` and the result is passed through
    ``addCharInformatioin``; that function's return value is returned
    unchanged.
    """
    return addCharInformatioin(readfile(file_name))
predLabels = [] b = Progbar(len(dataset)) for i, data in enumerate(dataset): tokens, char, labels = data tokens = np.asarray([tokens]) char = np.asarray([char]) pred = model.predict([tokens, char], verbose=False)[0] pred = pred.argmax(axis=-1) #Predict the classes correctLabels.append(labels) predLabels.append(pred) b.update(i) return predLabels, correctLabels trainSentences = readfile("train_data.txt") testSentences = readfile("test_data.txt") #testSentences.pop(0) #trainSentences.pop(0) #trainSentences[0].pop(0) #testSentences[0].pop(0) trainSentences = addCharInformatioin(trainSentences) testSentences = addCharInformatioin(testSentences) labelSet = set() words = {} """for sentence in trainSentences: for token,char,label in sentence: labelSet.add(label)
def loadData(self):
    """Read the restaurant train and test files into instance attributes.

    Each file is passed to ``readfile`` and the result is stored on
    ``self.trainSentences`` and ``self.testSentences``. No dev split is
    loaded, so ``self.devSentences`` is not set here.
    """
    # Attribute name -> file path, in the order they are loaded.
    splits = (
        ("trainSentences", "data/NER-ABSA-16_Restaurants_Train.txt"),
        ("testSentences", "data/NER-ABSA-16_Restaurants_Test.txt"),
    )
    for attribute, path in splits:
        setattr(self, attribute, readfile(path))