def predict(self, rawFeatures, labelsToAppend):
    sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)
    predictionMatrix = []
    # Position 0 gets a fixed [1, 0] column: it is treated as never removed.
    predictionMatrix.append([[1, 0]] * len(sentenceFeatures))
    sentences = []
    for i in range(len(sentenceFeatures)):
        sentence = Sentence(None, sentenceFeatures[i])
        if labelsToAppend is not None:
            sentence.addAdditionalFeatures(labelsToAppend[i])
        sentences.append(sentence)
    for i in range(1, self.sentenceLength):
        clf = self.loadPositionModel(i)
        prediction = clf.predict_proba(
            [sentence.getAllFeatures() for sentence in sentences])
        # Normalize any bare integer labels into [p, 1 - p] probability pairs.
        for a in range(len(prediction)):
            if isinstance(prediction[a], int):
                prediction[a] = [prediction[a], 1 - prediction[a]]
        predictionMatrix.append(prediction)
    # Transpose from per-position columns to per-sentence rows.
    transposed = []
    for i in range(len(predictionMatrix[0])):
        predictionSet = []
        for j in range(len(predictionMatrix)):
            predictionSet.append(predictionMatrix[j][i])
        transposed.append(predictionSet)
    return np.array(transposed)
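# A minimal usage sketch, not from the original source: the matrix returned by
# predict() has shape (num_sentences, sentence_length, 2), where column 0 holds
# each position's "not removed" probability. One plausible way to consume it is
# to take the position with the lowest column-0 value as the single best guess.
# The sample values below are made up.
import numpy as np

example = np.array([
    [[1.0, 0.0], [0.9, 0.1], [0.2, 0.8], [0.7, 0.3]],  # one 4-position sentence
])
bestPositions = example[:, :, 0].argmin(axis=1)
print(bestPositions)  # -> [2]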
def _parse_sentence(json):
    sentence = Sentence()
    sentence.set_id(json[ID])
    for key in json.keys():
        if key != ID:
            language = Language()
            language.set_locale(key)
            language.set_value(json[key])
            sentence.add_language(language)
    return sentence
def trainIntermediateModels(self, numData, frameBatchData):
    rawFeatures = bridge.loadMergedFeatures(self.sentenceLength, end=numData)
    sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)
    golds = gold.loadDefaultGolds(self.sentenceLength)
    sentences = []
    for i in range(len(sentenceFeatures)):
        sentences.append(Sentence(golds[i], sentenceFeatures[i]))
    for ratio in range(len(frameBatchData)):
        for index, additionalLabels in enumerate(frameBatchData[ratio]):
            sentences[index].addAdditionalFeatures(additionalLabels)
    # Two-fold scheme: train position models on one half of the data and
    # predict on the other, so every sentence gets an out-of-fold prediction.
    batchSentences = util.cutInHalf(sentences)
    predictionsPerBatch = []
    for ratio in range(len(frameBatchData)):
        predictionsPerBatch.append([0])
    for batch in range(2):
        batchPredictions = []
        for i in range(len(batchSentences[0])):
            batchPredictions.append([0] * self.sentenceLength)
        for pos in range(self.sentenceLength):
            model = PositionModel(self.sentenceLength, pos)
            model.train(batchSentences[batch])
            predictions = model.predict(
                [sentence.getAllFeatures() for sentence in batchSentences[abs(1 - batch)]])
            for i in range(len(predictions)):
                batchPredictions[i][pos] = predictions[i]
        predictionsPerBatch[abs(1 - batch)] = batchPredictions
    allPredictions = predictionsPerBatch[0] + predictionsPerBatch[1]
    return allPredictions
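# Sketch of the two-fold split used above, with a hypothetical stand-in for
# util.cutInHalf (the real helper's behavior is assumed, not shown in this file):
def cutInHalfSketch(items):
    mid = len(items) // 2
    return [items[:mid], items[mid:]]

halves = cutInHalfSketch(list(range(10)))
for batch in range(2):
    trainHalf, heldOutHalf = halves[batch], halves[1 - batch]
    # a PositionModel trained on trainHalf would predict for heldOutHalf here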
def createData(self, output, numDataPoints):
    allFeatures = getMergedFeatures(
        "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) + ".txt",
        numDataPoints)
    print("Loaded Features")
    golds = loadGolds("C:/MissingWord/" + str(self.sentenceLength) + "Gold.txt",
                      numGolds=len(allFeatures), length=self.sentenceLength)
    print(len(allFeatures))
    sentences = []
    for i in range(len(allFeatures)):
        sentences.append(Sentence(golds[i], allFeatures[i]))
    appendLabels(str(self.sentenceLength) + "rf15.txt", sentences)
    appendLabels(str(self.sentenceLength) + "rf110.txt", sentences)
    data = [(sentence.getAllFeatures(), sentence.getGold().getRemovedIndex())
            for sentence in sentences]
    with open(output, "wb") as f:
        pickle.dump(data, f)
    return data
def assembleData(self, posPredictions, sentenceFeatures, golds=None):
    toPredict = []  # indices of sentences worth passing to the second-stage model
    flattened = []
    count = [0] * 15
    for index, predictionSet in enumerate(posPredictions):
        # Column 0 is the "not removed" probability for each position.
        flat = np.array([pos[0] for pos in predictionSet])
        flattened.append(flat)
        binarized = [1 if value < 0.5 else 0 for value in flat]
        count[binarized.count(1)] += 1
        # Keep only sentences where exactly 1 or 2 positions look removed.
        if binarized.count(1) == 1 or binarized.count(1) == 2:
            toPredict.append(index)
    for i in range(len(count)):
        print(count[i])
    sentences = []
    for i in range(len(sentenceFeatures)):
        sentence = Sentence(golds[i] if golds is not None else None,
                            sentenceFeatures[i])
        sentence.addAdditionalFeatures(flattened[i])
        sentences.append(sentence)
    data = []
    for i in toPredict:
        if i >= len(sentences):
            break
        data.append((sentences[i].getAllFeatures(), sentences[i].getRemovedIndex()))
    return np.array(data), toPredict
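# Toy illustration of the filtering rule in assembleData (values made up): a
# sentence is forwarded to the second stage only when exactly 1 or 2 positions
# fall below the 0.5 threshold.
flatExample = [0.9, 0.3, 0.8, 0.4]
binarizedExample = [1 if v < 0.5 else 0 for v in flatExample]
print(binarizedExample.count(1))  # -> 2, so this sentence would join toPredict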
def trainPredictiveModel(self, rawFeatures, golds, frameBatchData):
    sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)
    sentences = []
    for i in range(len(sentenceFeatures)):
        sentences.append(Sentence(golds[i], sentenceFeatures[i]))
    if frameBatchData is not None:
        for index, additionalLabels in enumerate(frameBatchData):
            sentences[index].addAdditionalFeatures(additionalLabels)
    for i in range(self.sentenceLength):
        posModel = PositionModel(self.sentenceLength, i)
        posModel.train(np.array(sentences))
        posModel.export(modelSystem.posModelPredictionFile(self.sentenceLength, i))
def process_document(text):
    raw_sentences = nltk.sent_tokenize(text)
    sentences = []
    document = dict()
    for s in raw_sentences:
        words = nltk.word_tokenize(text_classifier.remove_punctuation(s))
        unique_words = set(words)
        sentence_dictionary = dict()
        # Document counts use each word once per sentence; sentence counts use
        # every occurrence.
        increase_dict_value(unique_words, document)
        increase_dict_value(words, sentence_dictionary)
        sentences.append(Sentence(s, sentence_dictionary))
    return document, sentences
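# Hedged sketch of what increase_dict_value presumably does (its real
# implementation is not shown in this file): bump a counter for every key.
def increase_dict_value_sketch(keys, counts):
    for key in keys:
        counts[key] = counts.get(key, 0) + 1

docCounts = {}
increase_dict_value_sketch({"the", "cat"}, docCounts)
increase_dict_value_sketch({"the", "dog"}, docCounts)
# docCounts now maps "the" -> 2 and "cat"/"dog" -> 1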
def read_testing_file(filenameprefix):
    """Read the triplet files of the segments that correspond to the test file."""
    file_name = 'data/transformed_triplet_files/' + filenameprefix + '*.txt'
    files = glob.glob(file_name)
    files.sort()
    true_segment = []
    all_sentences = []
    line_count_total = 0
    for segments_file in files:
        # Skip teaser and NULL files.
        if (segments_file.split('/')[-1].split('_')[-1].split('|')[0].split(':')[-1] == 'Teaser'
                or segments_file.split('/')[-1].split('_')[-1].split('.')[0] == 'NULL'):
            continue
        line_count = -1
        current_seg_sentences = []
        with open(segments_file, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line_count += 1
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    np1 = triplets[0].split()
                    # Flag sentences whose first noun phrase starts with a pronoun.
                    pronoun_flag = np1 != [] and _is_pronoun(np1[0])
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    current_seg_sentences.append(
                        Sentence(np1, vp, np2, pronoun_flag))
        # Only keep segments longer than 5 sentences.
        segment_length = len(current_seg_sentences)
        if segment_length > 5:
            seg = [sid + line_count_total for sid in range(segment_length)]
            true_segment.append(set(seg))
            all_sentences.extend(current_seg_sentences)
            line_count_total += segment_length
    return [all_sentences, true_segment]
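# Worked example of the teaser filter above; the filename is hypothetical, chosen
# only to show how the chained splits isolate the tag after the last ':'.
name = 'data/transformed_triplet_files/show_2020_Topic:Teaser|1.txt'
tag = name.split('/')[-1].split('_')[-1].split('|')[0].split(':')[-1]
print(tag)  # -> 'Teaser'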
        thisFeatureSet.extend(elem)
        ofLength.append(thisFeatureSet)
    return ofLength


allFeatures = getMergedFeatures("C:/MissingWord/mergedFeatures15.txt")
golds = model.gold.loadGolds("C:/MissingWord/15Gold.txt", sentenceLength)
golds = golds[:len(allFeatures)]
print(len(allFeatures))
sentences = []
for i in range(len(allFeatures)):
    sentences.append(Sentence(golds[i], allFeatures[i]))
appendLabels("15rf15.txt", sentences)
appendLabels("15rf110.txt", sentences)
sentences = [sentence for sentence in sentences if len(sentence) == sentenceLength]
data = [(sentence.getAllFeatures(), sentence.getGold().getRemovedIndex())
        for sentence in sentences]
random.shuffle(data)
cutoff = int(len(data) * 7 / 10)
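# The script is cut off here; presumably `cutoff` feeds a 70/30 train/test split.
# A minimal sketch of that assumed continuation:
train, test = data[:cutoff], data[cutoff:]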