Example #1
    def predict(self, rawFeatures, labelsToAppend):
        # Build per-position probability predictions for every sentence.
        sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)
        predictionMatrix = []
        predictionMatrix.append([[1, 0]] * len(sentenceFeatures))  # position 0 is fixed to [1, 0]

        sentences = []
        for i in range(len(sentenceFeatures)):
            sentence = Sentence(None, sentenceFeatures[i])
            if labelsToAppend is not None:
                sentence.addAdditionalFeatures(labelsToAppend[i])
            sentences.append(sentence)

        for i in range(1, self.sentenceLength):
            clf = self.loadPositionModel(i)
            prediction = clf.predict_proba([sentence.getAllFeatures() for sentence in sentences])
            for a in range(len(prediction)):
                # Expand any scalar prediction into a two-column pair to match predict_proba output.
                if isinstance(prediction[a], int):
                    prediction[a] = [prediction[a], 1 - prediction[a]]
            predictionMatrix.append(prediction)

        # Transpose from positions x sentences to sentences x positions.
        transposed = []

        for i in range(len(predictionMatrix[0])):
            predictionSet = []
            for j in range(len(predictionMatrix)):
                predictionSet.append(predictionMatrix[j][i])
            transposed.append(predictionSet)

        return np.array(transposed)
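# For reference, the manual transpose above is equivalent to a single NumPy axis
# swap, assuming every position's prediction list has the same length (a sketch,
# not part of the original code):
import numpy as np

matrix = [[[1, 0], [0, 1]], [[0.3, 0.7], [0.9, 0.1]]]   # positions x sentences x 2
transposed = np.array(matrix).transpose(1, 0, 2)         # sentences x positions x 2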
Example #2
def _parse_sentence(json):
    # Build a Sentence whose ID comes from the ID key; every other key is
    # treated as a locale mapped to its translated value.
    sentence = Sentence()
    sentence.set_id(json[ID])
    for key in json.keys():
        if key != ID:
            language = Language()
            language.set_locale(key)
            language.set_value(json[key])
            sentence.add_language(language)
    return sentence
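# A hypothetical input for _parse_sentence; the value of the module-level ID
# constant is not shown above, so "id" here is an assumption:
ID = "id"

payload = {"id": 42, "en_US": "Hello, world", "de_DE": "Hallo, Welt"}
sentence = _parse_sentence(payload)   # one Language per locale key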
Example #3
    def trainIntermediateModels(self, numData, frameBatchData):
        rawFeatures = bridge.loadMergedFeatures(self.sentenceLength, end = numData)
        sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)
        golds = gold.loadDefaultGolds(self.sentenceLength)

        sentences = []
        for i in range(len(sentenceFeatures)):
            sentences.append(Sentence(golds[i], sentenceFeatures[i]))

        for ratio in range(len(frameBatchData)):
            for index, additionalLabels in enumerate(frameBatchData[ratio]):
                sentences[index].addAdditionalFeatures(additionalLabels)

        batchSentences = util.cutInHalf(sentences)

        # One slot per batch half; each half receives the predictions produced
        # by the models trained on the other half.
        predictionsPerBatch = [[0], [0]]

        for batch in range(2):
            # Train position models on one half, predict on the other half.
            batchPredictions = []
            for i in range(len(batchSentences[1 - batch])):
                batchPredictions.append([0] * self.sentenceLength)
            for pos in range(self.sentenceLength):
                model = PositionModel(self.sentenceLength, pos)
                model.train(batchSentences[batch])
                predictions = model.predict([sentence.getAllFeatures() for sentence in batchSentences[1 - batch]])
                for i in range(len(predictions)):
                    batchPredictions[i][pos] = predictions[i]

            predictionsPerBatch[1 - batch] = batchPredictions

        allPredictions = predictionsPerBatch[0] + predictionsPerBatch[1]

        return allPredictions
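# util.cutInHalf is assumed above but not shown; a plausible minimal sketch
# that splits the sentence list into the two halves used for cross-prediction:
def cutInHalf(items):
    mid = len(items) // 2
    return [items[:mid], items[mid:]]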
Example #4
    def createData(self, output, numDataPoints):
        allFeatures = getMergedFeatures(
            "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) +
            ".txt", numDataPoints)

        print("Loaded Features")
        golds = loadGolds("C:/MissingWord/" + str(self.sentenceLength) +
                          "Gold.txt",
                          numGolds=len(allFeatures),
                          length=self.sentenceLength)

        print(len(allFeatures))

        sentences = []
        for i in range(len(allFeatures)):
            sentences.append(Sentence(golds[i], allFeatures[i]))

        appendLabels(str(self.sentenceLength) + "rf15.txt", sentences)
        appendLabels(str(self.sentenceLength) + "rf110.txt", sentences)

        data = [(sentence.getAllFeatures(),
                 sentence.getGold().getRemovedIndex())
                for sentence in sentences]

        with open(output, "wb") as f:
            pickle.dump(data, f)

        return data
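# A minimal read-back sketch for the pickled dataset (the file name stands in
# for whatever path was passed as `output` above):
import pickle

with open("train15.pkl", "rb") as f:        # hypothetical output path
    data = pickle.load(f)
features, removedIndex = data[0]            # (allFeatures, gold removed index) pairs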
Example #5
    def assembleData(self, posPredictions, sentenceFeatures, golds=None):

        toPredict = []  #non all-zero indices
        flattened = []
        '''
        for index, predictionSet in enumerate(posPredictions):
            flat = np.array([pos[0] for pos in predictionSet])
            for value in flat:
                if value < 0.5:
                    toPredict.append(index)
                    break
            flattened.append(flat)
        '''

        # Histogram of how many positions per sentence score below 0.5.
        count = [0] * 15
        for index, predictionSet in enumerate(posPredictions):
            flat = np.array([pos[0] for pos in predictionSet])
            flattened.append(flat)
            binarized = [1 if value < 0.5 else 0 for value in flat]
            count[binarized.count(1)] += 1
            # Only sentences with exactly one or two candidate positions get predicted on.
            if binarized.count(1) in (1, 2):
                toPredict.append(index)

        for value in count:
            print(value)

        sentences = []
        for i in range(len(sentenceFeatures)):
            sentence = Sentence(golds[i] if golds is not None else None,
                                sentenceFeatures[i])
            sentence.addAdditionalFeatures(flattened[i])
            sentences.append(sentence)

        data = []
        for i in toPredict:
            if i >= len(sentences):  # guard against indices past the sentence list
                break
            data.append((sentences[i].getAllFeatures(),
                         sentences[i].getRemovedIndex()))

        return np.array(data), toPredict
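# A worked miniature of the binarization rule above (values invented for
# illustration):
flat = [0.9, 0.2, 0.8, 0.4]                       # first column of each position's prediction
binarized = [1 if v < 0.5 else 0 for v in flat]   # [0, 1, 0, 1]
# binarized.count(1) == 2, so this sentence's index would be kept in toPredict.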
Example #6
    def trainPredictiveModel(self, rawFeatures, golds, frameBatchData):
        sentenceFeatures = modelSystem.rawToSentenceFeatures(rawFeatures)

        sentences = []
        for i in range(len(sentenceFeatures)):
            sentences.append(Sentence(golds[i], sentenceFeatures[i]))

        if frameBatchData is not None:
            for index, additionalLabels in enumerate(frameBatchData):
                sentences[index].addAdditionalFeatures(additionalLabels)

        # Train and export one model per sentence position.
        for i in range(self.sentenceLength):
            posModel = PositionModel(self.sentenceLength, i)
            posModel.train(np.array(sentences))
            posModel.export(modelSystem.posModelPredictionFile(self.sentenceLength, i))
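# A minimal usage sketch (the `system` object and its constructor are
# assumptions, not from the snippet; the loaders mirror Example #3):
rawFeatures = bridge.loadMergedFeatures(15, end=1000)
golds = gold.loadDefaultGolds(15)
system = ModelSystem(sentenceLength=15)   # hypothetical constructor
system.trainPredictiveModel(rawFeatures, golds, frameBatchData=None)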
Example #7
def process_document(text):
    # Split a document into sentences and build word-count dictionaries: the
    # document-level dict counts each word once per sentence that contains it,
    # while each sentence dict holds raw within-sentence term frequencies.
    raw_sentences = nltk.sent_tokenize(text)
    sentences = []
    document = dict()

    for s in raw_sentences:
        words = nltk.word_tokenize(text_classifier.remove_punctuation(s))
        unique_words = set(words)
        sentence_dictionary = dict()

        increase_dict_value(unique_words, document)
        increase_dict_value(words, sentence_dictionary)

        sentences.append(Sentence(s, sentence_dictionary))

    return document, sentences
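# increase_dict_value is assumed above but not shown; a minimal sketch of what
# it plausibly does: increment a counter for every word in the iterable.
def increase_dict_value(words, counts):
    for word in words:
        counts[word] = counts.get(word, 0) + 1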
Example #8
def read_testing_file(filenameprefix):
    """Read the triplets files of the segments that correspond to the test file"""
    file_name = 'data/transformed_triplet_files/' + filenameprefix + '*.txt'
    files = glob.glob(file_name)
    files.sort()
    true_segment = []
    all_sentences = []
    line_count_total = 0
    for segments_file in files:
        # Skip teaser and NULL segment files.
        file_tag = segments_file.split('/')[-1].split('_')[-1]
        if (file_tag.split('|')[0].split(':')[-1] == 'Teaser'
                or file_tag.split('.')[0] == 'NULL'):
            continue
        line_count = -1
        current_seg_sentences = []
        with open(segments_file, 'r') as f:
            for line in f:
                if line[0] != '<':  # skip markup lines
                    line_count += 1
                    line = line[:-2].lower()  # drop the trailing delimiter and lowercase
                    triplets = line.split('|')

                    np1 = triplets[0].split()
                    if np1 != [] and _is_pronoun(np1[0]):
                        pronoun_flag = True
                    else:
                        pronoun_flag = False
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    current_seg_sentences.append(
                        Sentence(np1, vp, np2, pronoun_flag))

        # Only keep segments longer than 5 sentences
        segment_length = len(current_seg_sentences)
        if segment_length > 5:
            seg = [(sid + line_count_total)
                   for sid in range(0, segment_length)]
            true_segment.append(set(seg))
            all_sentences.extend(current_seg_sentences)
            line_count_total += segment_length
    return [all_sentences, true_segment]
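# _is_pronoun and cleansing.clean are assumed above but not shown; a plausible
# minimal sketch of _is_pronoun (the pronoun set is an illustration):
PRONOUNS = {'i', 'you', 'he', 'she', 'it', 'we', 'they'}

def _is_pronoun(word):
    return word in PRONOUNS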
Example #9
                thisFeatureSet.extend(elem)
            ofLength.append(thisFeatureSet)
    return ofLength


allFeatures = getMergedFeatures("C:/MissingWord/mergedFeatures15.txt")

golds = model.gold.loadGolds("C:/MissingWord/15Gold.txt", sentenceLength)

golds = golds[:len(allFeatures)]

print(len(allFeatures))

sentences = []
for i in range(len(allFeatures)):
    sentences.append(Sentence(golds[i], allFeatures[i]))

appendLabels("15rf15.txt", sentences)
appendLabels("15rf110.txt", sentences)

sentences = [
    sentence for sentence in sentences if len(sentence) == sentenceLength
]

data = [(sentence.getAllFeatures(), sentence.getGold().getRemovedIndex())
        for sentence in sentences]

random.shuffle(data)

cutoff = int(len(data) * 7 / 10)
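# The snippet ends at the 70/30 cutoff; presumably it feeds a split like this
# (variable names here are assumptions):
trainData = data[:cutoff]
testData = data[cutoff:]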