예제 #1
0
def conll_file(filename, fields, word_field, encoding="utf-8"):
    """Build a Document from a CoNLL-formatted file.

    The corpus is loaded with ``Corpus.from_conll`` and the textual content
    of the document is rebuilt from the ``word_field`` entry of every token:
    words of a sentence are joined with a single space and sentences are
    joined with a newline.

    Two segmentations are attached to the returned document:

    * ``"tokens"``: one span per token, expressed as character offsets into
      the rebuilt content;
    * ``"sentences"``: one span per sentence, expressed as token indices and
      referencing the ``"tokens"`` segmentation.

    Parameters
    ----------
    filename
        Path of the CoNLL file; its basename is used as the document name.
    fields
        Field names, forwarded to ``Corpus.from_conll``.
    word_field
        Key under which each token stores its surface form.
    encoding
        Encoding forwarded to both ``Document`` and ``Corpus.from_conll``.

    Returns
    -------
    The newly built document.
    """
    doc = Document(os.path.basename(filename), encoding=encoding)
    doc._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    lines = []
    token_spans = []
    sent_spans = []
    offset = 0       # character offset of the next word in the rebuilt text
    first_token = 0  # index of the first token of the current sentence

    for sentence in doc._corpus.sentences:
        words = []
        for token in sentence:
            form = token[word_field]
            end = offset + len(form)
            token_spans.append(Span(offset, end))
            words.append(form)
            # Skip the one-character separator (space or newline) that
            # follows every word in the rebuilt content.
            offset = end + 1
        lines.append(u" ".join(words))

        size = len(sentence)
        sent_spans.append(Span(first_token, first_token + size))
        first_token += size

    doc._content = u"\n".join(lines)

    doc.add_segmentation(Segmentation("tokens", spans=token_spans))
    # Sentence spans index into the token segmentation registered just above.
    sentences = Segmentation("sentences",
                             reference=doc.segmentation("tokens"),
                             spans=sent_spans)
    doc.add_segmentation(sentences)
    return doc
예제 #2
0
    def test_enrich(self):
        """Check that EnrichModule adds the BOS and EOS fields to the corpus.

        The corpus starts with the single field ``word``; after processing,
        its fields must be ``word``, ``BOS`` and ``EOS``, in that order.
        """
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[{
                            u"word": u"Ceci"
                        }, {
                            u"word": u"est"
                        }, {
                            u"word": u"un"
                        }, {
                            u"word": u"test"
                        }, {
                            u"word": u"."
                        }]])
        document._corpus = corpus

        # Both features share one getter reading the "word" entry.
        cwg = DictGetterFeature(entry="word", x=0)
        features = [
            BOSFeature(name="BOS", entry="word", getter=cwg),
            EOSFeature(name="EOS", entry="word", getter=cwg),
        ]

        informations = Informations(bentries=[Entry(u"word")],
                                    features=features)

        enrich = EnrichModule(informations)

        # assertEqual rather than the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(document._corpus.fields, [u"word"])

        enrich.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word", u"BOS", u"EOS"])
예제 #3
0
    def test_clean(self):
        """Check that CleanModule only keeps the fields listed in to_keep.

        The corpus starts with the fields ``word`` and ``remove``; after
        processing with ``to_keep=[u"word"]`` only ``word`` must remain.
        """
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word", u"remove"],
                        sentences=[[{
                            u"word": u"Ceci",
                            u"remove": u"Ceci"
                        }, {
                            u"word": u"est",
                            u"remove": u"est"
                        }, {
                            u"word": u"un",
                            u"remove": u"un"
                        }, {
                            u"word": u"test",
                            u"remove": u"test"
                        }, {
                            u"word": u".",
                            u"remove": u"."
                        }]])
        document._corpus = corpus

        # assertEqual rather than the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(document._corpus.fields, [u"word", u"remove"])

        clean = CleanModule(to_keep=[u"word"])
        clean.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word"])
예제 #4
0
    def test_wapiti_label(self):
        """Check that WapitiLabelModule adds a label field to the corpus.

        The module is built from the model file located under
        ``SEM_DATA_DIR/non-regression/models/model`` and writes its output
        to the field ``the_new_field``. The test checks both the updated
        field list and the label assigned to each of the five tokens.
        """
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[{
                            u"word": u"Ceci"
                        }, {
                            u"word": u"est"
                        }, {
                            u"word": u"un"
                        }, {
                            u"word": u"test"
                        }, {
                            u"word": u"."
                        }]])
        document._corpus = corpus

        # assertEqual rather than the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(document._corpus.fields, [u"word"])

        wapiti_label = WapitiLabelModule(
            os.path.join(SEM_DATA_DIR, "non-regression", "models", "model"),
            u"the_new_field")
        wapiti_label.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word", u"the_new_field"])

        # Expected label for each token of the single sentence, in order.
        sentence = document._corpus.sentences[0]
        self.assertEqual(sentence[0]["the_new_field"], u"A")
        self.assertEqual(sentence[1]["the_new_field"], u"B")
        self.assertEqual(sentence[2]["the_new_field"], u"B")
        self.assertEqual(sentence[3]["the_new_field"], u"A")
        self.assertEqual(sentence[4]["the_new_field"], u"O")