Example #1
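Reads a CoNLL document into a tagged spaCy Doc, setting explicit sentence boundaries.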
# requires: import numpy as np; from spacy.tokens import Doc;
# from spacy.attrs import TAG (plus POS, LEMMA for the commented-out lines)
def read_conlldoc(self, inputdoc):
    words = list()
    sentbounds = list()
    # pos = list()
    tags = list()
    # lemmas = list()
    for sent in string2doc(inputdoc, hide_fields=HIDDEN_FIELDS):
        for i, tok in enumerate(sent):
            # the first token of each sentence opens a sentence boundary
            sentbounds.append(i == 0)
            words.append(tok.word)
            tags.append(self.nlp.vocab.strings.add(tok.xpos))
            # pos.append(self.nlp.vocab.strings.add(conv_table.get(tok.xpos, "_")))
            # lemmas.append(self.nlp.vocab.strings.add(tok.lemma))
    # attrs = [POS, TAG]
    attrs = [TAG]
    # arr = np.array(list(zip(pos, tags)), dtype="uint64")
    arr = np.array(tags, dtype="uint64")
    sdoc = Doc(self.nlp.vocab, words=words).from_array(attrs, arr)
    for i, sb in enumerate(sentbounds):
        if sb:
            sdoc[i].is_sent_start = True
        else:
            # these must be set to False: if left as None,
            # spaCy will add further sentence boundaries
            sdoc[i].is_sent_start = False
    # lemma_array = np.array([[lemma] for lemma in lemmas], dtype="uint64")
    # sdoc.from_array([LEMMA], lemma_array)
    if any(tags):
        sdoc.is_tagged = True  # spaCy v2 API; removed in v3
    return sdoc
Example #2
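Feeds each pre-tagged sentence to a Java parser through jpype, as a list of TaggedWord objects.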
def myprocessor(myinput):
    # self (holding the jpype-wrapped parser) is captured from the
    # enclosing scope
    results = list()
    for sent in string2doc(myinput, hide_fields=HIDDEN_FIELDS):
        # collect the tagged sentence in a java.util.ArrayList
        sent_arr = jpype.java.util.ArrayList()
        for tok in sent:
            sent_arr.add(ling.TaggedWord(tok.word, tok.xpos))
        results.append(self.parser.predict(sent_arr))
    return results
Example #3
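Runs a sentence-level lemmatizer over each sentence and writes the lemmas back onto the tokens.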
def myprocessor(myinput):
    mydoc = string2doc(myinput)
    for sent in mydoc:
        tokens = [t.word for t in sent]
        tags = [t.xpos for t in sent]
        lemmas = lemmatize_sentence(tokens, tags)
        for tok, lem in zip(sent, lemmas):
            # don't repeat gold fields in the output
            tok.hide_fields(HIDDEN_FIELDS)
            tok.lemma = lem
    return mydoc
Example #4
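Re-serializes the tagged tokens as tab-separated, blank-line-delimited text for a parser that reads tagged input and emits CoNLL.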
def myprocessor(myinput):
    newinput = list()
    for sent in string2doc(myinput, hide_fields=HIDDEN_FIELDS):
        sent_strs = list()
        for tok in sent:
            # one "word<TAB>tag" line per token
            sent_strs.append(tok.word + "\t" + tok.xpos)
        newinput.append("\n".join(sent_strs))
    # sentences are separated by blank lines
    reformatted_input = "\n\n".join(newinput)

    return self.parser.main(reformatted_input,
                            inputformat="tagged",
                            outputformat="conll")
Example #5
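Looks up one lemma per token, falling back to the empty lemma for POS tags the lemmatizer does not support.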
def myprocessor(myinput):
    mydoc = string2doc(myinput)
    for sent in mydoc:
        for tok in sent:
            try:
                tok.lemma = self.lemmatizer.find_lemma(
                    tok.word, tok.xpos)
            except ValueError:
                # unsupported POS: use the empty lemma
                tok.lemma = "_"
            # don't repeat gold pos in output
            tok.hide_fields(HIDDEN_FIELDS)
    return mydoc
Example #6
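Like the previous example, but the lemmatizer may return several candidate lemmas (or none); the first candidate is used.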
def myprocessor(myinput):
    mydoc = string2doc(myinput)
    for sent in mydoc:
        for tok in sent:
            try:
                matching_lemmas = self.lemmatizer.lemmatize(
                    tok.word, conv_table.get(tok.xpos))
                if matching_lemmas is None:
                    tok.lemma = "_"
                # elif len(matching_lemmas) > 1:
                #     print("lots o lemmas!", matching_lemmas)
                else:
                    # unclear how to select the best alternative;
                    # just use the first item in the list
                    tok.lemma = matching_lemmas[0]
            except ValueError:
                tok.lemma = "_"
            # don't repeat gold pos in output
            tok.hide_fields(HIDDEN_FIELDS)
    return mydoc
Example #7
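Converts the processor's CoNLL output string back into the internal document representation.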
def postprocess(self):
    # parse the processor's CoNLL output back into the document model
    self.data = string2doc(self.output_data.conll_file.conll_as_string())