def process(self, doc):
    if self.config.get('pretokenized'):
        self.process_pre_tokenized_text(doc)
    else:
        # set up batches
        if self.config.get('lang') == 'vi':
            # special processing is needed for Vietnamese
            text = '\n\n'.join([x
                                for x in doc.text.split('\n\n')]).rstrip()
            dummy_labels = '\n\n'.join(
                ['0' * len(x) for x in text.split('\n\n')])
            data = paras_to_chunks(text, dummy_labels)
            batches = DataLoader(self.config,
                                 input_data=data,
                                 vocab=self.vocab,
                                 evaluation=True)
        else:
            batches = DataLoader(self.config,
                                 input_text=doc.text,
                                 vocab=self.vocab,
                                 evaluation=True)
        # set up StringIO to get conllu data, run output predictions, set doc's conll file
        with io.StringIO() as conll_output_string:
            output_predictions(
                conll_output_string, self.trainer, batches, self.vocab,
                None,
                self.config.get('max_seqlen',
                                TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT))
            # set conll file for doc
            doc.conll_file = conll.CoNLLFile(
                input_str=conll_output_string.getvalue())
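In normal use, the processor above is driven by a stanfordnlp Pipeline rather than called directly, as the final example in this listing shows. A minimal sketch; the language code and sample text here are arbitrary choices:

import stanfordnlp

# one-time model download for the chosen language
stanfordnlp.download('en')

# a tokenize-only pipeline runs the process() method shown above
nlp = stanfordnlp.Pipeline(processors='tokenize', lang='en')
doc = nlp('This is a short test. It has two sentences.')

# the processor leaves its output on the document as a CoNLLFile
print(doc.conll_file.conll_as_string())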
Example #2
def load_file(self, filename, evaluation=False):
    conll_file = conll.CoNLLFile(filename)
    if evaluation:
        data = [[c] for c in conll_file.get_mwt_expansion_cands()]
    else:
        data = conll_file.get_mwt_expansions()
    return conll_file, data
Example #3
    def process_pre_tokenized_text(self, doc):
        """
        Pretokenized text can be provided in two ways:

        1.) str, tokenized by whitespace, sentence split by newline
        2.) list of token lists, each token list represents a sentence

        generate CoNLL-U output
        """
        conllu_output_string = ""

        # TODO: This was added for input that is already in CoNLL-U format.
        #       The conll_file attribute is added manually to the Document instance in that case.
        if doc.text is None:
            return

        if isinstance(doc.text, str):
            sentences = [
                sent.rstrip(' ').split()
                for sent in doc.text.rstrip('\n').split('\n') if sent
            ]
        elif isinstance(doc.text, list):
            sentences = doc.text
        for sentence in sentences:
            for token_id, token in enumerate(sentence):
                conllu_data = ['_'] * conll.FIELD_NUM
                conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
                conllu_data[conll.FIELD_TO_IDX['word']] = token
                conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
                conllu_output_string += ('\t'.join(conllu_data) + '\n')
            conllu_output_string += '\n'
        doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
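The docstring's two input forms could be exercised roughly as follows. This is a sketch only: it assumes the 'pretokenized' config key checked in process() is exposed on the Pipeline as tokenize_pretokenized, and it constructs a Document directly for the list form, following the Document(None) pattern in the final example.

import stanfordnlp

nlp = stanfordnlp.Pipeline(processors='tokenize', lang='en',
                           tokenize_pretokenized=True)

# form 1: whitespace-tokenized string, one sentence per line
doc = nlp('This is a test .\nAnother short sentence .')
print(doc.conll_file.conll_as_string())

# form 2 (supported by this variant): a list of token lists as the document text
pretok = stanfordnlp.Document([['This', 'is', 'a', 'test', '.'],
                               ['Another', 'short', 'sentence', '.']])
doc = nlp(pretok)
print(doc.conll_file.conll_as_string())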
Example #4
    def process_pre_tokenized_text(self, doc):
        """
        Pretokenized text can be provided in two ways:

        1.) str, tokenized by whitespace, sentence split by newline
        2.) list of token lists, each token list represents a sentence

        generate CoNLL-U output
        """
        conllu_output_string = ""
        if isinstance(doc.text, str):
            sentences = [
                sent.rstrip(' ').split()
                for sent in doc.text.rstrip('\n').split('\n') if sent
            ]
        elif isinstance(doc.text, list):
            sentences = doc.text
        for sentence in sentences:
            for token_id, token in enumerate(sentence):
                conllu_data = ['_'] * conll.FIELD_NUM
                conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
                conllu_data[conll.FIELD_TO_IDX['word']] = token
                conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
                conllu_output_string += ('\t'.join(conllu_data) + '\n')
            conllu_output_string += '\n'
        doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
Example #5
    def process(self, doc):
        batch = DataLoader(doc,
                           self.config['batch_size'],
                           self.config,
                           vocab=self.vocab,
                           evaluation=True)
        if len(batch) > 0:
            dict_preds = self.trainer.predict_dict(
                batch.conll.get_mwt_expansion_cands())
            # decide trainer type and run eval
            if self.config['dict_only']:
                preds = dict_preds
            else:
                preds = []
                for i, b in enumerate(batch):
                    preds += self.trainer.predict(b)

                if self.config.get('ensemble_dict', False):
                    preds = self.trainer.ensemble(
                        batch.conll.get_mwt_expansion_cands(), preds)
        else:
            # skip eval if dev data does not exist
            preds = []

        with io.StringIO() as conll_with_mwt:
            batch.conll.write_conll_with_mwt_expansions(preds, conll_with_mwt)
            doc.conll_file = conll.CoNLLFile(
                input_str=conll_with_mwt.getvalue())
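Like the tokenizer, the MWT expander above runs inside a Pipeline (compare the 'tokenize,mwt' pipeline in the final example). A minimal sketch, with French chosen only because its treebank contains multi-word tokens such as "du":

import stanfordnlp

# one-time model download; French UD expands e.g. "du" into "de" + "le"
stanfordnlp.download('fr')
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt', lang='fr')
doc = nlp('Le chat dort près du feu.')

# the expanded CoNLL-U output contains range lines (e.g. 5-6) for multi-word tokens
print(doc.conll_file.conll_as_string())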
Example #6
def process_pre_tokenized_text(self, doc):
    """Assume text is tokenized by whitespace, sentence split by newline, generate CoNLL-U output"""
    conllu_output_string = ""
    sentences = [
        sent for sent in doc.text.rstrip('\n').split('\n') if sent
    ]
    for sentence in sentences:
        tokens = sentence.rstrip(' ').split(' ')
        for token_id, token in enumerate(tokens):
            conllu_data = ['_'] * conll.FIELD_NUM
            conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
            conllu_data[conll.FIELD_TO_IDX['word']] = token
            conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
            conllu_output_string += ('\t'.join(conllu_data) + '\n')
        conllu_output_string += '\n'
    doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
Example #7
def load_file(self, filename, evaluation=False):
    conll_file = conll.CoNLLFile(filename)
    data = conll_file.get(['word', 'upos', 'xpos', 'feats'],
                          as_sentences=True)
    return conll_file, data
Example #8
def load_file(self, filename):
    conll_file = conll.CoNLLFile(filename)
    data = conll_file.get(['word', 'xpos', 'lemma'])
    return conll_file, data
Example #9
def main():
    args = parse_args()

    args = vars(args)
    print("Running UDPipe with module {}...".format(args['module']))

    # convert names
    short2tb = load_short2tb(args['short2tb'])
    tb_short = args['treebank']
    tb_full = short2tb[tb_short]

    lang_full = tb_full[3:].split('-')[0].lower()
    lang_short, tb_code = tb_short.split('_')

    # look for commands and models
    udpipe_script = '{}/bin-linux64/udpipe'.format(args['udpipe_dir'])
    model_name = '{}-{}-ud-2.2-conll18-180430.udpipe'.format(
        lang_full, tb_code)
    model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)

    if not os.path.exists(model_file):
        model_name = "mixed-ud-ud-2.2-conll18-180430.udpipe"
        model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)

    # check files
    if not args['output_file'].endswith('.conllu'):
        raise Exception("UDPipe module must write to conllu file.")

    if args['module'] == 'tokenize':
        # run tokenizer, ssplit and mwt expander at the same time
        if not args['input_file'].endswith('.txt'):
            raise Exception(
                "UDPipe must take txt file as input when module == tokenize.")
        # run tokenizer from txt file
        udpipe_cmd = "{} --tokenize {} {} --outfile={} --output=conllu".format(
            udpipe_script, model_file, args['input_file'], args['output_file'])
        run_udpipe(udpipe_cmd)
        print("Waiting for filesystem...")
        time.sleep(5)
    else:
        if not args['input_file'].endswith('.conllu'):
            raise Exception(
                "UDPipe must take conllu file as input when module != tokenize."
            )
        # first load the original input file
        input_conll = conll.CoNLLFile(args['input_file'])
        input_conll.load_all()

        # do udpipe
        if args['module'] == 'parse':
            udpipe_cmd = "{} --parse {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        else:
            udpipe_cmd = "{} --tag {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        udpipe_outputs = run_udpipe(udpipe_cmd, return_stdout=True)
        print("Waiting for filesystem...")
        time.sleep(5)

        # load conll back and merge with original conll
        udpipe_conll = conll.CoNLLFile(input_str=udpipe_outputs.decode())
        udpipe_conll.load_all()
        if args['module'] == 'lemma':
            fields = ['lemma']
        elif args['module'] == 'pos':
            fields = ['upos', 'xpos']
        elif args['module'] == 'ufeats':
            fields = ['feats']
        elif args['module'] == 'parse':
            fields = ['head', 'deprel', 'deps']
        else:
            raise Exception("Module {} not recognized.".format(args['module']))

        input_conll.set(fields, udpipe_conll.get(fields))  # set fields back
        # finally write to file
        input_conll.write_conll(args['output_file'])
        print("Waiting for filesystem...")
        time.sleep(5)

    print("All done running module {} with UDPipe.".format(args['module']))
Example #10
import stanfordnlp
from stanfordnlp.models.common import conll

# This only needs to be run once...
stanfordnlp.download('hu_szeged')

nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma,depparse',
                           lang="hu")  # Full pipeline
nlp1 = stanfordnlp.Pipeline(processors='tokenize,mwt', lang="hu")  # Part I.
nlp2 = stanfordnlp.Pipeline(processors='pos,lemma,depparse',
                            lang="hu")  # Part II.

# Analyze raw string
doc = nlp1('Kecském kucorog, macskám mocorog.')

# Print result...
for sentence in doc.sentences:
    sentence.print_tokens()

conllu_format = doc.conll_file.conll_as_string()
print(conllu_format)  # CoNLL text output...

# Documentation: https://stanfordnlp.github.io/stanfordnlp/processors.html

# Read CoNLL-U at any stage...
doc = stanfordnlp.Document(None)
doc.conll_file = conll.CoNLLFile(input_str=conllu_format)

# Analyze further and print the result...
doc2 = nlp2(doc)
print(doc2.conll_file.conll_as_string())