Example #1
    def load_file(self, filename, evaluation=False):
        conll_file = conll.CoNLLFile(filename)
        if evaluation:
            data = [[c] for c in conll_file.get_mwt_expansion_cands()]
        else:
            data = conll_file.get_mwt_expansions()
        return conll_file, data
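
A tiny self-contained mock of the two data shapes this loader produces; the exact return values of get_mwt_expansion_cands and get_mwt_expansions are not shown above, so the contents below are illustrative assumptions:

# Hypothetical contents, for illustration only.
cands = ['del', 'dámelo']                                # surface MWT candidates
expansions = [('del', 'de el'), ('dámelo', 'da me lo')]  # (surface, expansion) pairs

# evaluation=True: each candidate stands alone, its expansion still unknown
eval_data = [[c] for c in cands]
print(eval_data)   # [['del'], ['dámelo']]

# evaluation=False: training pairs as returned by get_mwt_expansions
train_data = expansions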
    def process(self, doc):
        batch = DataLoader(doc,
                           self.config['batch_size'],
                           self.config,
                           vocab=self.vocab,
                           evaluation=True)
        if len(batch) > 0:
            dict_preds = self.trainer.predict_dict(
                batch.conll.get_mwt_expansion_cands())
            # either use the dictionary-only predictions or run the neural model
            if self.config['dict_only']:
                preds = dict_preds
            else:
                preds = []
                for b in batch:
                    preds += self.trainer.predict(b)

                if self.config.get('ensemble_dict', False):
                    preds = self.trainer.ensemble(
                        batch.conll.get_mwt_expansion_cands(), preds)
        else:
            # skip eval if dev data does not exist
            preds = []

        with io.StringIO() as conll_with_mwt:
            batch.conll.write_conll_with_mwt_expansions(preds, conll_with_mwt)
            doc.conll_file = conll.CoNLLFile(
                input_str=conll_with_mwt.getvalue())
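
The ensemble step above combines dictionary lookups with neural predictions. A minimal, self-contained sketch of the dict-then-neural fallback idea (trainer.ensemble's real signature is not shown above, so the function name and the None convention below are assumptions):

def ensemble_with_dict(dict_preds, neural_preds):
    # Prefer the dictionary expansion when one exists; None marks
    # candidates never seen in the training dictionary.
    return [d if d is not None else n
            for d, n in zip(dict_preds, neural_preds)]

dict_preds = ['de el', None]           # hypothetical dictionary lookups
neural_preds = ['de le', 'vamos nos']  # hypothetical model outputs
print(ensemble_with_dict(dict_preds, neural_preds))  # ['de el', 'vamos nos']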
Example #3
    def process_pre_tokenized_text(self, doc):
        """
        Pretokenized text can be provided in 2 manners:

        1.) str, tokenized by whitespace, sentence split by newline
        2.) list of token lists, each token list represents a sentence

        generate CoNLL-U output
        """
        conllu_output_string = ""

        # TODO: This was added for input that is already in CoNLL-U format.
        #       In that case the conll_file attribute is added manually to the Document instance.
        if doc.text is None:
            return

        if isinstance(doc.text, str):
            sentences = [sent.rstrip(' ').split() for sent in doc.text.rstrip('\n').split('\n') if sent]
        elif isinstance(doc.text, list):
            sentences = doc.text
        else:
            raise ValueError('doc.text must be a str or a list of token lists')
        for sentence in sentences:
            for token_id, token in enumerate(sentence):
                conllu_data = ['_'] * conll.FIELD_NUM
                conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
                conllu_data[conll.FIELD_TO_IDX['word']] = token
                # naive dependency: attach each token to the previous one (0 = root for the first)
                conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
                conllu_output_string += ('\t'.join(conllu_data)+'\n')
            conllu_output_string += '\n'
        doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
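
The per-token loop above emits one 10-column CoNLL-U row per token with a naive left-to-right head chain. A self-contained re-creation of the same construction, with stand-in constants replacing conll.FIELD_NUM and conll.FIELD_TO_IDX (assumed here to follow the standard CoNLL-U column layout):

FIELD_NUM = 10                                  # standard CoNLL-U column count
FIELD_TO_IDX = {'id': 0, 'word': 1, 'head': 6}  # ID, FORM and HEAD columns

def pretokenized_to_conllu(sentences):
    lines = []
    for sentence in sentences:
        for token_id, token in enumerate(sentence):
            row = ['_'] * FIELD_NUM
            row[FIELD_TO_IDX['id']] = str(token_id + 1)
            row[FIELD_TO_IDX['word']] = token
            row[FIELD_TO_IDX['head']] = str(token_id)  # attach to the previous token
            lines.append('\t'.join(row))
        lines.append('')                               # blank line ends a sentence
    return '\n'.join(lines) + '\n'

print(pretokenized_to_conllu([['Hello', 'world', '!'], ['Bye', '.']]))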
Example #4
    def tokenize(self, document):
        """ Tokenize a document with the reldi tokenizer and add results to document.conll_file.
        """
        raw_text = '\n'.join(document.text) if isinstance(
            document.text, list) else document.text
        conllu_output_string = ''
        tokenizer = self.nlp.generate_tokenizer(self.lang)
        for par_id, text in enumerate(raw_text.split('\n')):
            conllu_output_string += self._reldi_tokenizer(
                self.nlp.process[self.type](tokenizer, text, self.lang),
                par_id + 1)

        document.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
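
The paragraph handling above is independent of the tokenizer itself: document.text may be a list of paragraphs or one newline-separated string, and each chunk gets a 1-based paragraph id. A trivial sketch of just that part (the reldi tokenizer call is omitted, since its API is not shown above):

document_text = ['First paragraph here.', 'Second one.']
raw_text = '\n'.join(document_text) if isinstance(document_text, list) else document_text

for par_id, text in enumerate(raw_text.split('\n')):
    print(par_id + 1, text)  # each chunk would be handed to the tokenizer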
Example #5
    def load_file(self, filename, evaluation=False):
        conll_file = conll.CoNLLFile(filename)
        data = conll_file.get(['word', 'upos', 'xpos', 'feats'],
                              as_sentences=True)
        return conll_file, data
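
A mocked illustration of the shape this presumably returns with as_sentences=True: one list per sentence, one [word, upos, xpos, feats] row per token (an assumption inferred from the field list passed to get). Example #6 below omits as_sentences, which presumably yields one flat row per token instead:

# Hypothetical contents of `data`, for illustration only.
data = [
    [['Dogs', 'NOUN', 'NNS', 'Number=Plur'],
     ['bark', 'VERB', 'VBP', '_']],
]
for sentence in data:
    for word, upos, xpos, feats in sentence:
        print(word, upos, feats)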
Example #6
    def load_file(self, filename):
        conll_file = conll.CoNLLFile(filename)
        data = conll_file.get(['word', 'xpos', 'lemma'])
        return conll_file, data
def main():
    args = parse_args()

    args = vars(args)
    print("Running UDPipe with module {}...".format(args['module']))

    # convert names
    short2tb = load_short2tb(args['short2tb'])
    tb_short = args['treebank']
    tb_full = short2tb[tb_short]

    lang_full = tb_full[3:].split('-')[0].lower()  # strip the "UD_" prefix, keep the language
    lang_short, tb_code = tb_short.split('_')

    # look for commands and models
    udpipe_script = '{}/bin-linux64/udpipe'.format(args['udpipe_dir'])
    model_name = '{}-{}-ud-2.2-conll18-180430.udpipe'.format(
        lang_full, tb_code)
    model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)

    if not os.path.exists(model_file):
        # fall back to the mixed-language model when no treebank-specific model exists
        model_name = "mixed-ud-ud-2.2-conll18-180430.udpipe"
        model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)

    # check files
    if not args['output_file'].endswith('.conllu'):
        raise Exception("UDPipe module must write to conllu file.")

    if args['module'] == 'tokenize':
        # run tokenizer, ssplit and mwt expander at the same time
        if not args['input_file'].endswith('.txt'):
            raise Exception(
                "UDPipe must take txt file as input when module == tokenize.")
        # run tokenizer from txt file
        udpipe_cmd = "{} --tokenize {} {} --outfile={} --output=conllu".format(
            udpipe_script, model_file, args['input_file'], args['output_file'])
        run_udpipe(udpipe_cmd)
        print("Waiting for filesystem...")
        time.sleep(5)
    else:
        if not args['input_file'].endswith('.conllu'):
            raise Exception(
                "UDPipe must take conllu file as input when module != tokenize."
            )
        # first load the original input file
        input_conll = conll.CoNLLFile(args['input_file'])
        input_conll.load_all()

        # do udpipe
        if args['module'] == 'parse':
            udpipe_cmd = "{} --parse {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        else:
            udpipe_cmd = "{} --tag {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        udpipe_outputs = run_udpipe(udpipe_cmd, return_stdout=True)
        print("Waiting for filesystem...")
        time.sleep(5)

        # load conll back and merge with original conll
        udpipe_conll = conll.CoNLLFile(input_str=udpipe_outputs.decode())
        udpipe_conll.load_all()
        if args['module'] == 'lemma':
            fields = ['lemma']
        elif args['module'] == 'pos':
            fields = ['upos', 'xpos']
        elif args['module'] == 'ufeats':
            fields = ['feats']
        elif args['module'] == 'parse':
            fields = ['head', 'deprel', 'deps']
        else:
            raise Exception("Module {} not recognized.".format(args['module']))

        input_conll.set(fields, udpipe_conll.get(fields))  # set fields back
        # finally write to file
        input_conll.write_conll(args['output_file'])
        print("Waiting for filesystem...")
        time.sleep(5)

    print("All done running module {} with UDPipe.".format(args['module']))