Пример #1
0
    def create_vanila_treebank_list(self, options):
        """
        Create the list of vanilla (i.e. non-UD) treebanks.

        Currently only one treebank is supported, so the returned list always
        has exactly one element. A list is used for consistency with the UD
        treebanks case, where multi-monolingual experiments are allowed.

        input: parser options. Reads trainfile, devfile, testfile, outdir,
        modeldir, predict, create_dev and debug; sets options.conllu from the
        extension of the relevant file whenever one can be determined.

        returns: a one-element list holding the configured utils.Treebank.

        raises: Exception in predict mode when options.testfile is not set or
        does not point to an existing file.
        """
        if options.predict:
            # Fail fast, before building anything: os.path.exists(None) would
            # otherwise raise an opaque TypeError when --testfile is missing.
            if not options.testfile:
                raise Exception("--testfile must be specified")
            if not os.path.exists(options.testfile):
                raise Exception("Test file " + options.testfile + " not found")

        treebank = utils.Treebank(options.trainfile, options.devfile,
                                  options.testfile)
        treebank.iso_id = None
        treebank.outdir = options.outdir
        treebank.modeldir = options.modeldir

        # just one model specified by train/dev and/or test
        if options.predict:
            # test if file is in conllu format
            options.conllu = (
                os.path.splitext(options.testfile.lower())[1] == '.conllu')
            treebank.test_gold = options.testfile
        else:
            self.prepareDev(treebank, options)
            if options.devfile:
                options.conllu = (
                    os.path.splitext(options.devfile.lower())[1] == '.conllu')
            elif options.create_dev:
                options.conllu = (
                    os.path.splitext(options.trainfile.lower())[1] == '.conllu')
            # otherwise options.conllu is left at whatever value it already had

        if options.debug:
            self.createDebugData(treebank, options)

        # make it a list of one element just for the sake of consistency with
        # the "include" case
        return [treebank]
Пример #2
0
    def __init__(self, options):
        """
        input: parser options
        object to harmonise the way we deal with the parser

        Validates the combination of options, creates the output directory
        if needed, and builds self.languages (the list of treebanks to
        process), self.iterations and self.conllu.

        Also mutates options: modeldir may default to outdir in predict
        mode, rlMostFlag is switched off when rlFlag is set, and drop_nproj
        and multi_monoling are (re)set here.

        raises: Exception when a required option is missing, when no feature
        flag is enabled for training, or when a test file / model cannot be
        found.
        """

        print 'Using external embedding:', options.external_embedding
        self.deal_with_multiling(options)

        # --include (UD languages) requires a data directory to look them up in
        if options.include and not options.datadir:
            raise Exception(
                "You need to specify the data dir to include UD languages")

        # without --include, the relevant file must be given explicitly:
        # training data when training, test data when predicting
        if not options.predict:
            if not options.include and not options.trainfile:
                raise Exception(
                    "If not using the --include option, you must specify your training data with --trainfile"
                )
        else:
            if not options.include and not options.testfile:
                raise Exception(
                    "If not using the --include option, you must specify your test data with --testfile"
                )
            if not options.modeldir:
                options.modeldir = options.outdir  # set model directory to output directory by default

        if not options.outdir:
            raise Exception(
                "You must specify an output directory via the --outdir option")
        elif not os.path.exists(
                options.outdir):  # create output directory if it doesn't exist
            print "Creating output directory " + options.outdir
            os.mkdir(options.outdir)

        # training needs at least one of the head / rl / rlmost feature flags
        if not options.predict and not (options.rlFlag or options.rlMostFlag
                                        or options.headFlag):
            raise Exception(
                "Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)"
            )

        # rlFlag and rlMostFlag are not used together: rlFlag wins
        if options.rlFlag and options.rlMostFlag:
            print 'Warning: Switching off rlMostFlag to allow rlFlag to take precedence'
            options.rlMostFlag = False

        #TODO: maybe add more sanity checks

        #this is now useless
        options.drop_nproj = False

        options.multi_monoling = False  # set default
        self.iterations = 1  # set default
        self.conllu = True  #default

        if not options.include:  # must specifiy explicitly train
            treebank = utils.Treebank(options.trainfile, \
                                           options.devfile, options.testfile)
            treebank.iso_id = None
            treebank.outdir = options.outdir
            treebank.modeldir = options.modeldir
            #just one model specified by train/dev and/or test
            if options.predict:
                if not options.testfile:
                    raise Exception("--testfile must be specified")
                elif not os.path.exists(options.testfile):
                    raise Exception("Test file " + options.testfile +
                                    " not found")
                else:
                    self.conllu = (os.path.splitext(
                        options.testfile.lower())[1] == '.conllu'
                                   )  # test if file in conllu format
                    treebank.test_gold = options.testfile
            else:
                self.prepareDev(treebank, options)
                # format is inferred from the dev file extension, or from the
                # train file when a dev set is created; otherwise the default
                # (True) set above is kept
                if options.devfile:
                    self.conllu = (os.path.splitext(
                        options.devfile.lower())[1] == '.conllu')
                elif options.create_dev:
                    self.conllu = (os.path.splitext(
                        options.trainfile.lower())[1] == '.conllu')

            if options.debug:
                self.createDebugData(treebank, options)

            self.languages = [
                treebank
            ]  # make it a list of one element just for the sake of consistency with the "include" case

        else:
            self.conllu = True  # file is in conllu format
            language_list = utils.parse_list_arg(
                options.include
            )  # languages requested by the user via the include flag
            json_treebanks = utils.conll_dir_to_list(
                language_list,
                options.datadir,
                options.shared_task,  # list of the available treebanks
                options.shared_task_datadir)
            #            self.languages = [lang for lang in json_treebanks if lang.iso_id in language_list]
            # index by iso_id so that self.languages follows the order of the
            # user's request rather than the order of json_treebanks
            treebank_dict = {lang.iso_id: lang for lang in json_treebanks}
            self.languages = []
            for lang in language_list:
                if lang in treebank_dict:
                    self.languages.append(treebank_dict[lang])
                else:
                    print "Warning: skipping invalid language code " + lang

            if options.multiling:
                if options.predict:
                    # a single shared model, looked up directly in modeldir
                    model = "%s/%s" % (options.modeldir, options.model)
                    if not os.path.exists(
                            model
                    ):  # in multilingual case need model to be found in first language specified
                        raise Exception("Model not found. Path tried: %s" %
                                        model)
                # if options.model_selection: # can only do model selection for monolingual case
                #     print "Warning: model selection on dev data not available for multilingual case"
                #     options.model_selection = False
            else:
                # several languages, each handled separately: one iteration
                # per language
                options.multi_monoling = True
                self.iterations = len(self.languages)

            for lang_index in xrange(len(self.languages)):
                language = self.languages[lang_index]

                language.outdir = "%s/%s" % (options.outdir, language.iso_id)
                if not os.path.exists(
                        language.outdir
                ):  # create language-specific output folder if it doesn't exist
                    print "Creating language-specific output directory " + language.outdir
                    os.mkdir(language.outdir)
                else:
                    print("Warning: language-specific subdirectory " +
                          language.outdir +
                          " already exists, contents may be overwritten")

                if not options.predict:
                    self.prepareDev(language, options)

                if options.debug:  # it is important that prepareDev be called before createDebugData
                    self.createDebugData(language, options)

                if options.predict and options.multi_monoling:
                    language.modeldir = "%s/%s" % (options.modeldir,
                                                   language.iso_id)
                    model = "%s/%s" % (language.modeldir, options.model)
                    if not os.path.exists(
                            model
                    ):  # in the multi-monolingual case each language needs a model in its own subdirectory
                        if not options.shared_task:
                            raise Exception("Model not found. Path tried: %s" %
                                            model)
                        else:
                            #find model for the language in question
                            # Fall back to the treebank whose iso_id equals
                            # the shared language code (lcode).
                            # NOTE(review): only language.modeldir is updated
                            # here; the new path is not re-checked for
                            # existence in the visible code - confirm.
                            for otherl in json_treebanks:
                                if otherl.lcode == language.lcode:
                                    if otherl.lcode == otherl.iso_id:
                                        language.modeldir = "%s/%s" % (
                                            options.modeldir, otherl.iso_id)
Пример #3
0
    def __init__(self,options):
        """
        input: parser options
        object to harmonise the way we deal with the parser

        Without --include, a single treebank is built from the explicit
        train/dev/test files and stored in self.treebank. With --include,
        self.languages holds the requested UD treebanks, each given its own
        output and model directory.

        Sets self.conllu and self.iterations, and writes
        options.multi_monoling and options.drop_proj.

        raises: Exception when --include is used without a data dir, when no
        feature flag is enabled for training, or when a model is missing in
        predict mode outside the shared task.
        """
        if options.include and not options.datadir:
            # adjacent literals rather than a backslash inside the string,
            # which would embed the source indentation in the message
            raise Exception("You need to specify the data dir to include UD "
                            "languages")
        #TODO: maybe add more sanity checks
        #the diff between two is one is r/l/most child / the other is
        #element in the sentence
        #Eli's paper:
            #extended feature set
            # rightmost and leftmost modifiers of s0, s1 and s2 + leftmost
            # modifier of b0
        if not options.predictFlag and not (options.rlFlag or options.rlMostFlag or options.headFlag):
            raise Exception("You must use either --userlmost or --userl or "
                            "--usehead (you can use multiple)")

        if not options.include:
            #just one model specified by train/dev and/or test
            # the format is inferred from the extension of the file that will
            # actually be parsed: test when predicting, dev otherwise
            if options.predictFlag:
                self.conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
            else:
                self.conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
            self.treebank = utils.Treebank(options.conll_train,
                                           options.conll_dev, options.conll_test)
            self.treebank.iso_id = None

        else:
            self.conllu = True
            language_list = utils.parse_list_arg(options.include)
            json_treebanks = utils.conll_dir_to_list(language_list,options.datadir,options.shared_task,
                                    options.shared_task_datadir)
            self.languages = [lang for lang in json_treebanks if lang.iso_id in language_list]
            for language in self.languages:
                language.removeme = False
                language.outdir= "%s/%s"%(options.output,language.iso_id)
                language.modelDir= "%s/%s"%(options.modelDir,language.iso_id)
                model = "%s/%s"%(language.modelDir,options.model)
                if options.predictFlag and not os.path.exists(model):
                    if not options.shared_task:
                        raise Exception("Model not found. Path tried: %s"%model)
                    else:
                        #find model for the language in question
                        # fall back to the treebank whose iso_id equals the
                        # shared language code (lcode)
                        for otherl in json_treebanks:
                            if otherl.lcode == language.lcode:
                                if otherl.lcode == otherl.iso_id:
                                    language.modelDir = "%s/%s"%(options.modelDir,otherl.iso_id)

                if not os.path.exists(language.outdir):
                    os.mkdir(language.outdir)

            # Build a filtered list instead of calling remove() while
            # iterating, which skips the element following each removal.
            self.languages = [language for language in self.languages
                              if not language.removeme]

        if options.include and not options.multiling:
            # several languages, each handled separately: one iteration per
            # language
            options.multi_monoling = True
            self.iterations = len(self.languages)
        else:
            options.multi_monoling = False
            self.iterations = 1
        #this is now useless
        # NOTE(review): attribute is spelled drop_proj here; confirm this is
        # the name the rest of the parser reads.
        options.drop_proj = False