def create_UD_treebank_list(self,options):
    """
    Collect the UD treebanks requested through ``options.include``.

    Every entry of the include list is either an iso code or an
    ``iso:proxy`` pair; in the latter case the part after the colon is
    stored on the treebank as ``proxy_tbank``. For each known iso code the
    treebank gets its output directory (created if missing), its dev data
    is prepared when training, debug data is created when requested, and
    its model directory is resolved when predicting monolingually.
    Unknown iso codes are skipped with a warning.

    Returns the selected treebank objects as a flat list, in the order in
    which they were requested.

    Raises Exception when predicting monolingually and the model file for
    a treebank cannot be found.
    """
    options.conllu = True  # UD data is in conllu format
    # index every treebank known to utils by its iso id
    available = dict(
        (tbank.iso_id, tbank) for tbank in utils.get_all_treebanks(options))
    selected = []

    for spec in utils.parse_list_arg(options.include):
        # split an optional "iso:proxy" specification
        match = re.search(r'^(.*):(.*)$', spec)
        if match:
            code, proxy = match.group(1), match.group(2)
        else:
            code, proxy = spec, None

        if code not in available:
            print("Warning: skipping invalid language code " + code)
            continue

        tbank = available[code]
        tbank.proxy_tbank = proxy

        # shared-task runs write everything straight into the main outdir
        if options.shared_task:
            tbank.outdir = options.outdir
        else:
            tbank.outdir = os.path.join(options.outdir, tbank.iso_id)

        if os.path.exists(tbank.outdir):
            print("Warning: language-specific subdirectory " + tbank.outdir
                  + " already exists, contents may be overwritten")
        else:
            print("Creating language-specific output directory " + tbank.outdir)
            os.mkdir(tbank.outdir)

        # prepareDev has to run before createDebugData
        if not options.predict:
            self.prepareDev(tbank, options)
        if options.debug:
            self.createDebugData(tbank, options)

        if options.predict and not options.multiling:
            # one model per language, stored in a per-language subdirectory
            tbank.modeldir = os.path.join(options.modeldir, tbank.iso_id)
            model_path = os.path.join(tbank.modeldir, options.model)
            if not os.path.exists(model_path):
                raise Exception("Model not found. Path tried: %s"%model_path)
        else:
            tbank.modeldir = None

        selected.append(tbank)

    return selected

if __name__ == "__main__":
    # Command-line entry point: build a UD treebank object for every
    # language requested via --include and run get_stats_c on each of them.
    parser = OptionParser()
    # The help texts are assembled from adjacent string literals. A backslash
    # continuation inside a single literal would embed the indentation of
    # the following source line in the text shown to the user.
    parser.add_option(
        "--include",
        metavar="LIST",
        help="List of languages by ISO code to be run "
             "if using UD. If not specified need to specify trainfile at "
             "least. When used in combination with "
             "--multiling, trains a common parser for all languages. "
             "Otherwise, train monolingual parsers for "
             "each")
    parser.add_option(
        "--datadir",
        metavar="PATH",
        help="Input directory with UD train/dev/test files; "
             "obligatory if using --include")
    (options, args) = parser.parse_args()

    # ugly but necessary: these attributes are not exposed as command-line
    # flags here, so they are set by hand before the treebanks are built
    options.shared_task = False
    options.golddir = None
    iso_ids = utils.parse_list_arg(options.include)

    # keep only the (name, iso_id) pairs whose iso id was requested
    iso_dict = utils.load_iso_dict()
    treebank_metadata = [(name, iso_id) for (name, iso_id) in iso_dict.items()
                         if iso_id in iso_ids]

    treebanks = [utils.UDtreebank(ele, options) for ele in treebank_metadata]
    for treebank in treebanks:
        # statistics are computed one treebank at a time
        # (get_stats([treebank]) was the previously used call)
        get_stats_c([treebank])
Пример #3
0
    def __init__(self, options):
        """
        input: parser options
        object to harmonise the way we deal with the parser

        Validates the option combination, creates the output directories and
        builds ``self.languages``, the list of treebank objects to work on.

        Attributes set on self: ``iterations``, ``conllu``, ``languages``.
        Options that are modified: ``modeldir`` (defaults to ``outdir`` when
        predicting), ``rlMostFlag`` (switched off if ``rlFlag`` is set),
        ``drop_nproj`` and ``multi_monoling``.

        Raises Exception when a required option is missing, when the test
        file does not exist, or when a model file cannot be found.
        """

        print 'Using external embedding:', options.external_embedding
        self.deal_with_multiling(options)

        if options.include and not options.datadir:
            raise Exception(
                "You need to specify the data dir to include UD languages")

        # without --include the data files have to be given explicitly
        if not options.predict:
            if not options.include and not options.trainfile:
                raise Exception(
                    "If not using the --include option, you must specify your training data with --trainfile"
                )
        else:
            if not options.include and not options.testfile:
                raise Exception(
                    "If not using the --include option, you must specify your test data with --testfile"
                )
            if not options.modeldir:
                options.modeldir = options.outdir  # set model directory to output directory by default

        if not options.outdir:
            raise Exception(
                "You must specify an output directory via the --outdir option")
        elif not os.path.exists(
                options.outdir):  # create output directory if it doesn't exist
            print "Creating output directory " + options.outdir
            os.mkdir(options.outdir)

        # training needs at least one of the three feature flags
        if not options.predict and not (options.rlFlag or options.rlMostFlag
                                        or options.headFlag):
            raise Exception(
                "Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)"
            )

        if options.rlFlag and options.rlMostFlag:
            print 'Warning: Switching off rlMostFlag to allow rlFlag to take precedence'
            options.rlMostFlag = False

        #TODO: maybe add more sanity checks

        #this is now useless
        options.drop_nproj = False

        options.multi_monoling = False  # set default
        self.iterations = 1  # set default
        self.conllu = True  #default

        if not options.include:  # train/dev/test files must be specified explicitly
            treebank = utils.Treebank(options.trainfile, \
                                           options.devfile, options.testfile)
            treebank.iso_id = None
            treebank.outdir = options.outdir
            treebank.modeldir = options.modeldir
            #just one model specified by train/dev and/or test
            if options.predict:
                # a missing --testfile was already rejected above; the check
                # is repeated here next to the existence test
                if not options.testfile:
                    raise Exception("--testfile must be specified")
                elif not os.path.exists(options.testfile):
                    raise Exception("Test file " + options.testfile +
                                    " not found")
                else:
                    self.conllu = (os.path.splitext(
                        options.testfile.lower())[1] == '.conllu'
                                   )  # test if file in conllu format
                    treebank.test_gold = options.testfile
            else:
                self.prepareDev(treebank, options)
                # the format is taken from the extension of the dev file if
                # there is one, else from the train file when a dev set is
                # to be created; otherwise the default set above is kept
                if options.devfile:
                    self.conllu = (os.path.splitext(
                        options.devfile.lower())[1] == '.conllu')
                elif options.create_dev:
                    self.conllu = (os.path.splitext(
                        options.trainfile.lower())[1] == '.conllu')

            if options.debug:
                self.createDebugData(treebank, options)

            self.languages = [
                treebank
            ]  # make it a list of one element just for the sake of consistency with the "include" case

        else:
            self.conllu = True  # file is in conllu format
            language_list = utils.parse_list_arg(
                options.include
            )  # languages requested by the user via the include flag
            # list of the available treebanks
            json_treebanks = utils.conll_dir_to_list(
                language_list,
                options.datadir,
                options.shared_task,
                options.shared_task_datadir)
            #            self.languages = [lang for lang in json_treebanks if lang.iso_id in language_list]
            # keep the order in which the user requested the languages
            treebank_dict = {lang.iso_id: lang for lang in json_treebanks}
            self.languages = []
            for lang in language_list:
                if lang in treebank_dict:
                    self.languages.append(treebank_dict[lang])
                else:
                    print "Warning: skipping invalid language code " + lang

            if options.multiling:
                if options.predict:
                    model = "%s/%s" % (options.modeldir, options.model)
                    if not os.path.exists(
                            model
                    ):  # multilingual case: the model is looked up directly in options.modeldir
                        raise Exception("Model not found. Path tried: %s" %
                                        model)
                # if options.model_selection: # can only do model selection for monolingual case
                #     print "Warning: model selection on dev data not available for multilingual case"
                #     options.model_selection = False
            else:
                # several languages without --multiling: one iteration per language
                options.multi_monoling = True
                self.iterations = len(self.languages)

            for lang_index in xrange(len(self.languages)):
                language = self.languages[lang_index]

                language.outdir = "%s/%s" % (options.outdir, language.iso_id)
                if not os.path.exists(
                        language.outdir
                ):  # create language-specific output folder if it doesn't exist
                    print "Creating language-specific output directory " + language.outdir
                    os.mkdir(language.outdir)
                else:
                    print("Warning: language-specific subdirectory " +
                          language.outdir +
                          " already exists, contents may be overwritten")

                if not options.predict:
                    self.prepareDev(language, options)

                if options.debug:  # it is important that prepareDev be called before createDebugData
                    self.createDebugData(language, options)

                if options.predict and options.multi_monoling:
                    language.modeldir = "%s/%s" % (options.modeldir,
                                                   language.iso_id)
                    model = "%s/%s" % (language.modeldir, options.model)
                    if not os.path.exists(
                            model
                    ):  # multi-monolingual case: each language needs a model in its own subdirectory
                        if not options.shared_task:
                            raise Exception("Model not found. Path tried: %s" %
                                            model)
                        else:
                            #find model for the language in question
                            # fall back to the model directory of the treebank
                            # whose iso_id equals this language's lcode
                            # NOTE(review): nothing visible here re-checks that
                            # a model exists in the fallback directory - confirm
                            for otherl in json_treebanks:
                                if otherl.lcode == language.lcode:
                                    if otherl.lcode == otherl.iso_id:
                                        language.modeldir = "%s/%s" % (
                                            options.modeldir, otherl.iso_id)
Пример #4
0
    def __init__(self,options):
        """
        input: parser options
        object to harmonise the way we deal with the parser

        Validates the option combination and prepares the data to work on.
        Without ``options.include`` a single ``self.treebank`` is built from
        the train/dev/test files. With it, ``self.languages`` holds the
        requested treebanks, each with its ``outdir`` (created if missing)
        and ``modelDir``.

        Also sets ``self.conllu``, ``self.iterations``,
        ``options.multi_monoling`` and ``options.drop_proj``.

        Raises Exception when ``--include`` is used without a data dir, when
        training with none of the rl / rlmost / head flags, or when
        predicting outside the shared task and a model file is not found.
        """
        # The messages are built from adjacent literals: a backslash
        # continuation inside one literal would embed the indentation of the
        # next source line in the message.
        if options.include and not options.datadir:
            raise Exception("You need to specify the data dir to include UD "
                            "languages")
        #TODO: maybe add more sanity checks
        # rlmost uses the rightmost/leftmost child, rl an element of the
        # sentence. Eli's paper, extended feature set: rightmost and leftmost
        # modifiers of s0, s1 and s2 + leftmost modifier of b0
        if not options.predictFlag and not (options.rlFlag or options.rlMostFlag or options.headFlag):
            raise Exception("You must use either --userlmost or --userl or "
                            "--usehead (you can use multiple)")

        if not options.include:
            #just one model specified by train/dev and/or test
            # the format is read off the extension of the file that will be
            # parsed: the test file when predicting, the dev file otherwise
            if options.predictFlag:
                self.conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
            else:
                self.conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
            self.treebank = utils.Treebank(options.conll_train,
                                           options.conll_dev, options.conll_test)
            self.treebank.iso_id = None

        else:
            self.conllu = True
            language_list = utils.parse_list_arg(options.include)
            json_treebanks = utils.conll_dir_to_list(language_list,options.datadir,options.shared_task,
                                    options.shared_task_datadir)
            self.languages = [lang for lang in json_treebanks if lang.iso_id in language_list]
            for language in self.languages:
                language.removeme = False
                language.outdir= "%s/%s"%(options.output,language.iso_id)
                language.modelDir= "%s/%s"%(options.modelDir,language.iso_id)
                model = "%s/%s"%(language.modelDir,options.model)
                if options.predictFlag and not os.path.exists(model):
                    if not options.shared_task:
                        raise Exception("Model not found. Path tried: %s"%model)
                    #find model for the language in question: use the
                    #directory of the treebank whose iso_id is this lcode
                    for otherl in json_treebanks:
                        if otherl.lcode == language.lcode and otherl.lcode == otherl.iso_id:
                            language.modelDir = "%s/%s"%(options.modelDir,otherl.iso_id)
                            # every match produces the same path
                            break

                if not os.path.exists(language.outdir):
                    os.mkdir(language.outdir)

            # Build a filtered list instead of calling remove() while
            # iterating over the same list, which skips the element that
            # follows each removed one.
            self.languages = [language for language in self.languages
                              if not language.removeme]

        if options.include and not options.multiling:
            # several languages without --multiling: one iteration per language
            options.multi_monoling = True
            self.iterations = len(self.languages)
        else:
            options.multi_monoling = False
            self.iterations = 1
        #this is now useless
        options.drop_proj = False