def create_vanila_treebank_list(self, options):
    """
    Create the list of vanilla (i.e. non-UD) treebanks.

    Currently only one treebank is supported, so the list will always have
    one element. This is for consistency with the UD treebanks case, where
    multi-monolingual experiments are allowed.

    Side effects:
        Sets ``options.conllu`` to whether the relevant file has a
        ``.conllu`` extension: the test file when predicting, otherwise the
        dev file, or the train file when ``options.create_dev`` is set.
        When training with neither a dev file nor ``create_dev``,
        ``options.conllu`` is left untouched.

    Returns:
        A one-element list holding the treebank built from
        ``options.trainfile``, ``options.devfile`` and ``options.testfile``.

    Raises:
        Exception: when predicting and ``options.testfile`` is not given or
            does not exist on disk.
    """
    if options.predict:
        # Fail fast with a readable message; without the first check a
        # missing --testfile surfaces as a TypeError from os.path.exists /
        # string concatenation instead.
        if not options.testfile:
            raise Exception("--testfile must be specified")
        if not os.path.exists(options.testfile):
            raise Exception("Test file " + options.testfile + " not found")

    treebank = utils.Treebank(options.trainfile, options.devfile,
                              options.testfile)
    treebank.iso_id = None
    treebank.outdir = options.outdir
    treebank.modeldir = options.modeldir

    # just one model specified by train/dev and/or test
    if options.predict:
        # test if file in conllu format
        options.conllu = (
            os.path.splitext(options.testfile.lower())[1] == '.conllu')
        treebank.test_gold = options.testfile
    else:
        self.prepareDev(treebank, options)
        if options.devfile:
            options.conllu = (
                os.path.splitext(options.devfile.lower())[1] == '.conllu')
        elif options.create_dev:
            options.conllu = (
                os.path.splitext(options.trainfile.lower())[1] == '.conllu')

    # NOTE(review): the original layout was flattened; this is assumed to
    # apply in both training and prediction mode -- confirm.
    if options.debug:
        self.createDebugData(treebank, options)

    # make it a list of one element just for the sake of consistency with
    # the "include" case
    return [treebank]
def __init__(self, options):
    """
    input: parser options
    object to harmonise the way we deal with the parser

    Validates the options, normalises some of them in place
    (``modeldir``, ``rlMostFlag``, ``drop_nproj``, ``multi_monoling``) and
    sets ``self.languages`` (list of treebanks to process),
    ``self.iterations`` and ``self.conllu``.

    Two modes are handled:
      * no ``--include``: a single treebank built from the explicit
        train/dev/test files;
      * ``--include``: one treebank per requested language code, each with
        its own output sub-directory under ``options.outdir``.

    Raises:
        Exception: when a required option is missing, when the test file
            does not exist, or when a model needed for prediction cannot
            be found.
    """
    # Single-argument print(...) produces the same output as the Python 2
    # print statement and is also valid Python 3.
    print('%s %s' % ('Using external embedding:', options.external_embedding))

    self.deal_with_multiling(options)

    # ---- sanity checks on the options --------------------------------
    if options.include and not options.datadir:
        raise Exception(
            "You need to specify the data dir to include UD languages")

    if not options.predict:
        if not options.include and not options.trainfile:
            raise Exception(
                "If not using the --include option, you must specify your training data with --trainfile"
            )
    else:
        if not options.include and not options.testfile:
            raise Exception(
                "If not using the --include option, you must specify your test data with --testfile"
            )

    # set model directory to output directory by default
    # NOTE(review): the flattened source does not show whether this default
    # was nested under the prediction branch above; it is applied in both
    # modes here -- confirm.
    if not options.modeldir:
        options.modeldir = options.outdir

    if not options.outdir:
        raise Exception(
            "You must specify an output directory via the --outdir option")
    if not os.path.exists(options.outdir):
        # create output directory if it doesn't exist; makedirs also
        # creates missing parent directories, where mkdir would fail
        print("Creating output directory " + options.outdir)
        os.makedirs(options.outdir)

    if not options.predict and not (options.rlFlag or options.rlMostFlag
                                    or options.headFlag):
        raise Exception(
            "Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)"
        )

    if options.rlFlag and options.rlMostFlag:
        print('Warning: Switching off rlMostFlag to allow rlFlag to take precedence')
        options.rlMostFlag = False

    #TODO: maybe add more sanity checks

    #this is now useless
    options.drop_nproj = False

    # ---- defaults, overridden below where needed ---------------------
    options.multi_monoling = False
    self.iterations = 1
    self.conllu = True

    if not options.include:
        # must specify explicitly train
        treebank = utils.Treebank(options.trainfile, options.devfile,
                                  options.testfile)
        treebank.iso_id = None
        treebank.outdir = options.outdir
        treebank.modeldir = options.modeldir

        # just one model specified by train/dev and/or test
        if options.predict:
            if not options.testfile:
                raise Exception("--testfile must be specified")
            elif not os.path.exists(options.testfile):
                raise Exception(
                    "Test file " + options.testfile + " not found")
            else:
                # test if file in conllu format
                self.conllu = (os.path.splitext(
                    options.testfile.lower())[1] == '.conllu')
                treebank.test_gold = options.testfile
        else:
            self.prepareDev(treebank, options)
            if options.devfile:
                self.conllu = (os.path.splitext(
                    options.devfile.lower())[1] == '.conllu')
            elif options.create_dev:
                self.conllu = (os.path.splitext(
                    options.trainfile.lower())[1] == '.conllu')

        # NOTE(review): assumed to run in both modes, mirroring the
        # per-language loop below -- confirm against the unflattened file.
        if options.debug:
            self.createDebugData(treebank, options)

        # make it a list of one element just for the sake of consistency
        # with the "include" case
        self.languages = [treebank]

    else:
        self.conllu = True  # file is in conllu format

        # languages requested by the user via the include flag
        language_list = utils.parse_list_arg(options.include)
        # list of the available treebanks
        json_treebanks = utils.conll_dir_to_list(
            language_list, options.datadir, options.shared_task,
            options.shared_task_datadir)

        # Build the list by walking language_list so that treebanks come
        # out in the order the codes were requested; unknown codes are
        # skipped with a warning.
        treebank_dict = {lang.iso_id: lang for lang in json_treebanks}
        self.languages = []
        for lang in language_list:
            if lang in treebank_dict:
                self.languages.append(treebank_dict[lang])
            else:
                print("Warning: skipping invalid language code " + lang)

        if options.multiling:
            if options.predict:
                model = "%s/%s" % (options.modeldir, options.model)
                # in multilingual case need model to be found in first
                # language specified
                if not os.path.exists(model):
                    raise Exception("Model not found. Path tried: %s" % model)
            # (A warning that switched off options.model_selection for the
            # multilingual case was commented out in the original.)
        else:
            options.multi_monoling = True
            self.iterations = len(self.languages)

        for language in self.languages:
            language.outdir = "%s/%s" % (options.outdir, language.iso_id)
            if not os.path.exists(language.outdir):
                # create language-specific output folder if it doesn't exist
                print("Creating language-specific output directory " +
                      language.outdir)
                os.mkdir(language.outdir)
            else:
                print("Warning: language-specific subdirectory " +
                      language.outdir +
                      " already exists, contents may be overwritten")

            if not options.predict:
                self.prepareDev(language, options)

            # it is important that prepareDev be called before
            # createDebugData
            if options.debug:
                self.createDebugData(language, options)

            if options.predict and options.multi_monoling:
                language.modeldir = "%s/%s" % (options.modeldir,
                                               language.iso_id)
                model = "%s/%s" % (language.modeldir, options.model)
                if not os.path.exists(model):
                    if not options.shared_task:
                        raise Exception(
                            "Model not found. Path tried: %s" % model)
                    else:
                        # find model for the language in question: fall
                        # back to the treebank whose iso_id equals the
                        # shared language code
                        for otherl in json_treebanks:
                            if otherl.lcode == language.lcode:
                                if otherl.lcode == otherl.iso_id:
                                    language.modeldir = "%s/%s" % (
                                        options.modeldir, otherl.iso_id)
def __init__(self,options):
    """
    input: parser options
    object to harmonise the way we deal with the parser

    Checks the options, then either builds a single treebank from the
    explicit train/dev/test files (stored in ``self.treebank``) or, with
    ``--include``, builds ``self.languages`` with per-language output and
    model directories. Also sets ``self.conllu``, ``self.iterations``,
    ``options.multi_monoling`` and ``options.drop_proj``.

    Raises:
        Exception: when ``--include`` is used without a data dir, when no
            feature flag is enabled for training, or when a model needed
            for prediction is not found (outside shared-task mode).
    """
    # Adjacent string literals are used for the long messages below: a
    # backslash continuation inside the quotes would leak into the message.
    if options.include and not options.datadir:
        raise Exception("You need to specify the data dir to include UD "
                        "languages")

    #TODO: maybe add more sanity checks
    if not options.predictFlag and not (options.rlFlag or options.rlMostFlag
                                        or options.headFlag):
        raise Exception("You must use either --userlmost or --userl or "
                        "--usehead (you can use multiple)")
    #the diff between two is one is r/l/most child / the other is
    #element in the sentence
    #Eli's paper:
    #extended feature set
    # rightmost and leftmost modifiers of s0, s1 and s2 + leftmost
    # modifier of b0

    if not options.include:
        #just one model specified by train/dev and/or test
        if options.predictFlag:
            self.conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
        else:
            self.conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
        self.treebank = utils.Treebank(options.conll_train,
                                       options.conll_dev, options.conll_test)
        self.treebank.iso_id = None

    else:
        self.conllu = True
        language_list = utils.parse_list_arg(options.include)
        json_treebanks = utils.conll_dir_to_list(language_list,
                                                 options.datadir,
                                                 options.shared_task,
                                                 options.shared_task_datadir)
        self.languages = [lang for lang in json_treebanks
                          if lang.iso_id in language_list]

        for language in self.languages:
            language.removeme = False
            language.outdir = "%s/%s" % (options.output, language.iso_id)
            language.modelDir = "%s/%s" % (options.modelDir, language.iso_id)
            model = "%s/%s" % (language.modelDir, options.model)
            if options.predictFlag and not os.path.exists(model):
                if not options.shared_task:
                    raise Exception("Model not found. Path tried: %s" % model)
                else:
                    #find model for the language in question: fall back
                    #to the treebank whose iso_id equals the shared
                    #language code
                    for otherl in json_treebanks:
                        if otherl.lcode == language.lcode:
                            if otherl.lcode == otherl.iso_id:
                                language.modelDir = "%s/%s" % (
                                    options.modelDir, otherl.iso_id)

            if not os.path.exists(language.outdir):
                os.mkdir(language.outdir)

        # Rebuild the list instead of calling remove() while iterating
        # over it, which would skip the element following each removal.
        self.languages = [language for language in self.languages
                          if not language.removeme]

    if options.include and not options.multiling:
        options.multi_monoling = True
        self.iterations = len(self.languages)
    else:
        options.multi_monoling = False
        self.iterations = 1

    #this is now useless
    options.drop_proj = False