def process_file(essay_lst, x):
    index = 1
    for entry in essay_lst:
        id = entry[0]
        essay = entry[1]

        # create tagger
        tagger = freeling.hmm_tagger(lpath + "tagger.dat", True, 2)
        # create sense annotator
        sen = freeling.senses(lpath + "senses.dat")
        # create sense disambiguator
        wsd = freeling.ukb(lpath + "ukb.dat")
        # create dependency parser
        parser = freeling.chart_parser(lpath + "/chunker/grammar-chunk.dat")
        dep = freeling.dep_txala(lpath + "/dep_txala/dependences.dat", parser.get_start_symbol())

        # tokenize input line into a list of words
        lw = tk.tokenize(essay)
        # split list of words in sentences, return list of sentences
        ls = sp.split(lw)

        # perform morphosyntactic analysis and disambiguation
        ls = morfo.analyze(ls)
        ls = tagger.analyze(ls)
        # annotate and disambiguate senses
        ls = sen.analyze(ls)
        ls = wsd.analyze(ls)
        # parse sentences
        ls = parser.analyze(ls)
        ls = dep.analyze(ls)

        # do whatever is needed with processed sentences
        if x == 2:
            essays_vacation_tagged.append((id, ProcessSentences(ls)))
        elif x == 3:
            essays_famous_tagged.append((id, ProcessSentences(ls)))

        print(index)
        index += 1
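For reference, process_file expects each entry of essay_lst as an (id, text) pair and relies on the module-level FreeLing objects (lpath, tk, sp, morfo) and the two output lists already being defined. A minimal, hypothetical driver, with made-up essay ids and texts, might look like this:

# Hypothetical driver: the essay ids and texts are illustrative only.
essays_vacation_tagged = []
essays_famous_tagged = []

vacation_essays = [
    ("essay_001", "Last summer I travelled to the coast with my family."),
    ("essay_002", "My favourite vacation was a hiking trip in the mountains."),
]

# x == 2 stores results in essays_vacation_tagged, x == 3 in essays_famous_tagged.
process_file(vacation_essays, 2)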
def __init__(self):
    freeling.util_init_locale("default")
    self.lang = "en"
    self.ipath = "/usr/local"
    self.lpath = self.ipath + "/share/freeling/" + self.lang + "/"
    self.tk = freeling.tokenizer(self.lpath + "tokenizer.dat")
    self.sp = freeling.splitter(self.lpath + "splitter.dat")

    # create the analyzer with the required set of maco_options
    self.morfo = freeling.maco(self.my_maco_options(self.lang, self.lpath))
    # then, (de)activate required modules
    self.morfo.set_active_options(
        False,  # UserMap
        False,  # NumbersDetection
        True,   # PunctuationDetection
        False,  # DatesDetection
        True,   # DictionarySearch
        True,   # AffixAnalysis
        False,  # CompoundAnalysis
        True,   # RetokContractions
        False,  # MultiwordsDetection
        True,   # NERecognition
        False,  # QuantitiesDetection
        True)   # ProbabilityAssignment

    # create tagger
    self.tagger = freeling.hmm_tagger(self.lpath + "tagger.dat", True, 2)
    # create sense annotator
    self.sen = freeling.senses(self.lpath + "senses.dat")
    # create sense disambiguator
    self.wsd = freeling.ukb(self.lpath + "ukb.dat")
    # create dependency parser
    self.parser = freeling.dep_treeler(self.lpath + "dep_treeler/dependences.dat")
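The components created in this constructor are meant to be chained in the usual FreeLing order: tokenize, split, morphological analysis, PoS tagging, sense annotation, sense disambiguation, dependency parsing. A minimal sketch of such a method, assuming it lives on the same class (the name tag_text is ours, not part of the original code), could be:

def tag_text(self, text):
    # Hypothetical helper: run the pipeline built in __init__ over a raw string.
    lw = self.tk.tokenize(text)      # tokenize into a list of words
    ls = self.sp.split(lw)           # split words into sentences
    ls = self.morfo.analyze(ls)      # morphological analysis
    ls = self.tagger.analyze(ls)     # PoS tagging
    ls = self.sen.analyze(ls)        # sense annotation
    ls = self.wsd.analyze(ls)        # sense disambiguation (UKB)
    ls = self.parser.analyze(ls)     # dependency parsing
    return ls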
        True,   # DatesDetection
        True,   # DictionarySearch
        True,   # AffixAnalysis
        False,  # CompoundAnalysis
        True,   # RetokContractions
        True,   # MultiwordsDetection
        True,   # NERecognition
        False,  # QuantitiesDetection
        True)   # ProbabilityAssignment

# create tagger
tagger = pyfreeling.hmm_tagger(lpath + "tagger.dat", True, 2)
# create sense annotator
sen = pyfreeling.senses(lpath + "senses.dat")
# create sense disambiguator
wsd = pyfreeling.ukb(lpath + "ukb.dat")
# create dependency parser
parser = pyfreeling.dep_treeler(lpath + "treeler/dependences.dat")

# process input text
text = "".join(sys.stdin.readlines())

# tokenize input line into a list of words
lw = tk.tokenize(text)
# split list of words in sentences, return list of sentences
ls = sp.split(lw)

# perform morphosyntactic analysis and disambiguation
ls = morfo.analyze(ls)
    False, True, True, True,   # select which among created
    True, True, True, True,    # submodules are to be used.
    False, False, True, True)  # default: all created submodules are used

# create tagger, sense annotator, and parsers
tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
parser = pyfreeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
dep = pyfreeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat", parser.get_start_symbol())


def distribution(hospital_files_names):
    dist_hospitals = {}
    for name, files in hospital_files_names.items():
        if name != 'all':
            dist_hospitals[name] = len(files)
    return dist_hospitals


def file_size(hospitals_files_name, dir_annotated_corpora):
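A quick illustration of distribution on a hypothetical mapping from hospital names to lists of annotated files (the names and file lists are made up): the special 'all' key is skipped, and every other hospital is mapped to its file count.

# Hypothetical input: hospital name -> list of annotated files.
hospital_files = {
    "hospital_a": ["a_001.txt", "a_002.txt"],
    "hospital_b": ["b_001.txt"],
    "all": ["a_001.txt", "a_002.txt", "b_001.txt"],
}
print(distribution(hospital_files))  # {'hospital_a': 2, 'hospital_b': 1}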
DATA + LANG + "/probabilitats.dat"); # create analyzers tk=pyfreeling.tokenizer(DATA+LANG+"/tokenizer.dat"); sp=pyfreeling.splitter(DATA+LANG+"/splitter.dat"); sid=sp.open_session(); mf=pyfreeling.maco(op); # activate mmorpho odules to be used in next call mf.set_active_options(False, True, True, True, # select which among created True, True, False, True, # submodules are to be used. True, True, True, True ); # default: all created submodules are used # create tagger, sense anotator, and parsers tg=pyfreeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2); sen=pyfreeling.senses(DATA+LANG+"/senses.dat"); parser= pyfreeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat"); dep=pyfreeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol()); # process input text lin=sys.stdin.readline(); print ("Text language is: "+la.identify_language(lin)+"\n"); while (lin) : l = tk.tokenize(lin); ls = sp.split(sid,l,False); ls = mf.analyze(ls); ls = tg.analyze(ls);
def set_up_analyzer():
    # Check whether we know where to find FreeLing data files
    if "FREELINGDIR" not in os.environ:
        if sys.platform == "win32" or sys.platform == "win64":
            os.environ["FREELINGDIR"] = "C:\\Program Files"
        else:
            os.environ["FREELINGDIR"] = "/usr/local"
        # print("FREELINGDIR environment variable not defined, trying ", os.environ["FREELINGDIR"], file=sys.stderr)

    if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
        # print("Folder", os.environ["FREELINGDIR"] + "/share/freeling",
        #       "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
        #       file=sys.stderr)
        sys.exit(1)

    # Location of FreeLing configuration files.
    DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

    # Init locales
    pyfreeling.util_init_locale("default")

    # Create language detector. Used just to show it: results are printed
    # but ignored (afterwards, the language is assumed to be LANG).
    la = pyfreeling.lang_ident(DATA + "common/lang_ident/ident-few.dat")

    # Create options set for maco analyzer. Default values are OK, except for data files.
    LANG = "ru"
    op = pyfreeling.maco_options(LANG)
    op.set_data_files("", DATA + "common/punct.dat",
                      DATA + LANG + "/dicc.src",
                      DATA + LANG + "/afixos.dat",
                      "",
                      DATA + LANG + "/locucions.dat",
                      DATA + LANG + "/np.dat",
                      DATA + LANG + "/quantities.dat",
                      DATA + LANG + "/probabilitats.dat")

    # Create analyzers
    tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
    sid = sp.open_session()

    mf = pyfreeling.maco(op)
    # Activate morphological modules to be used in next call
    mf.set_active_options(
        False, True, True, True,  # select which among created
        True, True, False, True,  # submodules are to be used.
        True, True, True, True)   # default: all created submodules are used

    # Create tagger, sense annotator, and parsers
    tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
    # parser = pyfreeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    # dep = pyfreeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat", parser.get_start_symbol())

    return (tk, sp, sid, mf, tg, sen)
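A sketch of how the objects returned by set_up_analyzer could be driven over standard input, reusing the call sequence from the session-based loop in the previous listing; closing the splitter session at the end is our addition and assumes the close_session method shown in the FreeLing sample scripts:

# Minimal usage sketch, assuming set_up_analyzer() as defined above.
import sys

tk, sp, sid, mf, tg, sen = set_up_analyzer()

lin = sys.stdin.readline()
while lin:
    l = tk.tokenize(lin)
    ls = sp.split(sid, l, False)
    ls = mf.analyze(ls)
    ls = tg.analyze(ls)
    ls = sen.analyze(ls)
    lin = sys.stdin.readline()

sp.close_session(sid)  # assumed available, as in the FreeLing sample scripts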