def __init__(self, unsuper_file, semisuper_file, acw):
    self.io = morfessor.MorfessorIO()
    self.acw = acw
    # Both models start from the same unsupervised binary model; the
    # second copy becomes semi-supervised once annotations are added.
    self.unsupervised = self.io.read_binary_model_file(unsuper_file)
    self.semisupervised = self.io.read_binary_model_file(unsuper_file)
    a = self.io.read_annotations_file(semisuper_file)
    annotations = {}
    for word in a:
        # Each hypothesis is a list of morphemes; store it as a
        # space-separated string.
        annotations[word] = [" ".join(hypothesis) for hypothesis in a[word]]
    self.semisupervised.set_annotations(annotations, acw)
    # Write the segmentation hypotheses to disk, one word per line,
    # hypotheses separated by ", ".
    with open('models/bible-segmentation', 'w') as f:
        for word in annotations:
            f.write(", ".join(annotations[word]) + "\n")
    self.segs = self.io.read_segmentation_file('models/bible-segmentation',
                                               has_counts=False)
def da_trainer(datapath):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(datapath))
    # Three frequency-dampening schemes: types (each word counts once),
    # log-dampened token counts, and raw token counts.
    model_types = morfessor.BaselineModel()
    model_logtokens = morfessor.BaselineModel()
    model_tokens = morfessor.BaselineModel()
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    def log_func(x):
        return int(round(math.log(x + 1, 2)))

    model_logtokens.load_data(train_data, count_modifier=log_func)
    model_tokens.load_data(train_data)
    models = [model_types, model_logtokens, model_tokens]
    for i, model in enumerate(models):
        model.train_batch()
        io.write_binary_model_file("model" + str(i), model)
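# Usage sketch for da_trainer (hypothetical corpus path; `morfessor` and
# `math` are assumed to be imported at module level). Trains the three
# dampening variants and writes them to model0, model1 and model2.
def _demo_da_trainer():
    da_trainer('train_corpus.txt')
    # Reload the type-dampened model and segment a word with it.
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file('model0')
    print(model.viterbi_segment('unsupervised')[0])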
def main(d):
    parse_name(d)
    word_count = collections.Counter()
    parent_dir = os.path.dirname(d)
    # Count word frequencies over all xz-compressed corpus files.
    for f in os.listdir(parent_dir):
        if f.endswith(".xz"):
            for line in lzma.open(os.path.join(parent_dir, f), 'rt',
                                  encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8')
        if len(line.strip()) == 1
    }
    model = morfessor.MorfessorIO().read_any_model(os.path.join(d, 'model.bin'))
    s = set()
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                # Replace morphs containing disallowed characters with <UNK>.
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)
    with open(os.path.join(d, 'vocab2'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
def __init__(self, infile, outfile, modelfile, dicfile,
             wixlm="wixgrams.pickle", eslm="esgrams.pickle"):
    # F = open("../corpus/corpus.norm2.wix", "r").read()
    # corpus = F.split()
    # fq = nltk.FreqDist(corpus)
    # print(fq.most_common(100))
    # Collect data for the classification.
    dicwix = open(dicfile, "r").read()
    dic = set(dicwix.split(" \n"))
    self.dicw = list(dic)
    self.F = open(infile, "r")
    self.corp = []
    with open(wixlm, 'rb') as f:
        self.wixngrams = pickle.load(f)
    with open(eslm, 'rb') as f:
        self.esngrams = pickle.load(f)
    self.punct = ".,;:\"{}[]()$%&/¿?¡!-"
    self.io = morfessor.MorfessorIO()
    self.model = self.io.read_binary_model_file(modelfile)
    self.inF = open(infile, "r")
    self.outF = open(outfile, "w")
    # Statistics
    self.nonsegwords = 0
    self.eswords = 0
    self.segwords = 0
def _load_baseline():
    baseline = morfessor.BaselineModel()
    io = morfessor.MorfessorIO(encoding='latin-1')
    baseline.load_segmentations(
        io.read_segmentation_file(REFERENCE_BASELINE_SEGMENTATION))
    return baseline
def main(d):
    parse_name(d)
    word_count = collections.Counter()
    print(d)
    seg_dir = os.path.dirname(d)
    print("seg_dir {}".format(seg_dir))
    for f in os.listdir(seg_dir):
        if f.endswith(".xz"):
            print(f)
            for line in lzma.open(os.path.join(seg_dir, f), 'rt',
                                  encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    model = morfessor.MorfessorIO().read_any_model(os.path.join(d, 'model.bin'))
    s = set()
    with open(os.path.join(d, 'wordmap_all'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)
    with open(os.path.join(d, 'vocab_all'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
def setUp(self):
    self.baseline = _load_baseline()
    self.model = _load_flatcat(self.baseline.get_segmentations(),
                               init='no_emissions')
    io = morfessor.MorfessorIO(encoding='latin-1')
    line_re = re.compile(r'^[0-9]* (.*)')
    separator_re = re.compile(r' \+ ')
    tag_re = re.compile(r'([^/]*)/(.*)')
    self.detagged = []
    self.references = []
    for line in io._read_text_file(REFERENCE_BASELINE_TAGGED):
        m = line_re.match(line)
        if not m:
            continue
        segments = separator_re.split(m.group(1))
        detagged_tmp = []
        ref_tmp = []
        for segment in segments:
            m = tag_re.match(segment)
            assert m, 'Could not parse "%s" in "%s"' % (segment, line)
            ref_tmp.append(flatcat.CategorizedMorph(m.group(1), m.group(2)))
            detagged_tmp.append(m.group(1))
        self.references.append(ref_tmp)
        self.detagged.append(detagged_tmp)
def __init__(self, lang, add_marker=False):
    self.lang = lang
    self.add_marker = add_marker
    io = morfessor.MorfessorIO()
    self._morfessor_model = io.read_any_model(
        common.INDIC_RESOURCES_PATH + '/morph/morfessor/{}.model'.format(lang))
    self._script_range_pat = r'^[{}-{}]+$'.format(
        chr(langinfo.SCRIPT_RANGES[lang][0]),
        chr(langinfo.SCRIPT_RANGES[lang][1]))
    self._script_check_re = re.compile(self._script_range_pat)
def test_data(modelpath, testpath):
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file(modelpath)
    test_data = list(io.read_corpus_file(testpath))
    # read_corpus_file yields (count, atoms) tuples; take every sixth
    # entry and keep only the atoms.
    words = test_data[1:-1:6]
    for index, word in enumerate(words):
        words[index] = word[1]
    for word in words:
        print(model.viterbi_segment(word))
def _config(self):
    self.reference_file = REFERENCE_REESTIMATE_PROBS
    self.baseline = _load_baseline()
    self.model = _load_flatcat(self.baseline.get_segmentations(), init='first')
    self.retagged = []
    io = morfessor.MorfessorIO(encoding='latin-1')
    segmentations = io.read_segmentation_file(
        REFERENCE_BASELINE_SEGMENTATION)
def Base_SegModel(data, average_morph_length):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    baseline_model = morfessor.BaselineModel(corpusweight=1.0)
    # Tune the corpus weight during training so that the average morph
    # length approaches the requested target.
    updater = morfessor.baseline.MorphLengthCorpusWeight(average_morph_length)
    baseline_model.set_corpus_weight_updater(updater)
    # count_modifier=lambda x: 1 gives type-based training: each distinct
    # word counts once, regardless of corpus frequency.
    baseline_model.load_data(train_data, count_modifier=lambda x: 1)
    baseline_model.train_batch()
    return baseline_model
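# Usage sketch (hypothetical file name and target length): train with the
# corpus-weight updater aiming at an average morph length of 5 characters,
# then segment an unseen word.
def _demo_base_segmodel():
    model = Base_SegModel('corpus.txt', 5.0)
    print(model.viterbi_segment('segmentations')[0])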
def main(d):
    freq, alpha, damp = parse_name(d)
    word_count = collections.Counter()
    parent_dir = os.path.dirname(d)
    for f in os.listdir(parent_dir):
        if (f.endswith(".xz") and not f.startswith("dev")
                and not f.startswith("eval") and not f.startswith("test")):
            print("Read {}".format(f), file=sys.stderr)
            for line in lzma.open(os.path.join(parent_dir, f), 'rt',
                                  encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8')
        if len(line.strip()) == 1
    }
    model = morfessor.BaselineModel(corpusweight=alpha)
    assert damp in {'types', 'tokens', 'logtokens'}
    damp_func = None
    if damp == 'types':
        damp_func = lambda x: 1
    elif damp == 'logtokens':
        damp_func = lambda x: int(round(math.log(x + 1, 2)))
    data = [(v, k) for k, v in word_count.items()
            if all(c in allowed_chars for c in k)]
    model.load_data(data, freq, damp_func)
    model.train_batch()
    io = morfessor.MorfessorIO()
    io.write_binary_model_file(os.path.join(d, 'model.bin'), model)
    io.write_segmentation_file(os.path.join(d, 'model.txt'),
                               model.get_segmentations())
    s = set()
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)
    with open(os.path.join(d, 'vocab'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
def load_morfessor_model(path):
    import morfessor
    # The model is stored as the first member of a tar archive; extract it
    # to a temporary file so MorfessorIO can read it from disk.
    s = tarfile.open(path)
    file_handler = s.extractfile(s.next())
    tmp_file_ = NamedTemporaryFile(delete=False)
    tmp_file_.write(file_handler.read())
    tmp_file_.close()
    io = morfessor.MorfessorIO()
    model = io.read_any_model(tmp_file_.name)
    os.remove(tmp_file_.name)
    return model
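# Usage sketch (hypothetical archive name; assumes the tar archive's first
# member is a Morfessor model readable by read_any_model):
def _demo_load_from_tar():
    model = load_morfessor_model('en_morph.tar.bz2')
    print(model.viterbi_segment('rebuilding')[0])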
def Base_SegModel(data, corpusweight):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    # Type-based model: every distinct word counts once.
    model_types = morfessor.BaselineModel(corpusweight=corpusweight)
    model_types.load_data(train_data, count_modifier=lambda x: 1)
    model_types.train_batch()
    # Token-based model: raw corpus frequencies.
    model_tokens = morfessor.BaselineModel()
    model_tokens.load_data(train_data)
    model_tokens.train_batch()
    return model_types, model_tokens
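# Usage sketch (hypothetical corpus path) comparing the two dampening
# schemes on the same word; token-based counts tend to keep frequent words
# unsegmented relative to type-based training.
def _demo_compare_dampening():
    model_types, model_tokens = Base_SegModel('corpus.txt', 1.0)
    print(model_types.viterbi_segment('foundations')[0])
    print(model_tokens.viterbi_segment('foundations')[0])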
def __init__(self):
    DIR = dirname(__file__)
    morfessor_file = join(DIR, 'data/finnsyll-morfessor.bin')
    ngram_file = join(DIR, 'data/finnsyll-ngrams.pickle')
    io = morfessor.MorfessorIO()
    self.model = io.read_binary_model_file(morfessor_file)
    self.constraints = CONSTRAINTS
    self.constraint_count = len(CONSTRAINTS)
    with open(ngram_file, 'rb') as f:
        self.ngrams, self.vocab, self.total = pickle.load(f)
def test_model(model, gold_standard_file):
    # load IO object
    morf_io = morfessor.MorfessorIO()
    # load gold standard annotations file
    gold_standard = morf_io.read_annotations_file(gold_standard_file)
    # build evaluator object and run evaluation against gold standard
    evaluator = morfessor.MorfessorEvaluation(gold_standard)
    results = evaluator.evaluate_model(model)
    return results
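# Usage sketch (hypothetical file names): evaluate a pickled model against
# gold-standard annotations; printing the result shows the aggregate
# precision/recall/F-score statistics.
def _demo_test_model():
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file('model.bin')
    print(test_model(model, 'goldstd.annotations'))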
def _morfessor_iterator_from_list(sentences):
    """Turn the list into the kind of iterator that Morfessor expects.

    :param sentences: a list of sentences, where each sentence is a list
        of words
    :return: an iterator over (count, atoms) pairs, with a (0, ())
        boundary marker after each sentence
    """
    io = morfessor.MorfessorIO()
    for sentence in sentences:
        sentence_string = ' '.join(sentence)
        for compound in io.compound_sep_re.split(sentence_string):
            if len(compound) > 0:
                yield 1, io._split_atoms(compound)
        yield 0, ()
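# Usage sketch: train a baseline model directly from in-memory sentences,
# without first writing them to a corpus file.
def _demo_iterator_training():
    sentences = [['the', 'dogs', 'barked'], ['the', 'cats', 'slept']]
    model = morfessor.BaselineModel()
    model.load_data(list(_morfessor_iterator_from_list(sentences)))
    model.train_batch()
    print(model.viterbi_segment('dogs')[0])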
def main(allowed_chars_file, model):
    allowed_chars = {
        line.strip()
        for line in open(allowed_chars_file, encoding='utf-8')
        if len(line.strip()) == 1
    }
    model = morfessor.MorfessorIO().read_any_model(model)
    for line in sys.stdin:
        word = line.strip()
        parts = model.viterbi_segment(word)[0]
        print(word, end=' ')
        print(" ".join(parts).replace("<unk>", "<UNK>"))
def load_morfessor_model(lang="en", version="2"):
    """Return a morfessor model for `lang` and of version `version`.

    Args:
        lang (string): language code.
        version (string): version of the parameters to be used.
    """
    src_dir = "morph{}".format(version)
    p = locate_resource(src_dir, lang)
    file_handler = _open(p)
    # Extract the packaged model to a temporary file so MorfessorIO can
    # read it from disk.
    tmp_file_ = NamedTemporaryFile(delete=False)
    tmp_file_.write(file_handler.read())
    tmp_file_.close()
    io = morfessor.MorfessorIO()
    model = io.read_any_model(tmp_file_.name)
    os.remove(tmp_file_.name)
    return model
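# Usage sketch (assumes the packaged resources for the language code have
# already been downloaded, so locate_resource can find them):
def _demo_load_packaged_model():
    model = load_morfessor_model(lang='en')
    print(model.viterbi_segment('independently')[0])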
def train_seg(infile, outfile):
    io = morfessor.MorfessorIO()
    print("Open corpus file")
    train_data = list(io.read_corpus_file(infile))
    # Type-based training: each distinct word counts once.
    model_types = morfessor.BaselineModel()
    model_types.load_data(train_data, count_modifier=lambda x: 1)
    print("Training data...")
    model_types.train_batch()
    print("Write bin file")
    io.write_binary_model_file(outfile, model_types)
def evaluate(gold_data, morf, acw, segs):
    print(str(acw) + " weighted annotated corpus")
    io = morfessor.MorfessorIO()
    gold = io.read_annotations_file(gold_data)
    ev = morfessor.MorfessorEvaluation(gold)
    models = [morf.unsupervised, morf.semisupervised]
    # Evaluate both models on 10 samples of 25 words each, then test
    # whether the difference is statistically significant.
    config = morfessor.evaluation.EvaluationConfig(10, 25)
    results = [ev.evaluate_model(m, config) for m in models]
    print("model evaluation")
    print(results[0])
    print(results[1])
    wsr = morfessor.evaluation.WilcoxonSignedRank()
    r = wsr.significance_test(results)
    wsr.print_table(r)
def load_varembed_format(cls, vectors, morfessor_model=None):
    """Load the word vectors into matrix from the varembed output vector files.

    Using morphemes requires Python 2.7 or above.

    'vectors' is the pickle file containing the word vectors.
    'morfessor_model' is the path to the trained morfessor model.
    """
    result = cls()
    if vectors is None:
        raise Exception("Please provide vectors binary to load varembed model")
    D = utils.unpickle(vectors)
    word_to_ix = D['word_to_ix']
    morpho_to_ix = D['morpho_to_ix']
    word_embeddings = D['word_embeddings']
    morpho_embeddings = D['morpheme_embeddings']
    result.load_word_embeddings(word_embeddings, word_to_ix)
    if morfessor_model:
        # Morfessor is only supported for Python 2.7 and above.
        if sys.version_info >= (2, 7):
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO(
                ).read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(
                    morfessor_model, morpho_embeddings, morpho_to_ix)
            except ImportError:
                # Morfessor package not found.
                logger.error('Could not import morfessor. '
                             'Not using morpheme embeddings')
                raise ImportError('Could not import morfessor.')
        else:
            # Raise an exception on Python 2.6 or earlier.
            raise Exception('Using morphemes requires Python 2.7 or above. '
                            'Morfessor is not supported on Python 2.6.')
    logger.info('Loaded varembed model vectors from %s', vectors)
    return result
def load_varembed_format(cls, vectors, morfessor_model=None):
    """Load the word vectors into matrix from the varembed output vector files.

    Parameters
    ----------
    vectors : dict
        Pickle file containing the word vectors.
    morfessor_model : str, optional
        Path to the trained morfessor model.

    Returns
    -------
    :class:`~gensim.models.wrappers.varembed.VarEmbed`
        Ready to use instance.
    """
    result = cls()
    if vectors is None:
        raise Exception("Please provide vectors binary to load varembed model")
    d = utils.unpickle(vectors)
    word_to_ix = d['word_to_ix']
    morpho_to_ix = d['morpho_to_ix']
    word_embeddings = d['word_embeddings']
    morpho_embeddings = d['morpheme_embeddings']
    result.load_word_embeddings(word_embeddings, word_to_ix)
    if morfessor_model:
        try:
            import morfessor
            morfessor_model = morfessor.MorfessorIO(
            ).read_binary_model_file(morfessor_model)
            result.add_morphemes_to_embeddings(
                morfessor_model, morpho_embeddings, morpho_to_ix)
        except ImportError:
            # Morfessor package not found.
            logger.error('Could not import morfessor. '
                         'Not using morpheme embeddings')
            raise ImportError('Could not import morfessor.')
    logger.info('Loaded varembed model vectors from %s', vectors)
    return result
def train_model(input_file, output_file=None):
    # setup input and model objects
    morf_io = morfessor.MorfessorIO()
    morf_model = morfessor.BaselineModel()
    # build a corpus from input file
    train_data = morf_io.read_corpus_file(input_file)
    # load data into model
    # optional param "count_modifier" can set frequency dampening;
    # default is each token counts
    morf_model.load_data(train_data)
    # train the model in batch form (online training also available)
    morf_model.train_batch()
    # optionally pickle model
    if output_file is not None:
        morf_io.write_binary_model_file(output_file, morf_model)
    return morf_model
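# Usage sketch (hypothetical paths): train on a raw text corpus, persist
# the model, and segment an unseen word with the trained model.
def _demo_train_model():
    model = train_model('corpus.txt', output_file='model.bin')
    print(model.viterbi_segment('retraining')[0])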
def train_morfessor(corpus, split_prob):
    """Train a Morfessor Baseline model.

    Lowercase the input text; use random skips for frequently seen
    compounds to speed up training; initialize new words by random
    splitting with probability split_prob.
    """
    io = morfessor.MorfessorIO(compound_separator=r"[^-\w]+", lowercase=True)
    train_data = list(
        io.read_corpus_file(os.path.join('data', 'corpora', corpus)))
    model_tokens = morfessor.BaselineModel(use_skips=True)
    model_tokens.load_data(train_data, init_rand_split=split_prob)
    model_tokens.train_batch()
    io.write_binary_model_file(
        os.path.join('data', 'models', corpus[:-4] + '_morph'), model_tokens)
    return model_tokens
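# Usage sketch (hypothetical corpus name; assumes data/corpora/europarl.txt
# exists and data/models is writable; the 4-character extension is stripped
# when naming the saved model):
def _demo_train_morfessor():
    model = train_morfessor('europarl.txt', 0.2)
    print(model.viterbi_segment('lowercased')[0])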
def main(oov_file, btype, model):
    parent_dir = os.path.dirname(oov_file)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8')
        if len(line.strip()) == 1
    }
    model = morfessor.MorfessorIO().read_any_model(model)
    # Boundary marker style: word marker ("wma"), prefix ("pre"),
    # suffix ("suf"), or affix markers on both sides ("aff").
    between = " "
    prefix = ""
    suffix = ""
    assert btype in {"aff", "wma", "suf", "pre"}
    if btype == "wma":
        between = " <w> "
    if btype == "pre" or btype == "aff":
        prefix = "+"
    if btype == "suf" or btype == "aff":
        suffix = "+"
    for line in open(oov_file, encoding='utf-8'):
        word = line.strip()
        parts = model.viterbi_segment(word)[0]
        rparts = []
        for p in parts:
            if not all(c in allowed_chars for c in p):
                p = '<UNK>'
            rparts.append(p)
        # The separator is suffix + between + prefix, so "wma" inserts the
        # "<w>" marker between morphs; <UNK> morphs never carry markers.
        print("{}{}{}".format(suffix, between, prefix).join(rparts)
              .replace("+<unk>", "<unk>").replace("<unk>+", "<unk>")
              .replace("+<UNK>", "<UNK>").replace("<UNK>+", "<UNK>")
              .replace("<unk>", "<UNK>"))
def main(model, toplist, origlex, jointlex, wordmap, morphsep):
    io = morfessor.MorfessorIO()
    model = io.read_binary_model_file(model)
    jointlex = read_lex(jointlex)
    origlex = read_lex(origlex)
    new_lex = {}
    tof = open('tmpout', 'w', encoding='utf-8')
    counts = [0] * 100
    for k, v in origlex.items():
        counts[len(v)] += 1
        if len(v) == 2:
            print(k, file=tof)
    tof.close()
    for i, c in enumerate(counts):
        print("{} times {} transcriptions".format(c, i))
    jcounts = [0] * 100
    for k, v in jointlex.items():
        jcounts[len(v)] += 1
    for i, c in enumerate(jcounts):
        print("{} times {} morphtranscriptions".format(c, i))
    # First pass: words whose single target transcription can be matched
    # by composing morph transcriptions of the 1-best segmentation.
    words_todo = []
    for word in toplist:
        word = word.strip().split()[0]
        segm = model.viterbi_segment(word)[0]
        if any(p not in jointlex for p in segm):
            print("{}\tUNK".format(word), file=wordmap)
            continue
        if word not in origlex:
            words_todo.append(word)
            continue
        target_trans = origlex[word]
        if len(target_trans) > 1:
            words_todo.append(word)
            continue
        target_idx = None
        for idx in itertools.product(
                *[list(range(len(jointlex[p]))) for p in segm]):
            trans = []
            for p, i in zip(segm, idx):
                trans.extend(jointlex[p][i])
            if tuple(trans) == target_trans[0]:
                target_idx = idx
                break
        if target_idx is not None:
            nsegm = ["{}#{}".format(p, i) for p, i in zip(segm, target_idx)]
            print("{}\t{}".format(word, morphsep.join(nsegm)), file=wordmap)
        else:
            words_todo.append(word)
    print("Still to do {} words".format(len(words_todo)))
    # Second pass: try the 10-best segmentations and allow multiple
    # transcriptions per word.
    words = words_todo
    words_todo = []
    for word in words:
        word_done = False
        if word not in origlex:
            words_todo.append(word)
            continue
        for segme in model.viterbi_nbest(word, 10):
            segm = segme[0]
            if any(p not in jointlex for p in segm):
                continue
            target_trans = origlex[word]
            trans_left = list(target_trans)
            target_idxs = []
            for idx in itertools.product(
                    *[list(range(len(jointlex[p]))) for p in segm]):
                trans = []
                for p, i in zip(segm, idx):
                    trans.extend(jointlex[p][i])
                for ti in range(len(trans_left)):
                    if tuple(trans) == trans_left[ti]:
                        target_idxs.append(idx)
                        del trans_left[ti]
                        break
            if len(target_idxs) > 0:
                if len(target_idxs) != len(target_trans):
                    print("WARNING: {} has fewer phone transcriptions "
                          "than expected".format(word))
                    continue
                target_idx = [{*k} for k in zip(*target_idxs)]
                nsegm = [
                    "{}#{}".format(p, ",".join(str(a) for a in sorted(i)))
                    for p, i in zip(segm, target_idx)
                ]
                print("{}\t{}".format(word, morphsep.join(nsegm)),
                      file=wordmap)
                word_done = True
                break
        if not word_done:
            words_todo.append(word)
    print("Still to do {} words".format(len(words_todo)))
    # Fallback: emit the 1-best segmentation without transcription indices.
    for word in words_todo:
        segm = model.viterbi_segment(word)[0]
        print("{}\t{}".format(word, morphsep.join(segm)), file=wordmap)
    3: 'I-ORG',
    4: 'B-PRO',
    5: 'B-PER',
    6: 'I-PER',
    7: 'I-PRO',
    8: 'B-LOC',
    9: 'B-DATE',
    10: 'B-EVENT',
    11: 'I-LOC',
    12: 'I-EVENT',
    13: 'I-DATE'
}
num_tags = len(idx2tag) + 1
whole_data_path = document_path
target_data = load_data(document_path)
io = morfessor.MorfessorIO()
print('Loading embeddings...')
#embeddings = gensim.models.fasttext.load_facebook_vectors('data/embeddings/cc.fi.300.bin')
embeddings = fasttext.load_model('data/embeddings/cc.fi.300.bin')
print('Finished loading embeddings')
# load the morfessor model
morfessor_model = io.read_binary_model_file(
    'utils/subword_segmentation/output/model/morfessor_0.1.bin')
whole_data = load_data(whole_data_path)
# segment data into morphs
whole_data_morphs = []
def setUp(self):
    self.perplexities = dict()
    self.condprobs = dict()
    self.posteriors = dict()
    self.transitions = dict()
    catpriors_tmp = dict()
    self._config()
    self.comments_io = morfessor.MorfessorIO(encoding='latin-1',
                                             comment_start='++++++++++')
    pattern_float = r'([0-9.]+)'
    pattern_int = r'([0-9]+)'
    pattern_quoted = r'"([^"]*)"'
    ppl_re = re.compile(r'^#Features\(' + pattern_quoted + r'\)\s+' +
                        pattern_float + r'\s+' + pattern_float + r'\s+' +
                        pattern_int)
    condprobs_re = re.compile(r'^#P\(Tag\|' + pattern_quoted + r'\)\s+' +
                              pattern_float + r'\s+' + pattern_float +
                              r'\s+' + pattern_float + r'\s+' +
                              pattern_float)
    catpriors_re = re.compile(r'^#PTag\(' + pattern_quoted + r'\)\s+' +
                              pattern_float)
    posteriors_re = re.compile(r'^(\S*)\s+' + pattern_float + r'\s+' +
                               pattern_float + r'\s+' + pattern_float +
                               r'\s+' + pattern_float)
    transitions_re = re.compile(r'^P\((\S+) .. ([^\)]+)\) = ' +
                                pattern_float + r' \(N = ' + pattern_int +
                                r'\)')
    for line in self.comments_io._read_text_file(self.reference_file):
        m = ppl_re.match(line)
        if m:
            self.perplexities[m.group(1)] = (float(m.group(2)),
                                             float(m.group(3)),
                                             int(m.group(4)))
            continue
        m = condprobs_re.match(line)
        if m:
            self.condprobs[m.group(1)] = (float(m.group(2)),
                                          float(m.group(3)),
                                          float(m.group(4)),
                                          float(m.group(5)))
            continue
        m = catpriors_re.match(line)
        if m:
            catpriors_tmp[m.group(1)] = float(m.group(2))
            continue
        m = posteriors_re.match(line)
        if m:
            self.posteriors[m.group(1)] = flatcat.ByCategory(
                float(m.group(2)), float(m.group(3)),
                float(m.group(4)), float(m.group(5)))
            continue
        m = transitions_re.match(line)
        if m:
            def _tr_wb(x):
                if x == '#':
                    return flatcat.FlatcatModel.word_boundary
                return x

            cats = tuple([_tr_wb(x) for x in (m.group(1), m.group(2))])
            self.transitions[cats] = (float(m.group(3)), int(m.group(4)))
    self.catpriors = flatcat.ByCategory(
        *(catpriors_tmp[x] for x in self.model.get_categories()))
parser.add_argument('--coarse_atoms', action='store_true')
args = parser.parse_args()

numKnownUnambigous = 0
numKnownAmbigous = 0
numUnknown = 0
freqKnownUnambigous = 0
freqKnownAmbigous = 0
freqUnknown = 0

morfessorModel = None
if args.morfessor_model:
    if args.coarse_atoms:
        io = morfessor.MorfessorIO(atom_separator="■")
    else:
        io = morfessor.MorfessorIO()
    morfessorModel = io.read_binary_model_file(args.morfessor_model)

# Load vocabulary: we need frequencies for statistics
freqs = defaultdict(int)
with open(args.vocabulary) as freqs_f:
    for line in freqs_f:
        line = line.rstrip("\n")
        parts = line.split(" ")
        freqs[parts[1]] = int(parts[0])

stopwords = set()