def load(filename: str, lexicon: Lexicon, rule_set: RuleSet) -> 'EdgeSet':
    # Each row of the file is a (source, target, rule) triple; resolve the
    # words against the lexicon and the rule against the rule set.
    edge_iter = (GraphEdge(lexicon[source], lexicon[target], rule_set[rule])
                 for source, target, rule in read_tsv_file(filename))
    return EdgeSet(lexicon, edge_iter)
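# A minimal usage sketch, assuming load() is exposed as EdgeSet's static
# loader (as its signature suggests); 'edges.txt', 'lexicon' and 'rule_set'
# are stand-ins for whatever the surrounding code actually provides:
#
#     edges = EdgeSet.load('edges.txt', lexicon, rule_set)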
def load(filename: str, rule_set: RuleSet) -> 'SimpleEdgeModel':
    result = SimpleEdgeModel(rule_set)
    probs = np.zeros(len(rule_set))
    # Each row contains a rule string and its probability; store the
    # probability at the rule's ID within the rule set.
    for rule, prob in read_tsv_file(filename, (str, float)):
        r_id = rule_set.get_id(rule_set[rule])
        probs[r_id] = prob
    result.set_probs(probs)
    return result
def contract_graph(graph_file: str) -> None:
    '''Remove the additional information that was only needed for filtering,
       leaving bare (word_1, word_2, rule) triples.'''
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Contracting the graph...')
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            # drop the frequency column
            write_line(graph_tmp_fp, (w1, w2, rule))
    rename_file(graph_file + '.tmp', graph_file)
def load(filename: str) -> 'Lexicon':

    def _parse_entry_from_row(row: List[str], use_restr=False,
                              use_freq=False, use_vec=False,
                              vec_sep=' ', vec_dim=None) -> LexiconEntry:
        my_row = list(row)    # copy because it will be destroyed
        word = my_row.pop(0)
        kwargs = {}
        if use_restr:
            # 'L' = possible edge source, 'R' = possible edge target
            restr = my_row.pop(0).strip()
            kwargs['is_possible_edge_source'] = 'L' in restr
            kwargs['is_possible_edge_target'] = 'R' in restr
        if use_freq:
            kwargs['freq'] = int(my_row.pop(0).strip())
        if use_vec:
            vec_str = my_row.pop(0).strip()
            kwargs['vec'] = \
                np.array(list(map(float, vec_str.split(vec_sep))))
            if kwargs['vec'].shape[0] != vec_dim:
                raise Exception('%s dim=%d' %
                                (word, kwargs['vec'].shape[0]))
        return LexiconEntry(word, **kwargs)

    lexicon = Lexicon()
    # determine the file format from the configuration
    use_restr = \
        shared.config['General'].getboolean('use_edge_restrictions')
    use_freq = \
        shared.config['Models'].get('root_frequency_model') != 'none' or \
        shared.config['Models'].get('edge_frequency_model') != 'none'
    use_vec = \
        shared.config['Models'].get('root_feature_model') != 'none' or \
        shared.config['Models'].get('edge_feature_model') != 'none'
    supervised = shared.config['General'].getboolean('supervised')
    vec_sep = shared.format['vector_sep']
    vec_dim = shared.config['Features'].getint('word_vec_dim')
    kwargs = {'use_restr': use_restr, 'use_freq': use_freq,
              'use_vec': use_vec, 'vec_sep': vec_sep, 'vec_dim': vec_dim}
    items_to_add = []
    for row in read_tsv_file(filename):
        try:
            if supervised:
                row.pop(0)    # the first item is the base/lemma -> ignore
            entry = _parse_entry_from_row(row, **kwargs)
            items_to_add.append(entry)
        except Exception as e:
            logging.getLogger('main').warning('ignoring %s: %s' %
                                              (row[0], str(e)))
    lexicon.add(items_to_add)
    return lexicon
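# For reference, the row layout the loader above expects; bracketed columns
# appear only when the corresponding config flag enables them, and the
# concrete values below are made up for illustration:
#
#     [base]  word  [restrictions]  [freq]  [vector]
#
# e.g. unsupervised, with use_freq and use_vec (vec_dim=3, vec_sep=' '):
#
#     walking<TAB>42<TAB>0.1 0.2 0.3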
def load_raw_vocabulary(filename: str) -> Lexicon:
    lexicon = Lexicon()
    for (word, ) in read_tsv_file(filename):
        try:
            lexicon.add(LexiconEntry(word))
        except Exception as e:
            logging.getLogger('main').warning('ignoring %s: %s' %
                                              (word, str(e)))
    return lexicon
def load(filename: str) -> 'SimpleTagModel':
    result = SimpleTagModel()
    for tag_str, prob in read_tsv_file(filename, (str, float)):
        if tag_str:
            tag = tuple(shared.compiled_patterns['tag'].findall(tag_str))
            result.probs[tag] = prob
        else:
            # an empty tag string marks the smoothing probability
            result.smoothing_prob = prob
    return result
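# Illustration of the key construction above, assuming the compiled 'tag'
# pattern matches angle-bracketed items (the pattern below is a hypothetical
# stand-in for shared.compiled_patterns['tag']):
#
#     import re
#     tag_pattern = re.compile(r'<[^>]+>')
#     assert tuple(tag_pattern.findall('<N><pl>')) == ('<N>', '<pl>')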
def load(filename: str) -> 'UnigramRuleModel':
    result = UnigramRuleModel()
    result.probs = {}
    for row in read_tsv_file(filename):
        if len(row) == 2:
            result.probs[row[0]] = float(row[1])
        elif len(row) == 3:
            result.probs[(row[0], row[1])] = float(row[2])
        else:
            logging.getLogger('main').warning(
                'Cannot parse row: {} in {}'.format(str(row), filename))
    return result
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if word_1:
                try:
                    n1, n2 = lexicon[word_1], lexicon[word_2]
                    for rule in extract_all_rules(n1, n2):
                        write_line(fp, (str(n1), str(n2), str(rule)))
                except KeyError:
                    # report whichever of the two words is missing
                    for word in (word_1, word_2):
                        if word not in lexicon:
                            logging.getLogger('main').warning(
                                '%s not in lexicon' % word)
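# The training file read above is a two-column TSV of (word_1, word_2)
# pairs, and rows with an empty first column are skipped; example rows
# (the word pairs are made up for illustration):
#
#     walk<TAB>walking
#     carry<TAB>carried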
def load_rules() -> List[Tuple[Rule, float]]:
    if shared.config['compile'].getboolean('weighted'):
        if shared.config['Models'].get('edge_model') != 'simple':
            raise Exception('Compiling a weighted analyzer is only possible'
                            ' for the Bernoulli edge model.')
        rules_filename = shared.filenames['edge-model']
        max_cost = None \
                   if shared.config['compile'].get('max_cost') == 'none' \
                   else shared.config['compile'].getfloat('max_cost')
        # a rule's cost is the negative log of its productivity;
        # the identity rule ':/:___:' is always included with zero cost
        rules = [(Rule.from_string(rule), -math.log(prod))
                 for rule, prod in read_tsv_file(rules_filename, (str, float))
                 if max_cost is None or -math.log(prod) < max_cost] + \
                [(Rule.from_string(':/:___:'), 0.0)]
        return rules
    else:
        rules_filename = shared.filenames['rules-modsel']
        if not file_exists(rules_filename):
            rules_filename = shared.filenames['rules']
        return [(Rule.from_string(rule), 0.0)
                for (rule,) in read_tsv_file(rules_filename, (str,))] + \
               [(Rule.from_string(':/:___:'), 0.0)]
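# A quick sanity check of the cost formula used above (pure arithmetic, no
# project code involved): cost = -log(productivity), so a rule applied in
# half of its eligible contexts costs about 0.693; max_cost = 0.5 would
# filter it out, while max_cost = 1.0 would keep it:
#
#     import math
#     assert abs(-math.log(0.5) - 0.693) < 0.001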
def load_graph(filename, lexicon, threshold=0.0):
    edge_set = EdgeSet(lexicon)
    weights = []
    rules = {}    # cache: rule string -> Rule object
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            edge_freq = float(edge_freq_str)
            if edge_freq < threshold:
                continue
            if rule_str not in rules:
                rules[rule_str] = Rule.from_string(rule_str)
            edge = GraphEdge(lexicon[word_1], lexicon[word_2],
                             rules[rule_str], weight=edge_freq)
            edge_set.add(edge)
            weights.append(edge_freq)
        except ValueError:
            # skip rows whose frequency column is not a number
            pass
    return FullGraph(lexicon, edge_set), np.array(weights)
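# A minimal usage sketch (the file name and threshold are made up; the
# lexicon is assumed to be loaded as shown earlier):
#
#     graph, weights = load_graph('graph-filtered.txt', lexicon,
#                                 threshold=0.1)
#     # weights[i] is the frequency of the i-th edge accepted above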
def run():
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    lexicon_tr = FST.load_transducer(shared.filenames['lexicon-tr'])
    rules_tr = FST.load_transducer(shared.filenames['rules-tr'])
    rules_tr.convert(hfst.ImplementationType.HFST_OLW_TYPE)
    alphabet = lexicon_tr.get_alphabet()
    model = ModelSuite.load()
    max_results = shared.config['inflect'].getint('max_results')
    if shared.options['interactive']:
        # interactive mode: read one "lemma tag" pair per line from stdin
        for line in sys.stdin:
            try:
                lemma_str, tag = line.rstrip().split()
                lemma = LexiconEntry(lemma_str)
                for analysis in inflect_word(lemma, tag, rules_tr, model,
                                             max_results=max_results):
                    print(*analysis, sep='\t')
            except Exception as e:
                logging.getLogger('main').warning(e)
    else:
        pairs = []
        # FIXME is there a better solution for creating lists of LexiconEntry
        # objects and skipping the ones for which exceptions are thrown?
        for lemma, tag in read_tsv_file(shared.filenames['analyze.wordlist']):
            try:
                pairs.append((LexiconEntry(lemma), tag))
            except Exception as e:
                logging.getLogger('main').warning(e)
        for lemma, tag in tqdm.tqdm(pairs):
            for analysis in inflect_word(lemma, tag, rules_tr, model,
                                         max_results=max_results):
                print(*analysis, sep='\t')
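# In interactive mode, run() expects one whitespace-separated "lemma tag"
# pair per line on stdin, e.g. (lemma and tag values made up; the concrete
# tag format depends on the trained model):
#
#     walk <V><pp>
#     carry <V><pl>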
def load(filename: str) -> 'NGramFeatureExtractor':
    result = NGramFeatureExtractor()
    result.ngrams = [ngram for (ngram, ) in read_tsv_file(filename)]
    # map each n-gram to its position in the feature vector
    result.feature_idx = \
        {ngram: i for i, ngram in enumerate(result.ngrams)}
    return result
def load(filename: str) -> 'UnigramRootModel':
    result = UnigramRootModel()
    for sym, prob in read_tsv_file(filename, types=(str, float)):
        result.probs[sym] = prob
    return result
def root_reader():
    # words are expected in the first column of the wordlist file
    col = 0
    for row in read_tsv_file(shared.filenames['wordlist']):
        if col < len(row) and row[col]:
            yield row[col]
def load_normalized_wordlist(filename: str) -> Iterable[str]:
    results = []
    for (word, ) in read_tsv_file(filename):
        results.append(LexiconEntry(word).normalized)
    return results