def process_entries(self, words):
    """Preprocess, parse and store the raw entries of the given headwords.

    Each word is looked up in ``self.raw_dict``, run through
    ``EntryPreprocessor.preprocess_entry`` and then handed to
    ``StanfordWrapper.parse_sentences`` (with ``definitions=True``).
    For every resulting entry whose ``'to_filter'`` value is falsy, the
    ``'deps'`` of each non-``None`` sense definition are replaced by the
    output of ``DependencyProcessor.process_dependencies``; the entry is
    then stored in ``self.dictionary`` under its ``'hw'`` value.  If that
    headword is already present, a warning is logged and the stored entry
    is combined with the new one via ``self.unify``.

    Args:
        words: iterable of keys into ``self.raw_dict``.
    """
    preprocess = EntryPreprocessor(self.cfg).preprocess_entry
    raw_entries = (self.raw_dict[word] for word in words)
    preprocessed = map(preprocess, raw_entries)

    parser = StanfordWrapper(self.cfg)
    parsed_entries = parser.parse_sentences(preprocessed, definitions=True)

    dep_proc = DependencyProcessor(self.cfg)

    for entry in parsed_entries:
        if entry['to_filter']:
            continue

        headword = entry['hw']

        # Post-process the dependencies of every sense that has a definition.
        for sense in entry['senses']:
            definition = sense['definition']
            if definition is not None:
                definition['deps'] = dep_proc.process_dependencies(
                    definition['deps'])

        if headword not in self.dictionary:
            self.dictionary[headword] = entry
            continue

        # Headword seen before: report it and merge into the stored entry.
        existing = self.dictionary[headword]
        logging.warning(
            "entries with identical headwords:\n{0}\n{1}".format(
                entry, existing))
        self.unify(existing, entry)
def __init__(self, cfg):
    """Store the configuration and initialise empty bookkeeping state.

    Args:
        cfg: configuration object; kept on the instance and passed to the
            ``DepTo4lang`` and ``DependencyProcessor`` constructors.
    """
    self.cfg = cfg

    # Helper objects, both built from the same configuration.
    self.dfl = DepTo4lang(cfg)
    self.dep_processor = DependencyProcessor(cfg)

    # Vocabulary and word list, plus their "binary" counterparts,
    # all start out empty.
    self.vocabulary, self.words = {}, []
    self.binary_vocab, self.binary_words = {}, []

    # Three independent, initially empty lists kept together in a tuple.
    self.coocc = ([], [], [])

    # Arrays are not created here.
    self.zero_array = self.binary_array = None
def __init__(self, cfg):
    """Read settings from ``cfg`` and load the resources this object uses.

    Reads ``deps/lang``, ``deps/dep_map``, ``machine/definitions_binary_out``
    and ``machine/definitions_binary`` from the configuration, makes sure
    the directory of the output file exists (via ``ensure_dir``), reads
    the dependency map, and loads the lexicon with
    ``Lexicon.load_from_binary``.

    Args:
        cfg: configuration object exposing ``get(section, option)``.
    """
    self.cfg = cfg
    self.lang = cfg.get("deps", "lang")

    # Output file: remember its path and make sure its directory exists.
    binary_out_path = cfg.get("machine", "definitions_binary_out")
    self.out_fn = binary_out_path
    ensure_dir(os.path.dirname(binary_out_path))

    self.dependency_processor = DependencyProcessor(cfg)

    # The dependency map is read before the remaining state is set up.
    self.read_dep_map(cfg.get("deps", "dep_map"))

    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)

    # Lexicon: remember the path it was loaded from.
    lexicon_path = cfg.get("machine", "definitions_binary")
    self.lexicon_fn = lexicon_path
    self.lexicon = Lexicon.load_from_binary(lexicon_path)

    self.word2lemma = {}
def __init__(self, cfg, direct_parse=False):
    """Read settings from ``cfg`` and load the resources this object uses.

    Args:
        cfg: configuration object exposing ``get(section, option)`` and
            ``getboolean(section, option)``.
        direct_parse: when true, ``self.out_fn`` is not set and no output
            directory is created; otherwise the
            ``machine/definitions_binary_out`` path is stored and its
            directory is ensured to exist.
    """
    self.cfg = cfg
    self.lang = cfg.get("deps", "lang")

    if not direct_parse:
        # Output file: remember its path and make sure its directory exists.
        binary_out_path = cfg.get("machine", "definitions_binary_out")
        self.out_fn = binary_out_path
        ensure_dir(os.path.dirname(binary_out_path))

    self.dependency_processor = DependencyProcessor(cfg)

    # The path is fetched here but the map is only read after the lexicon
    # has been loaded (see the read_dep_map call below).
    dep_map_path = cfg.get("deps", "dep_map")

    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)

    lexicon_path = cfg.get("machine", "definitions_binary")
    self.lexicon_fn = lexicon_path
    self.lexicon = Lexicon.load_from_binary(lexicon_path)

    self.read_dep_map(dep_map_path)

    self.word2lemma = {}
    self.first_only = cfg.getboolean('filter', 'first_only')