def process_entries(self, words):
    entry_preprocessor = EntryPreprocessor(self.cfg)
    entries = map(entry_preprocessor.preprocess_entry,
                  (self.raw_dict[word] for word in words))

    stanford_wrapper = StanfordWrapper(self.cfg)
    entries = stanford_wrapper.parse_sentences(entries, definitions=True)

    dependency_processor = DependencyProcessor(self.cfg)
    for entry in entries:
        if entry['to_filter']:
            continue
        word = entry['hw']
        for sense in entry['senses']:
            definition = sense['definition']
            if definition is None:
                continue
            definition['deps'] = dependency_processor.process_dependencies(
                definition['deps'])
        if word in self.dictionary:
            logging.warning(
                "entries with identical headwords:\n{0}\n{1}".format(
                    entry, self.dictionary[word]))
            self.unify(self.dictionary[word], entry)
        else:
            self.dictionary[word] = entry
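# A sketch, inferred only from the accessors in process_entries and the
# dependency-handling code below, of the entry structure being processed;
# all concrete values are made-up placeholders.
example_entry = {
    'hw': 'dog',          # headword
    'to_filter': False,   # entries flagged by preprocessing are skipped
    'senses': [{
        'definition': {
            # dependencies of the parsed definition sentence; 'msd' is
            # optional and a 'lemma' field is added later via setdefault()
            'deps': [
                {'type': 'amod',
                 'gov': {'word': 'animal'},
                 'dep': {'word': 'four-legged'}},
            ],
        },
    }],
}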
def __init__(self, cfg):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    self.out_fn = self.cfg.get("machine", "definitions_binary_out")
    ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.word2lemma = {}
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_only = cfg.getboolean('filter', 'first_only')
def __init__(self, cfg):
    self.cfg = cfg
    self.dfl = DepTo4lang(cfg)
    self.dep_processor = DependencyProcessor(cfg)
    self.vocabulary = {}
    self.words = []
    self.binary_vocab = {}
    self.binary_words = []
    self.coocc = ([], [], [])
    self.zero_array = None
    self.binary_array = None
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_n = cfg.getint('filter', 'first_n')
    self.graph_dir = self.cfg.get('machine', 'graph_dir')
    ensure_dir(self.graph_dir)
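# Hypothetical config sketch collecting every option read by the
# constructors above and the class below; the section and option names are
# taken from the code, all values are placeholders:
#
#   [deps]
#   lang = en
#   dep_map = dep_to_4lang.map
#
#   [machine]
#   definitions_binary = machines/definitions.bin
#   definitions_binary_out = machines/definitions_out.bin
#   graph_dir = graphs
#
#   [filter]
#   first_n = 100
#   first_only = false
#
#   [dict]
#   output_file = dict_entries.json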
# stdlib imports needed by the class below; the project-local names it uses
# (Dependency, DependencyProcessor, Lemmatizer, Lexicon, ensure_dir,
# print_4lang_graphs) are assumed to be provided by the surrounding package.
from collections import defaultdict
import json
import logging
import os
import re
import sys
import traceback


class DepTo4lang():
    """Builds 4lang graphs (machines) from dependency parses, using the
    dependency-to-4lang rules read from the dep_map config file."""

    dep_regex = re.compile(
        r"([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}

    def read_dep_map(self, dep_map_fn):
        # one dependency type may map to several rules
        self.dependencies = defaultdict(list)
        for line in open(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name].append(dep)

    def apply_dep(self, dep, machine1, machine2):
        dep_type = dep['type']
        msd1 = dep['gov'].get('msd')
        msd2 = dep['dep'].get('msd')
        if dep_type not in self.dependencies:
            # warn only once per unknown dependency type
            if dep_type not in self.undefined:
                self.undefined.add(dep_type)
                logging.warning(
                    'skipping dependency not in dep_to_4lang map: {0}'.format(
                        dep_type))
            return False  # not that anyone cares
        for dep_rule in self.dependencies[dep_type]:
            dep_rule.apply(msd1, msd2, machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    # TODO these are words that only have pointers to an MWE
                    # that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    # TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                self.lexicon.add(word, machine)
            except Exception:
                logging.error(u"exception caused by: '{0}'".format(word))
                traceback.print_exc()
                sys.exit(-1)  # abort on first error (skipping was disabled)
        logging.info('added {0}, done!'.format(c + 1))

    def print_graphs(self):
        print_4lang_graphs(
            self.lexicon.ext_lexicon,
            self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        self.lexicon.save_to_binary(self.out_fn)

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_root_lemmas(self, deps):
        return [
            d['dep'].setdefault(
                'lemma', self.lemmatizer.lemmatize(d['dep']['word']))
            for d in deps if d['type'] == 'root']  # TODO

    def get_dep_definition(self, word, deps):
        deps = self.dependency_processor.process_dependencies(deps)
        root_lemmas = self.get_root_lemmas(deps)
        if not root_lemmas:
            logging.warning(
                u'no root dependency, skipping word "{0}"'.format(word))
            return None
        word2machine = self.get_machines_from_deps_and_corefs(
            [deps], [], process_deps=False)
        root_machines = filter(None, map(word2machine.get, root_lemmas))
        if not root_machines:
            logging.info("failed to find root machine")
            logging.info('root lemmas: {0}'.format(root_lemmas))
            logging.info('word2machine: {0}'.format(word2machine))
            sys.exit(-1)
        word_machine = self.lexicon.get_new_machine(word)
        for root_machine in root_machines:
            word_machine.unify(root_machine)
            word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps_and_corefs(
            self, dep_lists, corefs, process_deps=True):
        if process_deps:
            dep_lists = map(
                self.dependency_processor.process_dependencies, dep_lists)

        # index coreference mentions: mention word -> sentence no. -> word
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # cache the lemma of every token, computing it on demand
        word2machine = {}
        for deps in dep_lists:
            for dep in deps:
                for t in (dep['gov'], dep['dep']):
                    self.word2lemma[t['word']] = t.setdefault(
                        'lemma', self.lemmatizer.lemmatize(t['word']))

        for i, deps in enumerate(dep_lists):
            try:
                for dep in deps:
                    word1 = dep['gov']['word']
                    word2 = dep['dep']['word']
                    # replace each word by its canonical coreferent, if any
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    lemma1 = self.word2lemma[c_word1]
                    lemma2 = self.word2lemma[c_word2]
                    # TODO
                    # lemma1 = lemma1.replace('/', '_PER_')
                    # lemma2 = lemma2.replace('/', '_PER_')
                    for lemma in (lemma1, lemma2):
                        if lemma not in word2machine:
                            word2machine[lemma] = \
                                self.lexicon.get_new_machine(lemma)
                    self.apply_dep(
                        dep, word2machine[lemma1], word2machine[lemma2])
            except Exception:
                logging.error(u"failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")
        return word2machine
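# A minimal sketch of the Stanford-style dependency strings that
# DepTo4lang.parse_dependency accepts; both example strings are made up.
dep, gov, dpd = DepTo4lang.parse_dependency("nsubj(eats-2, cat-1)")
assert dep == 'nsubj'
assert gov == ('eats', '2')   # (word, index), indices stay strings
assert dpd == ('cat', '1')
# apostrophes marking token copies in collapsed dependencies also parse:
assert DepTo4lang.parse_dependency("conj_and(ran-3', dog-5)") == \
    ('conj_and', ('ran', '3'), ('dog', '5'))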
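# A hypothetical driver for DepTo4lang, assuming a ConfigParser-style config
# file providing the sections and options sketched above; 'my_config.cfg'
# is a placeholder name, not part of the original code.
from ConfigParser import ConfigParser  # Python 2, as in the code above

cfg = ConfigParser()
cfg.read('my_config.cfg')

dep_to_4lang = DepTo4lang(cfg)
dep_to_4lang.dep_to_4lang()   # build machines from the parsed dictionary
dep_to_4lang.save_machines()  # write the lexicon to definitions_binary_out
dep_to_4lang.print_graphs()   # draw 4lang graphs into machine/graph_dir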