Example #1
    def process_entries(self, words):
        # preprocess the raw dictionary entries for the given headwords
        entry_preprocessor = EntryPreprocessor(self.cfg)
        entries = map(entry_preprocessor.preprocess_entry,
                      (self.raw_dict[word] for word in words))

        # parse the definition sentences with the Stanford parser
        stanford_wrapper = StanfordWrapper(self.cfg)
        entries = stanford_wrapper.parse_sentences(entries, definitions=True)

        dependency_processor = DependencyProcessor(self.cfg)

        for entry in entries:
            if entry['to_filter']:
                continue
            word = entry['hw']
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue
                definition['deps'] = dependency_processor.process_dependencies(
                    definition['deps'])

            if word in self.dictionary:
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))
                # merge the two entries for the shared headword
                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry
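The loop above relies on a particular entry layout. Below is a sketch of that structure, inferred from the fields the code reads; the values are made up for illustration and do not come from a real dictionary:

# Shape of one entry as process_entries expects it (inferred from the
# field accesses above; values are illustrative placeholders).
entry = {
    'hw': 'dog',             # headword
    'to_filter': False,      # True means the whole entry is skipped
    'senses': [
        # 'definition' may also be None, in which case the sense is skipped
        {'definition': {'deps': []}},  # 'deps' holds parsed dependencies
    ],
}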
Example #2
 def __init__(self, cfg):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.out_fn = self.cfg.get("machine", "definitions_binary_out")
     ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.read_dep_map(dep_map_fn)
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.word2lemma = {}
Example #3
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_only = cfg.getboolean('filter', 'first_only')
Example #4
 def __init__(self, cfg):
     self.cfg = cfg
     self.dfl = DepTo4lang(cfg)
     self.dep_processor = DependencyProcessor(cfg)
     self.vocabulary = {}
     self.words = []
     self.binary_vocab = {}
     self.binary_words = []
     self.coocc = [], [], []
     self.zero_array = None
     self.binary_array = None
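`self.coocc = [], [], []` binds a tuple of three empty lists. A plausible reading, which the snippet itself does not confirm, is that they are parallel row/column/value lists for a sparse co-occurrence matrix; a toy sketch of that pattern with scipy:

# Toy sketch: three parallel lists feeding scipy's COO constructor
# (an assumption about the intent of self.coocc, not shown above).
from scipy.sparse import coo_matrix

rows, cols, data = [0, 0, 1], [1, 2, 2], [3, 1, 2]
matrix = coo_matrix((data, (rows, cols)), shape=(3, 3))
print matrix.toarray()  # 3x3 dense view of the co-occurrence counts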
Example #5
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_n = cfg.getint('filter', 'first_n')
     self.graph_dir = self.cfg.get('machine', 'graph_dir')
     ensure_dir(self.graph_dir)
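All of the constructors above read their settings from the same ConfigParser-style object. Here is a minimal sketch of that config; the section and option names are taken from the cfg calls in the examples, while the values are hypothetical placeholders:

# Minimal config these constructors expect; option names come from the
# cfg.get() calls above, the values are placeholders.
from ConfigParser import ConfigParser  # Python 2, as in the examples

cfg = ConfigParser()
cfg.add_section('deps')
cfg.set('deps', 'lang', 'en')
cfg.set('deps', 'dep_map', 'dep_to_4lang.txt')       # placeholder path
cfg.add_section('machine')
cfg.set('machine', 'definitions_binary', 'defs.bin')  # placeholder path
cfg.set('machine', 'definitions_binary_out', 'defs_out.bin')
cfg.set('machine', 'graph_dir', 'graphs')            # Example #5 and below
cfg.add_section('filter')
cfg.set('filter', 'first_only', 'true')              # Example #3 only
cfg.set('filter', 'first_n', '100')                  # Example #5 only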
Example #6
class DepTo4lang():

    dep_regex = re.compile(r"([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}

    def read_dep_map(self, dep_map_fn):
        # one Dependency rule per non-empty, non-comment line of the map file
        self.dependencies = defaultdict(list)
        for line in open(dep_map_fn):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            dep = Dependency.create_from_line(line)
            self.dependencies[dep.name].append(dep)

    def apply_dep(self, dep, machine1, machine2):
        dep_type = dep['type']
        msd1 = dep['gov'].get('msd')
        msd2 = dep['dep'].get('msd')
        if dep_type not in self.dependencies:
            if dep_type not in self.undefined:
                self.undefined.add(dep_type)
                logging.warning(
                    'skipping dependency not in dep_to_4lang map: {0}'.format(
                        dep_type))
            return False  # unknown dependency type, nothing was applied
        # one dependency type may map to several rules; apply each of them
        for dep_rule in self.dependencies[dep_type]:
            dep_rule.apply(msd1, msd2, machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    #  TODO these are words that only have pointers to an MWE
                    #  that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    #  TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue

                # logging.info('adding: {0}'.format(word))
                # logging.info('ext_lex_keys: {0}'.format(
                # self.lexicon.ext_lexicon.keys()))
                self.lexicon.add(word, machine)
            except Exception:
                logging.error(u"exception caused by: '{0}'".format(word))
                # logging.error(
                #     u'skipping "{0}" because of an exception:'.format(
                #         word))
                # logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                sys.exit(-1)

        logging.info('added {0}, done!'.format(c + 1))

    def print_graphs(self):
        print_4lang_graphs(self.lexicon.ext_lexicon,
                           self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        self.lexicon.save_to_binary(self.out_fn)

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_root_lemmas(self, deps):
        return [
            d['dep'].setdefault('lemma',
                                self.lemmatizer.lemmatize(d['dep']['word']))
            for d in deps if d['type'] == 'root'
        ]  # TODO

    def get_dep_definition(self, word, deps):
        deps = self.dependency_processor.process_dependencies(deps)
        root_lemmas = self.get_root_lemmas(deps)
        if not root_lemmas:
            logging.warning(
                u'no root dependency, skipping word "{0}"'.format(word))
            return None

        word2machine = self.get_machines_from_deps_and_corefs(
            [deps], [], process_deps=False)

        root_machines = filter(None, map(word2machine.get, root_lemmas))
        if not root_machines:
            logging.info("failed to find root machine")
            logging.info('root lemmas: {0}'.format(root_lemmas))
            logging.info('word2machine: {0}'.format(word2machine))
            sys.exit(-1)

        word_machine = self.lexicon.get_new_machine(word)

        for root_machine in root_machines:
            word_machine.unify(root_machine)
            word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps_and_corefs(self,
                                          dep_lists,
                                          corefs,
                                          process_deps=True):
        if process_deps:
            dep_lists = map(self.dependency_processor.process_dependencies,
                            dep_lists)
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        word2machine = {}
        for deps in dep_lists:
            for dep in deps:
                for t in (dep['gov'], dep['dep']):
                    self.word2lemma[t['word']] = t.setdefault(
                        'lemma', self.lemmatizer.lemmatize(t['word']))

        for i, deps in enumerate(dep_lists):
            try:
                for dep in deps:
                    word1 = dep['gov']['word']
                    word2 = dep['dep']['word']
                    # logging.info('dep: {0}, w1: {1}, w2: {2}'.format(
                    #     repr(dep), repr(word1), repr(word2)))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    lemma1 = self.word2lemma[c_word1]
                    lemma2 = self.word2lemma[c_word2]

                    # TODO
                    # lemma1 = lemma1.replace('/', '_PER_')
                    # lemma2 = lemma2.replace('/', '_PER_')

                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(
                    #         repr(lemma1), repr(lemma2)))

                    for lemma in (lemma1, lemma2):
                        if lemma not in word2machine:
                            word2machine[lemma] = self.lexicon.get_new_machine(
                                lemma)

                    self.apply_dep(dep, word2machine[lemma1],
                                   word2machine[lemma2])

            except Exception:
                logging.error(u"failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine
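Finally, a short hypothetical driver for the class above, using only the methods it defines; `cfg` is assumed to be set up as in the config sketch earlier. The dependency string passed to `parse_dependency` is a made-up example in the Stanford `rel(gov-i, dep-j)` format that `dep_regex` expects:

# Hypothetical end-to-end use of DepTo4lang, based on the methods above.
converter = DepTo4lang(cfg)
converter.dep_to_4lang()   # build a machine for each dictionary headword
converter.save_machines()  # pickle the lexicon to definitions_binary_out
converter.print_graphs()   # one graph per headword, under machine/graph_dir

# parse_dependency splits a textual Stanford dependency into its parts:
print DepTo4lang.parse_dependency("nsubj(loves-2, John-1)")
# -> ('nsubj', ('loves', '2'), ('John', '1'))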