Example #1
    def index_raw_forms(self):
        for letter in string.ascii_lowercase:
            print('Indexing %s...' % letter)
            blocks = []
            for entry in entry_iterator(letters=letter):
                # Skip proper names, over-long lemmas, and entries whose
                # date range ends before the cutoff date
                if (entry.date().end < ENTRY_MINIMUM_END_DATE or
                        entry.primary_wordclass() in ('NP', 'NPS') or
                        len(entry.lemma) > MAX_WORDLENGTH):
                    continue
                entry_type = entry.oed_entry_type()
                if entry_type is None:
                    continue
                seen = set()
                for block in entry.wordclass_sets():
                    # Check that this block is in OED, and does not shadow
                    #  something already covered (as e.g. vast adv. shadows
                    #  vast adj.).
                    refentry, refid = block.link(target='oed', asTuple=True)
                    if not refentry or (refentry, refid) in seen:
                        continue
                    block_data = _store_forms(block, entry, entry_type, letter)
                    if block_data.standard_types:
                        blocks.append(block_data)
                    seen.add((refentry, refid))

            out_file = os.path.join(FORM_INDEX_DIR, 'raw', letter)
            with open(out_file, 'wb') as filehandle:
                for block in blocks:
                    pickle.dump(block, filehandle)
Example #2
    def index_proper_names(self):
        allnames = set()
        # Collect single-word first names, surnames, and place names
        for name_type in ('firstname', 'surname', 'placename'):
            for name in propernames.names_list(name_type):
                if ' ' in name:
                    continue
                allnames.add(name)

        for letter in string.ascii_lowercase:
            print('Indexing proper names in %s...' % letter)
            for entry in entry_iterator(letters=letter):
                # Only proper-name entries (primary wordclass NP or NPS)
                if entry.primary_wordclass() not in ('NP', 'NPS'):
                    continue
                for typeunit in entry.types():
                    # Keep single-word, capitalized forms only
                    if (' ' in typeunit.form or
                            typeunit.lemma_manager().capitalization_type() != 'capitalized'):
                        continue
                    allnames.add(typeunit.form)

        # Write one tab-separated line per name: sortable form, the name
        # itself, and the propernames.is_common() flag
        out_file = os.path.join(FORM_INDEX_DIR, 'proper_names', 'all.txt')
        with open(out_file, 'w') as filehandle:
            for name in allnames:
                sortable = stringtools.lexical_sort(name)
                if (not sortable or
                        len(sortable) > MAX_WORDLENGTH or
                        len(name) > MAX_WORDLENGTH):
                    continue
                filehandle.write('%s\t%s\t%s\n' % (sortable,
                                                   name,
                                                   str(propernames.is_common(name))))
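
The example above writes its results as a plain tab-separated file with three
columns per line: the sortable form, the name itself, and the string produced
by str(propernames.is_common(name)). A minimal sketch of reading that file back
is shown below; the function name load_proper_names and the form_index_dir
parameter are illustrative, not part of the original code.

import os


def load_proper_names(form_index_dir):
    # Hypothetical reader for the 'proper_names/all.txt' file written by
    # index_proper_names() above: one tab-separated line per name.
    names = {}
    in_file = os.path.join(form_index_dir, 'proper_names', 'all.txt')
    with open(in_file, 'r') as filehandle:
        for line in filehandle:
            sortable, name, is_common = line.rstrip('\n').split('\t')
            # is_common was written via str(), so it is the text 'True'/'False'
            names[name] = (sortable, is_common == 'True')
    return names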
Example #3
def index_raw_forms():
    """
    Build an index of all OED lemmas and their various inflections/variants.
    Based on GEL.
    Outputs a series of pickle files (one file per letter).
    """
    for letter in string.ascii_lowercase:
        stdout.write('Indexing %s...\n' % letter)
        blocks = []
        for entry in entry_iterator(letters=letter):
            # Skip proper names, over-long lemmas, and entries whose
            # date range ends before the cutoff date
            if (entry.date().end < ENTRY_MINIMUM_END_DATE or
                    entry.primary_wordclass() in ('NP', 'NPS') or
                    len(entry.lemma) > MAX_WORDLENGTH):
                continue

            entry_type = entry.oed_entry_type()
            if entry_type is None:
                continue
            seen = set()
            for block in entry.wordclass_sets():
                # Check that this block is in OED, and does not shadow
                #  something already covered within this entry
                # (as e.g. _vast_ adv. shadows _vast_ adj.).
                refentry, refid = block.link(target='oed', asTuple=True)
                if not refentry or (refentry, refid) in seen:
                    continue
                block_data = _store_forms(block, entry, entry_type, letter)
                if block_data.standard_types:
                    blocks.append(block_data)
                seen.add((refentry, refid))

        out_file = os.path.join(FORM_INDEX_DIR, 'raw', letter)
        with open(out_file, 'wb') as filehandle:
            for block in blocks:
                pickle.dump(block, filehandle)
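
Since index_raw_forms() appends one pickled block object after another to each
per-letter file, reading a file back means calling pickle.load() repeatedly
until the end of the stream. A minimal sketch under that assumption follows;
the function name iter_raw_blocks and the form_index_dir parameter are
illustrative, not part of the original code.

import os
import pickle


def iter_raw_blocks(form_index_dir, letter):
    # Hypothetical reader for the per-letter files written by index_raw_forms():
    # yield each pickled block in turn until the file is exhausted.
    in_file = os.path.join(form_index_dir, 'raw', letter)
    with open(in_file, 'rb') as filehandle:
        while True:
            try:
                yield pickle.load(filehandle)
            except EOFError:
                break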