Example #1
    def load_morphgroups(self):
        iterator = EntryIterator(dictType='ode')
        for entry in iterator.iterate():
            for morphgroup in entry.morphgroups():
                # Keep morphgroups whose base wordclass matches the pattern,
                #   skipping deprecated variants
                if (WORDCLASS_PATTERN.search(morphgroup.baseclass) and
                        morphgroup.variant_type != 'deprecated'):
                    self._process_morphgroup(morphgroup)
Example #2
    def process(self):
        self.clear_outdir()
        self.initialize_root()
        previous = None
        iterator = EntryIterator(dict_type="oed", verbosity="low", fix_ligatures=True)

        # Iterate through all entries in OED, processing each and storing
        #   the results in a buffer
        for entry in iterator.iterate():
            self.entry = entry

            # Write the buffer to a file when it gets to a certain size, and
            #   there's an appropriate break, e.g. not in the middle
            #   of homographs.
            if (self.buffersize() >= FILESIZE and
                    entry.lemma_manager().lexical_sort() != previous):
                self.writebuffer()
                self.initialize_root()

            # Process the current entry -> buffer
            self.process_entry()

            # Keep track of the previous entry's headword (to help find a good
            #   opportunity to write the buffer to a file)
            previous = entry.lemma_manager().lexical_sort()

        # Write a file for anything still left in the buffer after the
        #  entry iterator has completed
        self.writebuffer()
Example #3
    def distil(self):
        iterator = EntryIterator(dictType=self.dict_name)
        with open(self.pickle_file, 'wb') as filehandle:
            for entry in iterator.iterate():
                distilled = _parse_source_entry(entry, self.definition_length)
                # Serialize only entries that yielded at least one wordclass block
                if distilled.wordclass_blocks:
                    pickle.dump(distilled, filehandle)
Example #4
def test_oed_parser():
    names = []
    for letter in LETTERS:
        print('Collecting names in %s...' % letter)
        filter_pattern = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        for entry in iterator.iterate():
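            # Keep capitalized, proper-noun-like headwords (NN wordclass) derived
            #   from personal or place names, excluding -ism/-ist/-ian/-ite forms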
            if (('personal name' in entry.characteristic_nodes('etymonLanguage') or
                    'place name' in entry.characteristic_nodes('etymonLanguage')) and
                    re.search(r'^([A-Z]|[A-Z]\'[A-Z])[a-z]+$', entry.headword) and
                    not entry.headword.endswith('ism') and
                    not entry.headword.endswith('ist') and
                    not entry.headword.endswith('ian') and
                    not entry.headword.endswith('ite') and
                    entry.primary_wordclass().penn == 'NN'):
                print(entry.headword)
                names.append(entry.headword)
                #for et in entry.etymology().etyma():
                #    print(et)
            #for s1 in entry.s1blocks():
            #    s1.share_quotations()
            #    for i, s in enumerate(s1.senses()):
            #        _process_sense(s, i, len(s1.senses()))

    with open('somenames.txt', 'w') as filehandle:
        for name in names:
            filehandle.write(name + '\n')
Example #5
def count():
    counts = defaultdict(lambda: defaultdict(int))

    ei = EntryIterator(path=oed_dir,
                       dictType="oed",
                       fixLigatures=True,
                       verbosity="low")
    for entry in ei.iterate():
        entry.share_quotations()
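        # Tally statistics twice: once over all senses, and once over senses
        #   that lack thesaurus links ("missed")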
        senses = {"all": entry.senses, "missed": [s for s in
            entry.senses if not s.thesaurus_categories()]}
        for j in ("all", "missed"):
            for s in senses[j]:
                counts[j]["all"] += 1
                if s.is_subentry() and s.definition():
                    counts[j]["defined_subentry"] += 1
                elif s.is_subentry():
                    counts[j]["undefined_subentry"] += 1
                else:
                    counts[j]["main_sense"] += 1
                counts[j]["quotations"] += s.num_quotations
                if s.primary_wordclass.penn in ('NN', 'VB', 'JJ', 'RB'):
                    counts[j][s.primary_wordclass.penn] += 1
                else:
                    counts[j]['other_wordclass'] += 1
                if entry.is_revised:
                    counts[j]["revised"] += 1
                else:
                    counts[j]["unrevised"] += 1

    for j in ("all", "missed"):
        print(j)
        for k, v in counts[j].items():
            print("\t%s\t%d" % (k, v))
Example #6
    def store_vital_statistics(self):
        for letter in LETTERS:
            print('Collecting vital statistics in %s...' % letter)
            filter_pattern = 'oed_%s.xml' % letter.upper()
            iterator = EntryIterator(path=self.oed_dir,
                                     dictType='oed',
                                     fixLigatures=True,
                                     fileFilter=filter_pattern,
                                     verbosity=None)

            self.doc = etree.Element('entries')
            for entry in iterator.iterate():
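                # Record each entry as an <e> node with summary attributes
                #   (quotation count, weighted size, dates), plus label, headword,
                #   etyma, language, and a short definition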
                entry_node = etree.SubElement(
                    self.doc,
                    'e',
                    xrid=entry.id,
                    quotations=str(entry.num_quotations(force_recount=True)),
                    weightedSize='%0.2g' % entry.weighted_size(),
                    obsolete=str(entry.is_marked_obsolete()),
                    revised=str(entry.is_revised),
                    firstDate=str(entry.date().start),
                    lastDate=str(entry.date().end)
                )
                label_node = etree.SubElement(entry_node, 'label')
                label_node.text = entry.label()
                hw_node = etree.SubElement(entry_node, 'headword')
                hw_node.text = entry.headword

                if entry.header() is not None:
                    header_node = etree.SubElement(entry_node, 'header')
                    header_node.text = entry.header()

                etym_node = etree.SubElement(entry_node, 'etyma')
                for etymon in entry.etymology().etyma():
                    if etymon.type() == 'cross-reference':
                        etymon_node = etree.SubElement(etym_node, 'etymon')
                        etymon_node.set('xrid', str(etymon.refentry()))
                        etymon_node.text = etymon.lemma

                lang_node = etree.SubElement(entry_node, 'language')
                language = (entry.characteristic_first('etymonLanguage') or
                            entry.characteristic_first('sourceLanguage'))
                if language:
                    lang_node.text = language

                def_node = etree.SubElement(entry_node, 'def')
                definition = entry.definition(length=100, current=True)
                if definition:
                    def_node.text = definition

                if entry.senses():
                    for label_type in ('subject', 'region', 'usage'):
                        label_text = entry.senses()[0].characteristic_first(label_type)
                        label_text = label_text.split('/')[-1]
                        if label_text:
                            label_node = etree.SubElement(entry_node, label_type)
                            label_node.text = label_text

            self._write_output(letter)
Example #7
def store_main_senses(**kwargs):
    """
    Store main-sense data for OED entries as XML documents.
    """
    from lex.entryiterator import EntryIterator

    oed_dir = kwargs.get("oed_dir") or DEFAULT_INPUT
    out_dir = kwargs.get("out_dir") or DEFAULT_OUTPUT

    for letter in LETTERS:
        print("Collecting main-sense data in %s..." % letter)
        filter_pattern = "oed_%s.xml" % letter.upper()
        iterator = EntryIterator(
            path=oed_dir, dictType="oed", fixLigatures=True, fileFilter=filter_pattern, verbosity=None
        )

        doc = etree.Element("entries")
        for entry in iterator.iterate():
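            # Build an <e> node per entry; each <s1> block with a ranking stores
            #   up to three top-ranked senses and their thesaurus links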
            entry.check_revised_status()
            entry_node = etree.SubElement(doc, "e", refentry=entry.id)
            label_node = etree.SubElement(entry_node, "label")
            label_node.text = entry.label()
            hw_node = etree.SubElement(entry_node, "headword")
            hw_node.text = entry.headword

            for block in entry.s1blocks():
                ranking, num_current, num_large, num_quotations = calculate_main_sense(block)

                if ranking:
                    wordclass = block.primary_wordclass().penn or "null"
                    num_senses = len(block.senses())
                    s1_node = etree.SubElement(
                        entry_node,
                        "s1",
                        wordclass=wordclass,
                        refid=block.node_id(),
                        senses=str(num_senses),
                        currentSenses=str(num_current),
                        largeSenses=str(num_large),
                        quotations=str(num_quotations),
                    )
                    for sense in ranking[0:3]:
                        sense_num = sense.sense_number() or "null"
                        thes_links = "|".join(sense.thesaurus_nodes())
                        sense_node = etree.SubElement(
                            s1_node, "sense", refid=sense.node_id(), number=sense_num, quotations=str(sense.qcount)
                        )
                        if sense.marked:
                            sense_node.set("marked", "true")
                        sense_node.text = sense.definition(length=100)
                        if thes_links:
                            sense_node.set("thesaurus", thes_links)

        with open(os.path.join(out_dir, letter + ".xml"), "w") as filehandle:
            filehandle.write(etree.tounicode(doc, pretty_print=True))
Example #8
def _load_oed_entries():
    iterator = EntryIterator(dictType='oed', verbosity='low')
    oed_entries = defaultdict(list)
    for entry in iterator.iterate():
        headword = entry.lemma_manager().asciified()
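        # Keep capitalized headwords that have no equivalent ODE entry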
        if (re.search(r'^[A-Z][a-z]', headword) and
                LINK_MANAGERS['ode'].translate_id(entry.id) is None):
            oed_entries[headword].append(OedData(entry.id,
                                                 entry.headword,
                                                 entry.definition(length=100)))
    return oed_entries
Example #9
    def _process_entries(self, file_filter):
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 verbosity='low',
                                 fileFilter=file_filter)
        for entry in iterator.iterate():
            self.current_entry = entry
            # Process each sense within the entry's <s1> blocks, then any
            #   lemsect and revsect senses
            for s1 in entry.s1blocks():
                s1.share_quotations()
                for i, s in enumerate(s1.senses()):
                    self._process_sense(s, i, len(s1.senses()))
            for s in entry.lemsect_senses():
                self._process_sense(s, 5, 10)
            for s in entry.revsect_senses():
                self._process_sense(s, 5, 10)
Example #10
    def store_features_by_sense(self):
        """
        Iterate through each sense (both training and new data), parsing
        and storing the set of features that will be used by the Bayes
        classifier. This store is later used both to build the classifiers
        (picking out the training senses only) and as a cache of data
        for classifying the new senses.

        Features include:
        - definition keywords
        - quotation keywords
        - author names or titles from quotations
        - keywords from titles
        - subject labels
        - usage labels
        - first date
        - wordclass

        Lemma words (components derived by decomposing the lemma) are also
        collected here. Lemma words are not used directly for the
        Bayes classifier itself (they're folded into the set of definition
        keywords), but are used separately to help classify compounds.
        It's just more efficient to parse them along with everything else
        as part of this process.
        """
        sense_parser = SenseParser(self.parent_dir, self.subject_map_file)
        for letter in string.ascii_uppercase:
            file_filter = 'oed_%s.xml' % letter
            ei = EntryIterator(dictType='oed',
                               fixLigatures=True,
                               verbosity='low',
                               fileFilter=file_filter)

            outfile = os.path.join(self.senses_dir, letter)
            with open(outfile, 'wb') as filehandle:
                for entry in ei.iterate():
                    entry.share_quotations()
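                    # Parse each sense into a feature-data object and pickle it
                    #   to this letter's output file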
                    etyma = entry.etymology().etyma()
                    for sense in entry.senses():
                        sense_data_object = sense_parser.parse_sense(
                            sense,
                            etyma,
                            entry.id)
                        pickle.dump(sense_data_object, filehandle)
Example #11
    def process(self):
        for letter in string.ascii_lowercase:
            _clear_dir(self.out_dir, letter)
            frequencies, subfrequencies = _load_frequency_data(letter, self.include_subentries)

            print('Listing frequencies for entries in %s...' % letter)
            file_filter = 'oed_%s.xml' % letter.upper()
            iterator = EntryIterator(dictType='oed',
                                     fixLigatures=True,
                                     fileFilter=file_filter,
                                     verbosity=None)

            self.filecount = 0
            previous = None
            self.initialize_doc()
            for e in iterator.iterate():
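                # Append an entry-level node (and optional subentry nodes)
                #   carrying the frequency data for this entry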
                sortcode = e.lemma_manager().lexical_sort()

                if e.id in frequencies:
                    frequency_blocks = frequencies[e.id]
                else:
                    frequency_blocks = []
                enode = _construct_node(e, 'entry', e.id, 0, e.label(),
                                        e.label(), frequency_blocks, self.terse)
                self.doc.append(enode)

                if self.include_subentries:
                    for sense in e.senses():
                        sig = (e.id, sense.node_id())
                        if sig in subfrequencies:
                            frequency_blocks = subfrequencies[sig]
                            subnode = _construct_node(sense, 'subentry',
                                e.id, sense.node_id(), sense.lemma, e.label(),
                                frequency_blocks, self.terse)
                            self.doc.append(subnode)

                if self.buffersize() >= MAX_BUFFER and sortcode != previous:
                    self.write_buffer(letter)
                    self.initialize_doc()
                previous = sortcode
            self.write_buffer(letter)
Example #12
def store_content(content_dir):
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesInstance.__table__.create(DB_ENGINE, checkfirst=True)

    # Store the lemmas for each thesaurus instance (using
    #  refentry+refid+classid as the identifier)
    lemmas = {}  # = _cache_thesaurus_lemmas(content_dir)

    from lex.entryiterator import EntryIterator
    iterator = EntryIterator(dictType='oed',
                             fixLigatures=True,
                             verbosity='low')
    buffer_size = 0
    for entry in iterator.iterate():
        entry.check_revised_status()
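        # For each <s1> block, sort its senses by date and queue a database
        #   record for each sense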
        for block in entry.s1blocks():
            block.share_quotations()
            entry_size = block.weighted_size()
            senses = [s for s in block.senses() if not s.is_xref_sense()]
            senses.sort(key=_sortable_date)
            for i, s in enumerate(senses):
                records = _prepare_records(s, entry.id, entry.node_id(),
                                           lemmas, i + 1, entry_size,)
                for r in records:
                    DB_SESSION.add(r)
                    buffer_size += 1
        for s in [s for s in entry.senses() if not s.is_in_sensesect()
                  and not s.is_xref_sense()]:
            records = _prepare_records(s, entry.id, entry.node_id(),
                                       lemmas, 5, 1.0,)
            for r in records:
                DB_SESSION.add(r)
                buffer_size += 1

        if buffer_size > 1000:
            DB_SESSION.commit()
            buffer_size = 0
    DB_SESSION.commit()
Example #13
    def list_variants(self):
        """
        Main process for iterating through OED and writing output
        XML documents.

        >>> VariantsLister(in_dir, out_dir).list_variants()
        """
        self._clear_outdir()
        self._initialize_root()
        iterator = EntryIterator(path=self.in_dir,
                                 dictType='oed',
                                 verbosity='low',
                                 # fileFilter='oed_[K].xml',
                                 fixLigatures=True)
        for entry in iterator.iterate():
            self.entry = entry
            self._process_entry()
            if self.buffersize >= FILE_SIZE:
                self._writebuffer()
                self._initialize_root()
        # Write a file for anything still left in the buffer after the
        #  entry iterator has completed
        self._writebuffer()
Example #14
def compile_features():
    level4_nodes = load_level4_nodes()
    elements = {}
    for feature in FEATURES:
        directory = os.path.join(FEATURESET_ROOT, feature)
        elements_list = os.path.join(directory, 'elements.csv')
        elements[feature] = load_element_list(elements_list)

    matches = {feature: {} for feature in FEATURES}
    for feature in FEATURES:
        for element in elements[feature]:
            matches[feature][element.id] = []
    salient_senses = []

    sense_id = 0
    iterator = EntryIterator(dict_type='oed',
                             fix_ligatures=True,
                             #file_filter='oed_[R].xml',
                             verbosity='low')
    for entry in iterator.iterate():
        generic_language_matches = _test_languages(entry, elements['language'])
        entry.share_quotations()

        # Number all the senses (so that we know which is the first sense,
        #  which we need for the language feature), then pull out just
        #  those that have thesaurus links
        senses = entry.senses()
        for i, sense in enumerate(senses):
            sense.count = i
        senses = [s for s in senses if s.thesaurus_categories()]

        for sense in senses:
            thes_nodes = sense.thesaurus_nodes().intersection(level4_nodes)
            if not thes_nodes:
                continue

            local_matches = dict()
            local_matches['author'] = _test_authors(sense, elements['author'])
            local_matches['compound'] = _test_compounds(sense, entry, elements['compound'])
            local_matches['language'] = {}
            if (local_matches['author'] or
                    local_matches['compound'] or
                    (generic_language_matches and sense.count == 0)):
                sense_id += 1
                date = sense.date().start
                if sense.count == 0:
                    local_matches['language'] = {id: date for id in
                                                 generic_language_matches}
                local_matches['compound'] = {id: date for id in
                                             local_matches['compound']}
                for feature in FEATURES:
                    for element_id, date in local_matches[feature].items():
                        for node in thes_nodes:
                            matches[feature][element_id].append((sense_id,
                                                                 date,
                                                                 node))

                salient_senses.append((sense_id,
                                       entry.id,
                                       sense.lexid(),
                                       sense.lemma))

    # Print a csv file for each element in each featureset; the file
    #  is a list of all the matching senses
    for feature, elements in matches.items():
        for element_id, localmatches in elements.items():
            out_file = os.path.join(FEATURESET_ROOT,
                                    feature,
                                    'raw',
                                    '%d.csv' % element_id)
            with open(out_file, 'w') as csvfile:
                csvwriter = csv.writer(csvfile)
                for match in localmatches:
                    csvwriter.writerow(match)

    # Print list of all the salient senses (those which are linked
    #  from at least one match in one of the featuresets)
    with open(SENSES_FILE, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        for sense in salient_senses:
            csvwriter.writerow(sense)
Example #15
def build_weighted_size_index():
    for letter in string.ascii_uppercase:
        iterator = EntryIterator(dict_type='oed',
                                 file_filter='oed_%s.xml' % letter,
                                 verbosity='low',
                                 fix_ligatures=True)

        entries = []
        for entry in iterator.iterate():
            blocks = []
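            # Compute weighted sizes for each <s1> block at every reference date
            #   in DATES; a single-block entry gets a dummy record that later
            #   inherits from the parent entry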
            for block in entry.s1blocks():
                if (block.primary_wordclass() and
                        block.primary_wordclass().penn):
                    wordclass = block.primary_wordclass().penn
                else:
                    wordclass = '?'
                if len(entry.s1blocks()) == 1:
                    # If there's only one <s1> block, it's effectively
                    #  equivalent to the parent entry. So we make a dummy
                    #  entry, and later let it inherit from the parent entry.
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           0,
                                           [],
                                           0,
                                           entry.is_revised,
                                           True,
                                           )
                else:
                    block_sizes = [(d, block.weighted_size(
                                   revised=entry.is_revised,
                                   disregard_obsolete=True,
                                   currentYear=d)) for d in DATES]
                    block_sizes = [(d, round(n, 2)) for d, n in block_sizes]
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           block.num_quotations(),
                                           block_sizes,
                                           block.date().start,
                                           entry.is_revised,
                                           False,
                                           )
                blocks.append(block_data)

            try:
                entry_wordclass = blocks[0].wordclass
            except IndexError:
                entry_wordclass = '?'
            sizes = [(d, entry.weighted_size(revised=entry.is_revised,
                     disregard_obsolete=True, currentYear=d)) for d in DATES]
            sizes = [(d, round(n, 2)) for d, n in sizes]
            num_quotations = entry.num_quotations(force_recount=True,
                                                  include_derivatives=False)
            entry_data = EntryData(int(entry.id),
                                   0,
                                   entry_wordclass,
                                   num_quotations,
                                   sizes,
                                   entry.date().start,
                                   entry.is_revised,
                                   False,
                                   )

            if len(blocks) > 1:
                # Adjust block sizes to fit the entry size. We only need
                #  bother if there's more than one block; if there's only
                #  one block, it'll be inheriting from the entry anyway.
                blocks = _adjust_block_sizes(blocks, entry_data)

            entries.append(entry_data)
            entries.extend(blocks)

        out_file = os.path.join(PICKLE_DIR, letter)
        with open(out_file, 'wb') as filehandle:
            for entry in entries:
                pickle.dump(entry, filehandle)
Example #16
    def collect(self):

        # Initialize the buffers where quotations will be stored
        self.buffers = {}
        for year in range(DATE_MIN, DATE_MAX+1):
            decade = (year // 10) * 10
            self.buffers[decade] = []

        iterator = EntryIterator(dict_type='oed', verbosity='low')
        for entry in iterator.iterate():

            # Skip entries which may be obscenities
            if any([token in entry.lemma for token in OBSCENITIES]):
                continue

            for sense in entry.senses():
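                # Skip senses belonging to oversized entries, unless the sense
                #   is a subentry (or subentry-like)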
                if (not sense.is_subentry() and
                        not sense.is_subentry_like() and
                        entry.num_quotations() > ENTRY_MAX_SIZE):
                    continue

                if sense.is_subentry() or sense.is_subentry_like():
                    lemma = sense.lemma
                else:
                    lemma = entry.lemma
                lemma = lemma[0:LEMMA_LENGTH_MAX]

                for quotation in sense.quotations(strip_suppressed=True):
                    if (quotation.is_textless() or
                            quotation.is_bracketed() or
                            quotation.is_suppressed() or
                            quotation.year < DATE_MIN or
                            quotation.year > DATE_MAX or
                            quotation.citation.date_qualifier or
                            not quotation.citation.author() or
                            quotation.citation.is_glossary() or
                            quotation.text.comments() or
                            quotation.is_modernized_text() or
                            quotation.is_electronic_text() or
                            quotation.is_title_quotation()):
                        continue
                    if quotation.text.node.findall('.//i'):
                        continue

                    if quotation.citation.edition:
                        status = 30
                    elif quotation.citation.bibsub is not None:
                        status = 20
                    else:
                        status = 10
                    if quotation.citation.publication_datestring:
                        status += 10
                    if quotation.citation.is_translation():
                        status += 10

                    text = quotation.text.plaintext
                    text_lower = text.lower()
                    if (len(text) < TEXT_LENGTH_MIN or
                            len(text) > TEXT_LENGTH_MAX):
                        continue
                    if any([token in text_lower for token in OBSCENITIES]):
                        continue

                    citation = quotation.citation.html_lite
                    if len(citation) > CITATION_LENGTH_MAX:
                        continue

                    decade = (quotation.year // 10) * 10
                    row = [quotation.year,
                           citation,
                           text,
                           status,
                           lemma,
                           entry.id,
                           sense.node_id(),]
                    self.buffers[decade].append(row)
                    if len(self.buffers[decade]) > BUFFER_SIZE:
                        self._flush_buffer(decade)

        # Flush anything left in the buffers
        for decade in self.buffers:
            self._flush_buffer(decade)
Example #17
    def setUp(self):
        iterator = EntryIterator(path=FIXTURE_DIR,
                                 dictType='oed',
                                 verbosity=None,
                                 fixLigatures=True)
        # Index fixture entries by numeric ID for lookup in the tests
        self.entries = {int(e.id): e for e in iterator.iterate()}