def load_morphgroups(self):
    iterator = EntryIterator(dictType='ode')
    for entry in iterator.iterate():
        for morphgroup in entry.morphgroups():
            if (WORDCLASS_PATTERN.search(morphgroup.baseclass) and
                    morphgroup.variant_type != 'deprecated'):
                self._process_morphgroup(morphgroup)

def process(self):
    self.clear_outdir()
    self.initialize_root()
    previous = None
    iterator = EntryIterator(dict_type="oed",
                             verbosity="low",
                             fix_ligatures=True)
    # Iterate through all entries in OED, processing each and storing
    # the results in a buffer
    for entry in iterator.iterate():
        self.entry = entry
        # Write the buffer to a file when it gets to a certain size, and
        # there's an appropriate break, e.g. not in the middle
        # of homographs.
        if (self.buffersize() >= FILESIZE and
                entry.lemma_manager().lexical_sort() != previous):
            self.writebuffer()
            self.initialize_root()
        # Process the current entry -> buffer
        self.process_entry()
        # Keep track of the previous entry's headword (to help find a good
        # opportunity to write the buffer to a file)
        previous = entry.lemma_manager().lexical_sort()
    # Write a file for anything still left in the buffer after the
    # entry iterator has completed
    self.writebuffer()

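# The buffered-write pattern used above (flush only once the buffer reaches a
# threshold AND the lexical sort code changes, so runs of homographs are never
# split across output files) can be shown in isolation. A minimal sketch with
# hypothetical names, not the actual class implementation:
def chunk_at_boundaries(items_with_sortcode, max_size):
    """Yield lists of items, breaking only where the sort code changes."""
    buffer, previous = [], None
    for sortcode, item in items_with_sortcode:
        # Flush at a boundary between different sort codes once the
        # buffer is full enough.
        if len(buffer) >= max_size and sortcode != previous:
            yield buffer
            buffer = []
        buffer.append(item)
        previous = sortcode
    # Whatever is left after the iterator completes
    if buffer:
        yield buffer
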
def distil(self):
    iterator = EntryIterator(dictType=self.dict_name)
    with open(self.pickle_file, 'wb') as filehandle:
        for entry in iterator.iterate():
            distilled = _parse_source_entry(entry, self.definition_length)
            if distilled.wordclass_blocks:
                pickle.dump(distilled, filehandle)

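# Distilled entries are pickled sequentially into a single file, so reading
# them back means repeated pickle.load() calls until end-of-file. A minimal
# sketch of such a consumer (the real read-back code is not shown here):
import pickle

def iter_pickled(filepath):
    """Yield each object that was pickled sequentially into filepath."""
    with open(filepath, 'rb') as filehandle:
        while True:
            try:
                yield pickle.load(filehandle)
            except EOFError:
                return
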
def test_oed_parser():
    names = []
    for letter in LETTERS:
        print('Collecting names in %s...' % letter)
        filter_pattern = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        for entry in iterator.iterate():
            if (('personal name' in entry.characteristic_nodes('etymonLanguage') or
                    'place name' in entry.characteristic_nodes('etymonLanguage')) and
                    re.search(r'^([A-Z]|[A-Z]\'[A-Z])[a-z]+$', entry.headword) and
                    not entry.headword.endswith(('ism', 'ist', 'ian', 'ite')) and
                    entry.primary_wordclass().penn == 'NN'):
                print(entry.headword)
                names.append(entry.headword)
            #for et in entry.etymology().etyma():
            #    print(et)
            #for s1 in entry.s1blocks():
            #    s1.share_quotations()
            #    for i, s in enumerate(s1.senses()):
            #        _process_sense(s, i, len(s1.senses()))
    with open('somenames.txt', 'w') as filehandle:
        for name in names:
            filehandle.write(name + '\n')

def count():
    counts = defaultdict(lambda: defaultdict(int))
    ei = EntryIterator(path=oed_dir,
                       dictType="oed",
                       fixLigatures=True,
                       verbosity="low")
    for entry in ei.iterate():
        entry.share_quotations()
        senses = {"all": entry.senses,
                  "missed": [s for s in entry.senses
                             if not s.thesaurus_categories()]}
        for j in ("all", "missed"):
            for s in senses[j]:
                counts[j]["all"] += 1
                if s.is_subentry() and s.definition():
                    counts[j]["defined_subentry"] += 1
                elif s.is_subentry():
                    counts[j]["undefined_subentry"] += 1
                else:
                    counts[j]["main_sense"] += 1
                counts[j]["quotations"] += s.num_quotations
                if s.primary_wordclass.penn in ('NN', 'VB', 'JJ', 'RB'):
                    counts[j][s.primary_wordclass.penn] += 1
                else:
                    counts[j]['other_wordclass'] += 1
                if entry.is_revised:
                    counts[j]["revised"] += 1
                else:
                    counts[j]["unrevised"] += 1
    for j in ("all", "missed"):
        print(j)
        for k, v in counts[j].items():
            print("\t%s\t%d" % (k, v))

def store_vital_statistics(self):
    for letter in LETTERS:
        print('Collecting vital statistics in %s...' % letter)
        filter_pattern = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(path=self.oed_dir,
                                 dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        self.doc = etree.Element('entries')
        for entry in iterator.iterate():
            entry_node = etree.SubElement(
                self.doc,
                'e',
                xrid=entry.id,
                quotations=str(entry.num_quotations(force_recount=True)),
                weightedSize='%0.2g' % entry.weighted_size(),
                obsolete=str(entry.is_marked_obsolete()),
                revised=str(entry.is_revised),
                firstDate=str(entry.date().start),
                lastDate=str(entry.date().end)
            )
            label_node = etree.SubElement(entry_node, 'label')
            label_node.text = entry.label()
            hw_node = etree.SubElement(entry_node, 'headword')
            hw_node.text = entry.headword
            if entry.header() is not None:
                header_node = etree.SubElement(entry_node, 'header')
                header_node.text = entry.header()
            etym_node = etree.SubElement(entry_node, 'etyma')
            for etymon in entry.etymology().etyma():
                if etymon.type() == 'cross-reference':
                    etymon_node = etree.SubElement(etym_node, 'etymon')
                    etymon_node.set('xrid', str(etymon.refentry()))
                    etymon_node.text = etymon.lemma
            lang_node = etree.SubElement(entry_node, 'language')
            language = (entry.characteristic_first('etymonLanguage') or
                        entry.characteristic_first('sourceLanguage'))
            if language:
                lang_node.text = language
            def_node = etree.SubElement(entry_node, 'def')
            definition = entry.definition(length=100, current=True)
            if definition:
                def_node.text = definition
            if entry.senses():
                for label_type in ('subject', 'region', 'usage'):
                    label_text = entry.senses()[0].characteristic_first(label_type)
                    label_text = label_text.split('/')[-1]
                    if label_text:
                        label_node = etree.SubElement(entry_node, label_type)
                        label_node.text = label_text
        self._write_output(letter)

def store_main_senses(**kwargs):
    """
    Store main-sense data for OED entries as XML documents.
    """
    from lex.entryiterator import EntryIterator
    oed_dir = kwargs.get("oed_dir") or DEFAULT_INPUT
    out_dir = kwargs.get("out_dir") or DEFAULT_OUTPUT
    for letter in LETTERS:
        print("Collecting main-sense data in %s..." % letter)
        filter_pattern = "oed_%s.xml" % letter.upper()
        iterator = EntryIterator(path=oed_dir,
                                 dictType="oed",
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        doc = etree.Element("entries")
        for entry in iterator.iterate():
            entry.check_revised_status()
            entry_node = etree.SubElement(doc, "e", refentry=entry.id)
            label_node = etree.SubElement(entry_node, "label")
            label_node.text = entry.label()
            hw_node = etree.SubElement(entry_node, "headword")
            hw_node.text = entry.headword
            for block in entry.s1blocks():
                ranking, num_current, num_large, num_quotations = \
                    calculate_main_sense(block)
                if ranking:
                    wordclass = block.primary_wordclass().penn or "null"
                    num_senses = len(block.senses())
                    s1_node = etree.SubElement(
                        entry_node,
                        "s1",
                        wordclass=wordclass,
                        refid=block.node_id(),
                        senses=str(num_senses),
                        currentSenses=str(num_current),
                        largeSenses=str(num_large),
                        quotations=str(num_quotations),
                    )
                    for sense in ranking[0:3]:
                        sense_num = sense.sense_number() or "null"
                        thes_links = "|".join(sense.thesaurus_nodes())
                        sense_node = etree.SubElement(
                            s1_node,
                            "sense",
                            refid=sense.node_id(),
                            number=sense_num,
                            quotations=str(sense.qcount)
                        )
                        if sense.marked:
                            sense_node.set("marked", "true")
                        sense_node.text = sense.definition(length=100)
                        if thes_links:
                            sense_node.set("thesaurus", thes_links)
        with open(os.path.join(out_dir, letter + ".xml"), "w") as filehandle:
            filehandle.write(etree.tounicode(doc, pretty_print=True))

def _load_oed_entries():
    iterator = EntryIterator(dictType='oed', verbosity='low')
    oed_entries = defaultdict(list)
    for entry in iterator.iterate():
        headword = entry.lemma_manager().asciified()
        if (re.search(r'^[A-Z][a-z]', headword) and
                LINK_MANAGERS['ode'].translate_id(entry.id) is None):
            oed_entries[headword].append(OedData(entry.id,
                                                 entry.headword,
                                                 entry.definition(length=100)))
    return oed_entries

def _process_entries(self, file_filter):
    iterator = EntryIterator(dictType='oed',
                             fixLigatures=True,
                             verbosity='low',
                             fileFilter=file_filter)
    for entry in iterator.iterate():
        self.current_entry = entry
        for s1 in entry.s1blocks():
            s1.share_quotations()
            for i, s in enumerate(s1.senses()):
                self._process_sense(s, i, len(s1.senses()))
        for s in entry.lemsect_senses():
            self._process_sense(s, 5, 10)
        for s in entry.revsect_senses():
            self._process_sense(s, 5, 10)

def store_features_by_sense(self):
    """
    Iterate through each sense (both training and new data), parsing and
    storing the set of features that will be used by the Bayes classifier.

    This store is later used both to build the classifiers (picking out
    the training senses only) and as a cache of data for classifying
    the new senses.

    Features include:
     - definition keywords
     - quotation keywords
     - author names or titles from quotations
     - keywords from titles
     - subject labels
     - usage labels
     - first date
     - wordclass

    Lemma words (components derived by decomposing the lemma) are also
    collected here. Lemma words are not used directly for the Bayes
    classifier itself (they're folded into the set of definition keywords),
    but are used separately to help classify compounds. It's just more
    efficient to parse them along with everything else as part of this
    process.
    """
    sense_parser = SenseParser(self.parent_dir, self.subject_map_file)
    for letter in string.ascii_uppercase:
        file_filter = 'oed_%s.xml' % letter
        ei = EntryIterator(dictType='oed',
                           fixLigatures=True,
                           verbosity='low',
                           fileFilter=file_filter)
        outfile = os.path.join(self.senses_dir, letter)
        with open(outfile, 'wb') as filehandle:
            for entry in ei.iterate():
                entry.share_quotations()
                etyma = entry.etymology().etyma()
                for sense in entry.senses():
                    sense_data_object = sense_parser.parse_sense(
                        sense, etyma, entry.id)
                    pickle.dump(sense_data_object, filehandle)

def process(self):
    for letter in string.ascii_lowercase:
        _clear_dir(self.out_dir, letter)
        frequencies, subfrequencies = _load_frequency_data(
            letter, self.include_subentries)
        print('Listing frequencies for entries in %s...' % letter)
        file_filter = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=file_filter,
                                 verbosity=None)
        self.filecount = 0
        previous = None
        self.initialize_doc()
        for e in iterator.iterate():
            sortcode = e.lemma_manager().lexical_sort()
            if e.id in frequencies:
                frequency_blocks = frequencies[e.id]
            else:
                frequency_blocks = []
            enode = _construct_node(e, 'entry', e.id, 0, e.label(),
                                    e.label(), frequency_blocks, self.terse)
            self.doc.append(enode)
            if self.include_subentries:
                for sense in e.senses():
                    sig = (e.id, sense.node_id())
                    if sig in subfrequencies:
                        frequency_blocks = subfrequencies[sig]
                        subnode = _construct_node(sense, 'subentry', e.id,
                                                  sense.node_id(), sense.lemma,
                                                  e.label(), frequency_blocks,
                                                  self.terse)
                        self.doc.append(subnode)
            if self.buffersize() >= MAX_BUFFER and sortcode != previous:
                self.write_buffer(letter)
                self.initialize_doc()
            previous = sortcode
        self.write_buffer(letter)

def store_content(content_dir):
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesInstance.__table__.create(DB_ENGINE, checkfirst=True)

    # Store the lemmas for each thesaurus instance (using
    # refentry+refid+classid as the identifier)
    lemmas = {}  # = _cache_thesaurus_lemmas(content_dir)

    from lex.entryiterator import EntryIterator
    iterator = EntryIterator(dictType='oed',
                             fixLigatures=True,
                             verbosity='low')
    buffer_size = 0
    for entry in iterator.iterate():
        entry.check_revised_status()
        for block in entry.s1blocks():
            block.share_quotations()
            entry_size = block.weighted_size()
            senses = [s for s in block.senses() if not s.is_xref_sense()]
            senses.sort(key=_sortable_date)
            for i, s in enumerate(senses):
                records = _prepare_records(s, entry.id, entry.node_id(),
                                           lemmas, i + 1, entry_size,)
                for r in records:
                    DB_SESSION.add(r)
                    buffer_size += 1
        for s in [s for s in entry.senses()
                  if not s.is_in_sensesect() and not s.is_xref_sense()]:
            records = _prepare_records(s, entry.id, entry.node_id(),
                                       lemmas, 5, 1.0,)
            for r in records:
                DB_SESSION.add(r)
                buffer_size += 1
        if buffer_size > 1000:
            DB_SESSION.commit()
            buffer_size = 0
    DB_SESSION.commit()

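# store_content() batches DB_SESSION.add() calls and commits roughly every
# 1000 records to keep the pending transaction small. The same pattern in
# isolation, as a generic sketch (the session and record objects are whatever
# SQLAlchemy session/model the project uses; nothing here is specific to it):
def add_in_batches(session, records, batch_size=1000):
    """Add records to a SQLAlchemy session, committing every batch_size."""
    pending = 0
    for record in records:
        session.add(record)
        pending += 1
        if pending >= batch_size:
            session.commit()
            pending = 0
    # Commit whatever is left in the final partial batch
    session.commit()
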
def list_variants(self):
    """
    Main process for iterating through OED and writing output XML documents.

    >>> VariantsLister(in_dir, out_dir).list_variants()
    """
    self._clear_outdir()
    self._initialize_root()
    iterator = EntryIterator(path=self.in_dir,
                             dictType='oed',
                             verbosity='low',
                             # fileFilter='oed_[K].xml',
                             fixLigatures=True)
    for entry in iterator.iterate():
        self.entry = entry
        self._process_entry()
        if self.buffersize >= FILE_SIZE:
            self._writebuffer()
            self._initialize_root()
    # Write a file for anything still left in the buffer after the
    # entry iterator has completed
    self._writebuffer()

def compile_features():
    level4_nodes = load_level4_nodes()

    elements = {}
    for feature in FEATURES:
        directory = os.path.join(FEATURESET_ROOT, feature)
        elements_list = os.path.join(directory, 'elements.csv')
        elements[feature] = load_element_list(elements_list)

    matches = {feature: {} for feature in FEATURES}
    for feature in FEATURES:
        for element in elements[feature]:
            matches[feature][element.id] = []

    salient_senses = []
    sense_id = 0
    iterator = EntryIterator(dict_type='oed',
                             fix_ligatures=True,
                             #file_filter='oed_[R].xml',
                             verbosity='low')
    for entry in iterator.iterate():
        generic_language_matches = _test_languages(entry, elements['language'])
        entry.share_quotations()
        # Number all the senses (so that we know which is the first sense,
        # which we need for the language feature), then pull out just
        # those that have thesaurus links
        senses = entry.senses()
        for i, sense in enumerate(senses):
            sense.count = i
        senses = [s for s in senses if s.thesaurus_categories()]
        for sense in senses:
            thes_nodes = sense.thesaurus_nodes().intersection(level4_nodes)
            if not thes_nodes:
                continue
            local_matches = dict()
            local_matches['author'] = _test_authors(sense, elements['author'])
            local_matches['compound'] = _test_compounds(sense, entry,
                                                        elements['compound'])
            local_matches['language'] = {}
            if (local_matches['author'] or
                    local_matches['compound'] or
                    (generic_language_matches and sense.count == 0)):
                sense_id += 1
                date = sense.date().start
                if sense.count == 0:
                    local_matches['language'] = {id: date for id in
                                                 generic_language_matches}
                local_matches['compound'] = {id: date for id in
                                             local_matches['compound']}
                for feature in FEATURES:
                    for element_id, date in local_matches[feature].items():
                        for node in thes_nodes:
                            matches[feature][element_id].append(
                                (sense_id, date, node))
                salient_senses.append((sense_id, entry.id,
                                       sense.lexid(), sense.lemma))

    # Print a csv file for each element in each featureset; the file
    # is a list of all the matching senses
    for feature, element_matches in matches.items():
        for element_id, localmatches in element_matches.items():
            out_file = os.path.join(FEATURESET_ROOT, feature, 'raw',
                                    '%d.csv' % element_id)
            with open(out_file, 'w') as csvfile:
                csvwriter = csv.writer(csvfile)
                for match in localmatches:
                    csvwriter.writerow(match)

    # Print list of all the salient senses (those which are linked
    # from at least one match in one of the featuresets)
    with open(SENSES_FILE, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        for sense in salient_senses:
            csvwriter.writerow(sense)

def build_weighted_size_index():
    for letter in string.ascii_uppercase:
        iterator = EntryIterator(dict_type='oed',
                                 file_filter='oed_%s.xml' % letter,
                                 verbosity='low',
                                 fix_ligatures=True)
        entries = []
        for entry in iterator.iterate():
            blocks = []
            for block in entry.s1blocks():
                if (block.primary_wordclass() and
                        block.primary_wordclass().penn):
                    wordclass = block.primary_wordclass().penn
                else:
                    wordclass = '?'
                if len(entry.s1blocks()) == 1:
                    # If there's only one <s1> block, it's effectively
                    # equivalent to the parent entry. So we make a dummy
                    # entry, and later let it inherit from the parent entry.
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           0,
                                           [],
                                           0,
                                           entry.is_revised,
                                           True,)
                else:
                    block_sizes = [(d, block.weighted_size(
                        revised=entry.is_revised,
                        disregard_obsolete=True,
                        currentYear=d)) for d in DATES]
                    block_sizes = [(d, round(n, 2)) for d, n in block_sizes]
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           block.num_quotations(),
                                           block_sizes,
                                           block.date().start,
                                           entry.is_revised,
                                           False,)
                blocks.append(block_data)

            try:
                entry_wordclass = blocks[0].wordclass
            except IndexError:
                entry_wordclass = '?'
            sizes = [(d, entry.weighted_size(revised=entry.is_revised,
                                             disregard_obsolete=True,
                                             currentYear=d)) for d in DATES]
            sizes = [(d, round(n, 2)) for d, n in sizes]
            num_quotations = entry.num_quotations(force_recount=True,
                                                  include_derivatives=False)
            entry_data = EntryData(int(entry.id),
                                   0,
                                   entry_wordclass,
                                   num_quotations,
                                   sizes,
                                   entry.date().start,
                                   entry.is_revised,
                                   False,)

            if len(blocks) > 1:
                # Adjust block sizes to fit the entry size. We only need
                # bother if there's more than one block; if there's only
                # one block, it'll be inheriting from the entry anyway.
                blocks = _adjust_block_sizes(blocks, entry_data)

            entries.append(entry_data)
            entries.extend(blocks)

        out_file = os.path.join(PICKLE_DIR, letter)
        with open(out_file, 'wb') as filehandle:
            for entry in entries:
                pickle.dump(entry, filehandle)

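# EntryData is constructed positionally above, and its definition is not shown
# here. Judging from the call sites it is presumably a namedtuple along these
# lines; the field names below are guesses based on the values passed in, not
# the project's actual definition:
from collections import namedtuple

EntryData = namedtuple('EntryData', [
    'refentry',     # entry ID (int)
    'refid',        # block node ID, or 0 when the record describes the entry
    'wordclass',    # Penn wordclass, or '?' when none is available
    'quotations',   # quotation count
    'sizes',        # list of (date, weighted size) pairs
    'start',        # first date
    'is_revised',   # whether the entry has been revised
    'inherits',     # True for a single-block dummy that inherits from its entry
])
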
def collect(self):
    # Initialize the buffers where quotations will be stored
    self.buffers = {}
    for year in range(DATE_MIN, DATE_MAX + 1):
        decade = (year // 10) * 10
        self.buffers[decade] = []

    iterator = EntryIterator(dict_type='oed', verbosity='low')
    for entry in iterator.iterate():
        # Skip entries which may be obscenities
        if any([token in entry.lemma for token in OBSCENITIES]):
            continue
        for sense in entry.senses():
            # Skip main senses of very large entries (subentries and
            # subentry-like senses are kept regardless of entry size)
            if (not (sense.is_subentry() or sense.is_subentry_like()) and
                    entry.num_quotations() > ENTRY_MAX_SIZE):
                continue
            if sense.is_subentry() or sense.is_subentry_like():
                lemma = sense.lemma
            else:
                lemma = entry.lemma
            lemma = lemma[0:LEMMA_LENGTH_MAX]
            for quotation in sense.quotations(strip_suppressed=True):
                if (quotation.is_textless() or
                        quotation.is_bracketed() or
                        quotation.is_suppressed() or
                        quotation.year < DATE_MIN or
                        quotation.year > DATE_MAX or
                        quotation.citation.date_qualifier or
                        not quotation.citation.author() or
                        quotation.citation.is_glossary() or
                        quotation.text.comments() or
                        quotation.is_modernized_text() or
                        quotation.is_electronic_text() or
                        quotation.is_title_quotation()):
                    continue
                if quotation.text.node.findall('.//i'):
                    continue
                if quotation.citation.edition:
                    status = 30
                elif quotation.citation.bibsub is not None:
                    status = 20
                else:
                    status = 10
                if quotation.citation.publication_datestring:
                    status += 10
                if quotation.citation.is_translation():
                    status += 10
                text = quotation.text.plaintext
                text_lower = text.lower()
                if (len(text) < TEXT_LENGTH_MIN or
                        len(text) > TEXT_LENGTH_MAX):
                    continue
                if any([token in text_lower for token in OBSCENITIES]):
                    continue
                citation = quotation.citation.html_lite
                if len(citation) > CITATION_LENGTH_MAX:
                    continue
                decade = (quotation.year // 10) * 10
                row = [quotation.year, citation, text, status, lemma,
                       entry.id, sense.node_id(),]
                self.buffers[decade].append(row)
                if len(self.buffers[decade]) > BUFFER_SIZE:
                    self._flush_buffer(decade)

    # Flush anything left in the buffers
    for decade in self.buffers:
        self._flush_buffer(decade)

def setUp(self):
    iterator = EntryIterator(path=FIXTURE_DIR,
                             dictType='oed',
                             verbosity=None,
                             fixLigatures=True,)
    self.entries = {int(e.id): e for e in iterator.iterate()}

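# With the fixture entries keyed by integer ID in setUp(), individual test
# methods on the same TestCase can look entries up directly. A sketch only:
# the ID and expected headword below are placeholders, not values taken from
# the real fixture files.
def test_headword(self):
    entry = self.entries[12345]                   # hypothetical fixture ID
    self.assertEqual(entry.headword, 'example')   # hypothetical headword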