Пример #1
0
def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Attach frequency tables to the wordclass sets and types in the GEL data.

    For every wordclass set of every entry yielded by the file iterator:
    the 'size' attribute is stripped from the set's node, a frequency
    table is appended to each type for which frequencies are found, and
    the sum of those tables is appended to the wordclass-set node itself.

    Arguments:
        in_dir -- handed to FileIterator as its input directory.
        out_dir -- handed to FileIterator as its output directory
            (presumably where the modified files end up; any writing
            is done by FileIterator, not in this function).
        freq_dir -- handed to FrequencyMemo, which is then queried
            for frequencies by type ID.
    """
    file_iterator = FileIterator(in_dir=in_dir,
                                 out_dir=out_dir,
                                 verbosity='low')
    memo = FrequencyMemo(freq_dir)

    for content in file_iterator.iterate():
        for entry in content.entries:
            for wcs in entry.wordclass_sets():
                etree.strip_attributes(wcs.node, 'size')

                # First pass: look up a table (or None) for each type ID.
                tables_by_id = {}
                for type_unit in wcs.types():
                    found = memo.find_frequencies(type_unit.id)
                    tables_by_id[type_unit.id] = (
                        FrequencyTable(data=found) if found else None)

                # Second pass: attach each available table to its type.
                for type_unit in wcs.types():
                    table = tables_by_id[type_unit.id]
                    if table:
                        type_unit.node.append(table.to_xml())

                # The wordclass set gets the sum of its types' tables,
                # but only when at least one type actually had a table.
                usable_tables = list(filter(None, tables_by_id.values()))
                if not usable_tables:
                    continue
                wcs.node.append(sum_frequency_tables(usable_tables).to_xml())
 def compare_singular_to_plural(self, e):
     """
     Track noun wordclass sets whose plural outweighs the singular.

     Only wordclass sets tagged 'NN' with a frequency above 1 are
     examined. Their types are grouped by wordclass; when both 'NN'
     and 'NNS' types are present, the frequency tables of each group
     are summed. If the NNS/NN frequency ratio exceeds 1, a record
     (entry label, entry ID, the wordclass set's frequency under the
     key 'fpm', and the ratio) is appended to
     self.track['plural_to_singular'].
     """
     def summed_table(type_units):
         # Types that have no frequency table are left out of the sum.
         return sum_frequency_tables(
             [unit.frequency_table() for unit in type_units
              if unit.frequency_table() is not None])

     for wcs in e.wordclass_sets():
         if wcs.wordclass != 'NN':
             continue
         if not (wcs.frequency_table().frequency() > 1):
             continue

         by_wordclass = defaultdict(list)
         for type_unit in wcs.types():
             by_wordclass[type_unit.wordclass].append(type_unit)
         if 'NN' not in by_wordclass or 'NNS' not in by_wordclass:
             continue

         singular_table = summed_table(by_wordclass['NN'])
         plural_table = summed_table(by_wordclass['NNS'])
         singular_freq = singular_table.frequency()
         plural_freq = plural_table.frequency()

         # A zero singular frequency would make the ratio undefined.
         if not singular_freq:
             continue
         ratio = plural_freq / singular_freq
         if ratio > 1:
             self.track['plural_to_singular'].append({
                 'label': e.label,
                 'id': e.id,
                 'fpm': wcs.frequency_table().frequency(),
                 'ratio': ratio
             })
def _construct_node(block, block_type, entry_id, node_id, label, parent_label,
                    frequency_blocks, terse):
    """
    Build and return the <e> element describing a single block.

    The element's attributes are taken from block_type, entry_id,
    node_id and from the block itself (obsolete/revised flags and the
    start and end of its date range). It gets <label>, <parentLabel>,
    <lemma> and <definition> children, followed - when frequency_blocks
    is non-empty - by frequency data:

    - a summed frequency table for the whole entry, unless terse is
      set and there is only a single frequency block;
    - one <wordclass> element per frequency block, holding that
      block's own table (unless terse is set and the block has only
      one type) and a <types> wrapper with a <type> element, form and
      frequency table for each type.
    """
    attributes = {
        'type': block_type,
        'xrid': str(entry_id),
        'xrnode': str(node_id),
        'obsolete': str(block.is_marked_obsolete()),
        'revised': str(block.is_revised),
        'firstDate': str(block.date().start),
        'lastDate': str(block.date().end),
    }
    enode = etree.Element('e', **attributes)

    etree.SubElement(enode, 'label').text = label
    etree.SubElement(enode, 'parentLabel').text = parent_label
    etree.SubElement(enode, 'lemma').text = block.lemma
    etree.SubElement(enode, 'definition').text = block.definition(
        length=DEF_LENGTH, current=True)

    if not frequency_blocks:
        return enode

    # Entry-level table: the sum over all wordclasses. In terse mode
    # this is skipped when there is only one wordclass block.
    include_entry_total = len(frequency_blocks) > 1 or not terse
    if include_entry_total:
        entry_total = sum_frequency_tables(
            [fblock.frequency_table for fblock in frequency_blocks])
        enode.append(entry_total.to_xml())

    for fblock in frequency_blocks:
        penn, wc_table, type_units = (
            fblock.wordclass, fblock.frequency_table, fblock.types)

        wc_element = etree.SubElement(enode, 'wordclass', penn=penn)
        # Same terse rule one level down: skip the wordclass table
        # when the block holds just a single type.
        if len(type_units) > 1 or not terse:
            wc_element.append(wc_table.to_xml())

        types_element = etree.SubElement(wc_element, 'types')
        for unit in type_units:
            type_element = etree.SubElement(
                types_element, 'type', penn=unit.wordclass)
            etree.SubElement(type_element, 'form').text = unit.form
            type_element.append(unit.frequency_table.to_xml())

    return enode