Example #1
def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Find frequency values in the frequency_build data, and inserts
    them in the GEL data.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    frequency_finder = FrequencyMemo(freq_dir)

    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in entry.wordclass_sets():
                etree.strip_attributes(wordclass_set.node, 'size')

                tables = {}
                for type_unit in wordclass_set.types():
                    frequencies = frequency_finder.find_frequencies(type_unit.id)
                    if frequencies:
                        tables[type_unit.id] = FrequencyTable(data=frequencies)
                    else:
                        tables[type_unit.id] = None

                for type_unit in wordclass_set.types():
                    if tables[type_unit.id]:
                        type_unit.node.append(tables[type_unit.id].to_xml())

                non_null_tables = [table for table in tables.values() if table]
                if non_null_tables:
                    wcs_table = sum_frequency_tables(non_null_tables)
                    wordclass_set.node.append(wcs_table.to_xml())
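FrequencyTable and sum_frequency_tables are project-internal and not shown in these excerpts. A minimal sketch of what the summing step might look like, assuming each table wraps a mapping of period labels to frequency values (both the class shape and the data mapping are assumptions, not the project's actual API):

# Hypothetical sketch -- the real FrequencyTable API is not shown here.
class FrequencyTable:
    def __init__(self, data):
        self.data = dict(data)  # assumed: period label -> frequency value

    def to_xml(self):
        """Serialize to an XML node (omitted in this sketch)."""

def sum_frequency_tables(tables):
    """Combine per-type tables by summing each period's values."""
    totals = {}
    for table in tables:
        for period, value in table.data.items():
            totals[period] = totals.get(period, 0) + value
    return FrequencyTable(data=totals)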
Example #2
    def compile_data(self):
        self.data = {}
        for letter in alphabet:
            print('Compiling index for %s...' % letter)
            self.data[letter] = []
            sub_dir = os.path.join(self.in_dir, letter)
            iterator = FileIterator(in_dir=sub_dir,
                                    out_dir=None,
                                    verbosity=None)
            for filecontent in iterator.iterate():
                filedata = _filedata_factory(iterator.in_file,
                                             filecontent.entries)
                self.data[letter].append(filedata)

        self.stats = {}
        for letter in alphabet:
            self.stats[letter] = {'entries': 0,
                                  'types': 0,
                                  'distinct_types': 0,
                                  'files': 0}
            for filedata in self.data[letter]:
                self.stats[letter]['files'] += 1
                self.stats[letter]['entries'] += filedata.num_entries
                self.stats[letter]['types'] += filedata.num_types
                self.stats[letter]['distinct_types'] += filedata.num_distinct_types
        self.stats['total'] = {'entries': 0,
                               'types': 0,
                               'files': 0,
                               'distinct_types': 0}
        for letter in alphabet:
            for key in ('entries', 'types', 'distinct_types', 'files'):
                self.stats['total'][key] += self.stats[letter][key]
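The same tallying could be written with collections.Counter, which removes the zero-initialization and turns the grand total into a one-line sum. A sketch of that variant (written as a standalone function here, rather than a method, purely for illustration):

from collections import Counter

def compile_stats(data):
    """Equivalent tallying with Counter: per-letter counts plus a grand total."""
    stats = {}
    for letter, filedatas in data.items():
        counts = Counter()
        for filedata in filedatas:
            counts['files'] += 1
            counts['entries'] += filedata.num_entries
            counts['types'] += filedata.num_types
            counts['distinct_types'] += filedata.num_distinct_types
        stats[letter] = counts
    stats['total'] = sum(stats.values(), Counter())
    return stats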
Example #3
def merge_entries(in_dir, out_dir):
    """
    Merge separate GEL entries generated for separate OED entries which
    are really different wordclasses of the same lemma.

    E.g. anger n. and anger v.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        target_log = set()
        ode_linked_parallels = _find_parallels(filecontent)
        for entry in filecontent.entries:
            target_id = None
            parent_id = entry.attribute('parentId')
            if parent_id:
                # Avoid loops (two entries treating each other as parent)
                if entry.oed_id() not in target_log:
                    target_id = parent_id
                    target_log.add(parent_id)
            elif entry.oed_lexid() in ode_linked_parallels:
                target_id = ode_linked_parallels[entry.oed_lexid()]

            if target_id:
                targets = filecontent.entry_by_id(target_id)
                targets = [t for t in targets if t.tag() == 's1']
                if targets:
                    for wc in entry.wordclass_sets():
                        targets[0].node.append(wc.node)
                    entry.node.getparent().remove(entry.node)
Example #4
def clean_attributes(in_dir, out_dir):
    """
    Clean up GEL data by adding/removing/adjusting various attributes.

    - Add a unique ID to every entry, wordclass set, morphset, and type;
    - Add a sort code to every entry, morphset, and type;
    - Fuzz start and end dates (approximate to nearest 50 or 100 years);
    - Remove unnecessary attributes from entry tags.
    """

    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for att in REMOVABLE:
                if att in entry.node.attrib:
                    entry.node.attrib.pop(att)
            entry.node.set('id', next_id())
            entry.node.set('sort', entry.sort)

            for block in entry.wordclass_sets():
                block.node.set('id', next_id())
                block.fuzz_dates()
                for morphset in block.morphsets():
                    morphset.node.set('id', next_id())
                    morphset.node.set('sort', morphset.sort)
                    morphset.fuzz_dates()
                    for typeunit in morphset.types():
                        typeunit.node.set('id', next_id())
                        typeunit.node.set('sort', typeunit.sort)
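next_id() is a project-internal helper not shown in these excerpts. A minimal sketch of one way it might work, assuming a simple process-wide counter (the prefix and zero-padding are invented for illustration):

import itertools

_counter = itertools.count(1)

def next_id():
    """Hypothetical stand-in for the real helper: a new unique ID per call."""
    return 'gel{:08d}'.format(next(_counter))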
Example #5
def add_inflections(in_dir, out_dir):
    """
    Add inflections to every wordclass set whose wordclass is listed
    in INFLECTABLE.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in [wcs for wcs in entry.wordclass_sets()
                                  if wcs.wordclass() in INFLECTABLE]:
                _process_wordclass_set(wordclass_set)
Example #6
    def iterate(self, **kwargs):
        if kwargs.get('letter'):
            letters = [kwargs.get('letter').lower()]
        else:
            letters = LETTERS
        for letter in letters:
            directory = os.path.join(self.in_dir, letter)
            iterator = FileIterator(in_dir=directory,
                                    out_dir=None,
                                    verbosity='low')
            for file_contents in iterator.iterate():
                for entry in file_contents.entries:
                    yield entry
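Since iterate() is a generator, callers can stream entries letter by letter without loading everything first; passing letter='a' restricts the walk to one subdirectory. A usage sketch (the owning class and its constructor are not shown above, so the instance here is hypothetical):

# 'pipeline' stands in for an instance of the class that defines iterate().
for entry in pipeline.iterate(letter='a'):
    print(entry.lemma)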
Example #7
def add_missing_inflections(in_dir, out_dir):
    """
    Add any inflections that are given in the entry's variants list, but
    which so far have not been generated by the inflection process.
    These get added as 'fragment' morphsets (@fragment=true)
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')

    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            entry_id = entry.oed_id()
            if entry.tag() == 's1' and VARIANTS_CACHE.id_exists(entry_id):
                for wordclass_set in entry.wordclass_sets():
                    if wordclass_set.wordclass() in ('NN', 'VB'):
                        _process_wordclass_set(wordclass_set, entry_id)
Example #8
def index_build_files(in_dir, out_file):
    """
    Write a CSV index of each file's number, first headword, and
    last headword.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=None, verbosity=None)

    index = []
    for filecontent in iterator.iterate():
        headwords = [entry.lemma for entry in filecontent.entries]
        if not headwords:
            continue  # skip empty files rather than hitting an IndexError
        index.append((iterator.file_number(),
                      headwords[0],
                      headwords[-1]))

    with open(out_file, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(index)
Example #9
class AlphaSort(object):
    """
    Redistribute entries into per-letter streams keyed by the first
    character of their sortcode, then sort each letter's files in place.
    """

    def __init__(self, in_dir, out_dir):
        self.iterator = FileIterator(in_dir=in_dir,
                                     out_dir=None,
                                     verbosity='low')
        self.out_dir = out_dir
        self.streams = {}

    def process(self):
        self.initialize()
        for filecontent in self.iterator.iterate():
            for entry in filecontent.entries:
                sortcode = entry.attribute('sort') or 'zzz'
                initial = sortcode[0]
                self.streams[initial].add_to_buffer(entry.node)
        # finish off writing anything left in the buffer
        for initial in alphabet:
            self.streams[initial].write()
        for initial in alphabet:
            print('sorting %s...' % initial)
            self.streams[initial].sort_in_place()

    def initialize(self):
        self.streams = {}
        for initial in alphabet:
            self.streams[initial] = LetterSet(initial, self.out_dir)
            self.streams[initial].purge_directory()
Example #10
    def sort_in_place(self):
        self.filecount = 0
        iterator = FileIterator(in_dir=self.out_dir,
                                out_dir=None,
                                verbosity=None)
        entries = []
        for filecontent in iterator.iterate():
            for entry in filecontent.entries:
                sortcode = entry.attribute('sort') or 'zzz'
                entries.append((sortcode, entry.tostring()))
        self.purge_directory()
        self.clear_buffer()
        entries.sort(key=lambda e: e[0])
        for _, xml in entries:
            node = etree.fromstring(xml)
            self.add_to_buffer(node)
        self.write()
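Examples #9 and #10 together imply most of the LetterSet interface. A skeletal reconstruction inferred purely from those call sites (method bodies omitted; not the project's actual code):

class LetterSet:
    """Per-letter output stream, as implied by AlphaSort; sort_in_place()
    itself is shown in example #10."""

    def __init__(self, initial, out_dir):
        self.initial = initial
        self.out_dir = out_dir
        self.filecount = 0

    def purge_directory(self):
        """Empty this letter's output directory."""

    def add_to_buffer(self, node):
        """Queue an entry node for the next write."""

    def clear_buffer(self):
        """Discard any queued entry nodes."""

    def write(self):
        """Flush buffered nodes to a numbered output file."""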
Example #11
    def process(self):
        iterator = FileIterator(in_dir=self.in_dir,
                                out_dir=None,
                                verbosity='low')

        # initial letter -> sortcode -> written form -> list of items
        forms = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        for filecontent in iterator.iterate():
            for entry in filecontent.entries:
                for wordclass_forms in _process_entry(entry):
                    for item in wordclass_forms:
                        initial = item.sort[0]
                        forms[initial][item.sort][item.form].append(item)

        for initial, sortcode_set in forms.items():
            self.subdir = os.path.join(self.out_dir, initial)
            self.clear_dir()
            self.filecount = 0
            self.initialize_doc()
            for sortcode, form_set in sorted(sortcode_set.items()):
                for form, items in sorted(form_set.items()):
                    entry = etree.Element('lemma', sort=sortcode)
                    form_node = etree.SubElement(entry, 'form')
                    form_node.text = form
                    lex_node = etree.Element('lex')
                    for item in items:
                        instance_node = etree.Element(
                            'instance',
                            wordclassId=item.wordclass_id,
                            typeId=item.type_id,
                            wordclass=item.wordclass,
                            start=str(item.start),
                            end=str(item.end),
                            base=item.baseform,
                        )
                        if item.xrid:
                            instance_node.set('xrid', item.xrid)
                        if item.xnode:
                            instance_node.set('xnode', item.xnode)
                        if item.is_variant:
                            instance_node.set('variant', 'true')
                        lex_node.append(instance_node)
                    entry.append(lex_node)
                    self.doc.append(entry)
                    if len(self.doc) > 10000:
                        self.writebuffer()
            self.writebuffer()
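For reference, each lemma element built in the inner loop has this shape (attribute names are taken from the code above; the values are invented, and xrid, xnode, and variant appear only when set):

<lemma sort="anger">
  <form>anger</form>
  <lex>
    <instance wordclassId="..." typeId="..." wordclass="NN"
              start="1350" end="2050" base="anger"/>
  </lex>
</lemma>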
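Across all of the examples, the FileIterator surface is consistent. A skeletal summary inferred purely from the call sites above (the real class is not shown in these excerpts; etree throughout appears to be lxml.etree, given calls such as etree.strip_attributes):

class FileIterator:
    """Skeleton inferred from usage; not the project's actual implementation."""

    def __init__(self, in_dir, out_dir=None, verbosity=None):
        # in_dir:    directory of GEL XML files to read
        # out_dir:   where modified files are written back (None = read-only pass)
        # verbosity: 'low' or None in these examples
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.verbosity = verbosity
        self.in_file = None  # path of the file currently being processed (example #2)

    def iterate(self):
        """Yield one file-content object per file, each with an .entries list.

        When out_dir is set, the (possibly modified) tree is presumably
        written back after the caller finishes with each file.
        """

    def file_number(self):
        """Sequence number of the current file (used in example #8)."""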