def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Find frequency values in the frequency_build data, and insert them
    into the GEL data.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    frequency_finder = FrequencyMemo(freq_dir)
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in entry.wordclass_sets():
                etree.strip_attributes(wordclass_set.node, 'size')

                # Build a frequency table for each type that has
                # frequency data; store None for those that don't.
                tables = {}
                for typeunit in wordclass_set.types():
                    frequencies = frequency_finder.find_frequencies(typeunit.id)
                    if frequencies:
                        tables[typeunit.id] = FrequencyTable(data=frequencies)
                    else:
                        tables[typeunit.id] = None

                # Attach each type's frequency table to its node.
                for typeunit in wordclass_set.types():
                    if tables[typeunit.id]:
                        typeunit.node.append(tables[typeunit.id].to_xml())

                # Sum the per-type tables into a single table for the
                # wordclass set as a whole.
                non_null_tables = [table for table in tables.values() if table]
                if non_null_tables:
                    wcs_table = sum_frequency_tables(non_null_tables)
                    wordclass_set.node.append(wcs_table.to_xml())
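# A minimal sketch of the table-summing step above, for illustration only.
# It assumes FrequencyTable wraps a {period: frequency} mapping exposed as
# a .data attribute (the data= constructor argument appears above, but the
# attribute name is an assumption); the real sum_frequency_tables() may
# work quite differently.
def _sum_frequency_tables_sketch(tables):
    combined = {}
    for table in tables:
        for period, frequency in table.data.items():
            combined[period] = combined.get(period, 0) + frequency
    return FrequencyTable(data=combined)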
def compile_data(self):
    """
    Compile per-letter data for every build file, then derive per-letter
    and overall statistics (file, entry, and type counts).
    """
    self.data = {}
    for letter in alphabet:
        print('Compiling index for %s...' % letter)
        self.data[letter] = []
        sub_dir = os.path.join(self.in_dir, letter)
        iterator = FileIterator(in_dir=sub_dir, out_dir=None, verbosity=None)
        for filecontent in iterator.iterate():
            filedata = _filedata_factory(iterator.in_file, filecontent.entries)
            self.data[letter].append(filedata)

    # Per-letter statistics
    self.stats = {}
    for letter in alphabet:
        self.stats[letter] = {'entries': 0, 'types': 0,
                              'distinct_types': 0, 'files': 0}
        for filedata in self.data[letter]:
            self.stats[letter]['files'] += 1
            self.stats[letter]['entries'] += filedata.num_entries
            self.stats[letter]['types'] += filedata.num_types
            self.stats[letter]['distinct_types'] += filedata.num_distinct_types

    # Overall totals
    self.stats['total'] = {'entries': 0, 'types': 0, 'files': 0,
                           'distinct_types': 0}
    for letter in alphabet:
        for z in ('entries', 'types', 'distinct_types', 'files'):
            self.stats['total'][z] += self.stats[letter][z]
def merge_entries(in_dir, out_dir):
    """
    Merge separate GEL entries generated for separate OED entries
    which are really different wordclasses of the same lemma,
    e.g. anger n. and anger v.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        target_log = set()
        ode_linked_parallels = _find_parallels(filecontent)
        for entry in filecontent.entries:
            target_id = None
            if entry.attribute('parentId'):
                # Avoid loops (two entries treating each other as parent)
                if entry.oed_id() not in target_log:
                    target_id = entry.attribute('parentId')
                    target_log.add(entry.attribute('parentId'))
            elif entry.oed_lexid() in ode_linked_parallels:
                target_id = ode_linked_parallels[entry.oed_lexid()]
            if target_id:
                targets = filecontent.entry_by_id(target_id)
                targets = [t for t in targets if t.tag() == 's1']
                if targets:
                    # Move this entry's wordclass sets into the target
                    # entry, then remove the now-empty entry.
                    for wc in entry.wordclass_sets():
                        targets[0].node.append(wc.node)
                    entry.node.getparent().remove(entry.node)
def clean_attributes(in_dir, out_dir):
    """
    Clean up GEL data by adding/removing/adjusting various attributes:

    - Add a unique ID to every entry, wordclass set, morphset, and type;
    - Add a sort code to every entry, morphset, and type;
    - Fuzz start and end dates (approximate to the nearest 50 or 100 years);
    - Remove unnecessary attributes from entry tags.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for att in REMOVABLE:
                if att in entry.node.attrib:
                    entry.node.attrib.pop(att)
            entry.node.set('id', next_id())
            entry.node.set('sort', entry.sort)
            for block in entry.wordclass_sets():
                block.node.set('id', next_id())
                block.fuzz_dates()
                for morphset in block.morphsets():
                    morphset.node.set('id', next_id())
                    morphset.node.set('sort', morphset.sort)
                    morphset.fuzz_dates()
                    for typeunit in morphset.types():
                        typeunit.node.set('id', next_id())
                        typeunit.node.set('sort', typeunit.sort)
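# Purely illustrative sketch of the date-fuzzing idea described in the
# docstring above; fuzz_dates() is defined on the wordclass-set and
# morphset classes and is not shown here. The granularity choice and the
# 1500 cut-off below are assumptions, not the project's actual rules.
def _fuzz_year_sketch(year, cutoff=1500):
    granularity = 100 if year < cutoff else 50
    return int(round(year / granularity) * granularity)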
def add_inflections(in_dir, out_dir):
    """
    Add inflections to every wordclass set whose wordclass is in the
    INFLECTABLE set.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in [wcs for wcs in entry.wordclass_sets()
                                  if wcs.wordclass() in INFLECTABLE]:
                _process_wordclass_set(wordclass_set)
def iterate(self, **kwargs):
    if kwargs.get('letter'):
        alphabet = [kwargs.get('letter').lower()]
    else:
        alphabet = LETTERS
    for letter in alphabet:
        directory = os.path.join(self.in_dir, letter)
        iterator = FileIterator(in_dir=directory, out_dir=None,
                                verbosity='low')
        for file_contents in iterator.iterate():
            for entry in file_contents.entries:
                yield entry
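# Example usage only: the class that owns iterate() is not shown here, so
# the name and directory path below are hypothetical. The point is that
# iterate() yields entries lazily, optionally restricted to one letter.
#
#   gel = GelIterator(in_dir='gel/build/alpha')   # class name assumed
#   for entry in gel.iterate(letter='A'):
#       print(entry.lemma)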
def add_missing_inflections(in_dir, out_dir):
    """
    Add any inflections that are given in the entry's variants list,
    but which so far have not been generated by the inflection process.

    These get added as 'fragment' morphsets (@fragment=true).
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            entry_id = entry.oed_id()
            if entry.tag() == 's1' and VARIANTS_CACHE.id_exists(entry_id):
                for wordclass_set in entry.wordclass_sets():
                    if wordclass_set.wordclass() in ('NN', 'VB'):
                        _process_wordclass_set(wordclass_set, entry_id)
def index_build_files(dir, out_file):
    """
    Write a CSV index of the build files, recording each file's number
    together with its first and last headword.
    """
    iterator = FileIterator(in_dir=dir, out_dir=None, verbosity=None)
    index = []
    for filecontent in iterator.iterate():
        headwords = [entry.lemma for entry in filecontent.entries]
        index.append((iterator.file_number(), headwords[0], headwords[-1]))
    with open(out_file, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(index)
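# The resulting CSV has one row per build file: file number, first
# headword, last headword. The values below are invented, purely to show
# the shape of the output.
#
#   1,aa,abandonedly
#   2,abandonee,abbey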
class AlphaSort(object):
    """
    Redistribute entries into per-letter streams, keyed by the first
    character of each entry's sort code, then sort each letter's files
    in place.
    """

    def __init__(self, in_dir, out_dir):
        self.iterator = FileIterator(in_dir=in_dir, out_dir=None,
                                     verbosity='low')
        self.out_dir = out_dir
        self.streams = {}

    def process(self):
        self.initialize()
        for filecontent in self.iterator.iterate():
            for entry in filecontent.entries:
                sortcode = entry.attribute('sort') or 'zzz'
                initial = sortcode[0]
                self.streams[initial].add_to_buffer(entry.node)

        # Finish off writing anything left in the buffer
        for initial in alphabet:
            self.streams[initial].write()

        for initial in alphabet:
            print('sorting %s...' % initial)
            self.streams[initial].sort_in_place()

    def initialize(self):
        self.streams = {}
        for initial in alphabet:
            self.streams[initial] = LetterSet(initial, self.out_dir)
            self.streams[initial].purge_directory()
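# Typical usage (directory paths are hypothetical):
#
#   AlphaSort(in_dir='gel/build/merged', out_dir='gel/build/alpha').process()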
def sort_in_place(self):
    self.filecount = 0
    iterator = FileIterator(in_dir=self.out_dir, out_dir=None,
                            verbosity=None)
    entries = []
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            sortcode = entry.attribute('sort') or 'zzz'
            entries.append((sortcode, entry.tostring()))
    self.purge_directory()
    self.clear_buffer()
    entries.sort(key=lambda e: e[0])
    for entry in entries:
        node = etree.fromstring(entry[1])
        self.add_to_buffer(node)
    self.write()
def process(self):
    """
    Collect every form generated for every entry, group the forms by
    initial letter and sort code, and write them out as <lemma> XML
    documents.
    """
    iterator = FileIterator(in_dir=self.in_dir, out_dir=None,
                            verbosity='low')
    forms = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_forms in _process_entry(entry):
                for item in wordclass_forms:
                    initial = item.sort[0]
                    forms[initial][item.sort][item.form].append(item)

    for initial, sortcode_set in forms.items():
        self.subdir = os.path.join(self.out_dir, initial)
        self.clear_dir()
        self.filecount = 0
        self.initialize_doc()
        for sortcode, form_set in sorted(sortcode_set.items()):
            for form, items in sorted(form_set.items()):
                entry = etree.Element('lemma', sort=sortcode)
                form_node = etree.SubElement(entry, 'form')
                form_node.text = form
                lex_node = etree.Element('lex')
                for item in items:
                    instance_node = etree.Element(
                        'instance',
                        wordclassId=item.wordclass_id,
                        typeId=item.type_id,
                        wordclass=item.wordclass,
                        start=str(item.start),
                        end=str(item.end),
                        base=item.baseform,
                    )
                    if item.xrid:
                        instance_node.set('xrid', item.xrid)
                    if item.xnode:
                        instance_node.set('xnode', item.xnode)
                    if item.is_variant:
                        instance_node.set('variant', 'true')
                    lex_node.append(instance_node)
                entry.append(lex_node)
                self.doc.append(entry)
                # Flush periodically so no single output file gets too big
                if len(self.doc) > 10000:
                    self.writebuffer()
        # Flush whatever is left for this initial
        self.writebuffer()
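# Shape of the XML written above (one <lemma> element per sortcode/form
# pair); the attribute values here are invented for illustration:
#
#   <lemma sort="...">
#     <form>angers</form>
#     <lex>
#       <instance wordclassId="..." typeId="..." wordclass="VB"
#                 start="1500" end="2000" base="anger" variant="true"/>
#     </lex>
#   </lemma>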