def compile_metadata(dirty_metadata):
    """Collapse indexed keys into lists and drop empty values.

    Every key is split at its first ``[``; the text before it is the key's
    base name.  When a base name is shared by more than one key (for
    example ``tag``, ``tag[0]``, ``tag[1]``) the values are merged into a
    single list stored under the base name, ordered by the first number
    found in the bracket suffix (no number means index 0).  A key whose
    base name occurs only once is copied verbatim, bracket suffix included.

    Args:
        dirty_metadata: Mapping of string keys to values.

    Returns:
        A new dict.  Values equal to ``''`` or ``None`` are skipped,
        ``None`` items are removed from list values, and any entry whose
        final value is falsy (including an emptied list) is left out.
    """
    index_pattern = re.compile(r'[0-9]+')

    # Base names shared by several keys get their values grouped.
    key_count = Counter(key.partition('[')[0] for key in dirty_metadata)
    grouped = {base: {} for base, count in key_count.items() if count > 1}

    # Reserve the grouped names first so they keep their leading position.
    metadata = dict.fromkeys(grouped)

    for key, value in dirty_metadata.items():
        if value == '' or value is None:
            continue
        base, _, suffix = key.partition('[')
        if base not in grouped:
            metadata[key] = value
            continue
        # Only the bracket suffix is searched, so digits that belong to the
        # base name ("tag2[0]") are never mistaken for the index.
        match = index_pattern.search(suffix)
        index = int(match.group()) if match else 0
        # An index -> value map tolerates sparse or large indices, which a
        # list pre-sized to the number of keys cannot; if two keys resolve
        # to the same index the later one wins.
        grouped[base][index] = value

    for base, by_index in grouped.items():
        metadata[base] = [by_index[index] for index in sorted(by_index)]

    # Strip None items from every list value, grouped or supplied as-is.
    # Only existing keys are reassigned, so iterating items() here is safe.
    for key, value in metadata.items():
        if isinstance(value, list):
            metadata[key] = [item for item in value if item is not None]

    # Truthiness filter: this also removes lists that ended up empty.
    return {key: value for key, value in metadata.items() if value}
def format_text(dict_format, entries, accent=None, phoneset=None,
                encoding='windows-1252'):
    """Print dictionary entries using the text format named *dict_format*.

    The format description is looked up in ``dict_formats``.  It supplies
    the default ``'accent'`` and ``'phoneset'``, a ``'word'`` callable that
    is applied to each word, and one template per combination of
    components, keyed by the component names joined with ``-`` (for
    example ``'entry-context-comment'``).

    Args:
        dict_format: Key into ``dict_formats``.
        entries: Iterable of 6-tuples ``(word, context, phonemes, comment,
            metadata, error)``.
        accent: Accent passed to ``load_phonemes``; falls back to the
            format's own accent when falsy.
        phoneset: Phoneme set passed to ``load_phonemes``; falls back to
            the format's own phoneset when falsy.
        encoding: Encoding handed to ``printf``.  It is replaced by
            ``'utf-8'`` whenever the phoneset is ``'ipa'``.

    Returns:
        None.  Entries are written through ``print``/``printf``; entries
        carrying an error have that error printed to stderr and are
        otherwise skipped.
    """
    fmt = dict_formats[dict_format]
    if not accent:
        accent = fmt['accent']
    if not phoneset:
        phoneset = fmt['phoneset']
    if phoneset == 'ipa':
        encoding = 'utf-8'
    phonemeset = load_phonemes(accent, phoneset)

    for word, context, phonemes, comment, metadata, error in entries:
        if error:
            print(error, file=sys.stderr)
            continue

        # The components present in this entry select the output template.
        components = []
        if word:
            components.append('entry')
            word = fmt['word'](word)
        if context:
            components.append('context')
        if comment is not None:
            # Metadata is only emitted as a prefix of an existing comment;
            # an entry without a comment drops its metadata.
            if metadata is not None:
                # Each metadata value is iterated, so it is presumably a
                # list of values per key -- TODO confirm against callers.
                meta = [
                    '{0}={1}'.format(key, value)
                    for key, values in sorted(metadata.items())
                    for value in values
                ]
                comment = '@@ {0} @@{1}'.format(' '.join(meta), comment)
            components.append('comment')
        if phonemes:
            phonemes = phonemeset.format(phonemes)

        if not components:
            # Nothing to format: emit a blank line.
            print()
        else:
            printf(fmt['-'.join(components)], encoding,
                   word, context, phonemes, comment)
def format_text(dict_format, entries, accent=None, phoneset=None,
                encoding='windows-1252'):
    """Write each dictionary entry to output in the *dict_format* layout.

    ``dict_formats[dict_format]`` provides the defaults for accent and
    phoneset (keys ``'accent'`` and ``'phoneset'``), a ``'word'`` callable
    used to transform the word, and the templates handed to ``printf``.
    A template is chosen by joining the names of the components an entry
    has (``'entry'``, ``'context'``, ``'comment'``) with ``-``.

    Args:
        dict_format: Name of the format to look up in ``dict_formats``.
        entries: Iterable yielding ``(word, context, phonemes, comment,
            metadata, error)`` tuples.
        accent: Overrides the format's accent when truthy.
        phoneset: Overrides the format's phoneset when truthy.
        encoding: Passed to ``printf``; forced to ``'utf-8'`` when the
            effective phoneset is ``'ipa'``.

    Returns:
        None.  Output goes through ``print``/``printf``; an entry with a
        truthy error only has the error printed to stderr.
    """
    # NOTE(review): SOURCE defines format_text twice in a row with the same
    # body; this later definition is the one bound at import time --
    # consider removing one of them.
    fmt = dict_formats[dict_format]
    accent = accent or fmt['accent']
    phoneset = phoneset or fmt['phoneset']
    if phoneset == 'ipa':
        encoding = 'utf-8'
    phonemeset = load_phonemes(accent, phoneset)

    for word, context, phonemes, comment, metadata, error in entries:
        if error:
            print(error, file=sys.stderr)
            continue

        # Names of the parts present in this entry; they pick the template.
        components = []
        if word:
            components.append('entry')
            word = fmt['word'](word)
        if context:
            components.append('context')
        if comment is not None:
            if metadata is not None:
                # Metadata is rendered as "key=value" pairs in sorted key
                # order and prepended to the comment between "@@" markers.
                # Values are iterated, so each is presumably a sequence --
                # verify against the caller.
                meta = [
                    '{0}={1}'.format(key, value)
                    for key, values in sorted(metadata.items())
                    for value in values
                ]
                comment = '@@ {0} @@{1}'.format(' '.join(meta), comment)
            components.append('comment')
        if phonemes:
            phonemes = phonemeset.format(phonemes)

        if components:
            printf(fmt['-'.join(components)], encoding,
                   word, context, phonemes, comment)
        else:
            # An entry with no word, context or comment prints a blank line.
            print()
def merge(metadatas):
    """Return the union of several metadata mappings.

    Args:
        metadatas: Iterable of mappings.  Elements without an ``items``
            method (for example ``None``) are ignored.

    Returns:
        A dict holding every key/value pair found, or ``None`` when no
        pair was found at all.  Keys are in sorted order when they can be
        sorted, otherwise in first-seen order.

    Raises:
        MergeConflict: If the same key is paired with two unequal values.
            It is raised with the two conflicting ``(key, value)`` items,
            the earlier one first.
    """
    merged = {}
    for metadata in metadatas:
        try:
            items = metadata.items()
        except AttributeError:
            # Not a mapping: skipped on purpose.
            continue
        for key, value in items:
            if key not in merged:
                merged[key] = value
                continue
            # Conflicts are detected by lookup and equality only, so values
            # and keys never have to be orderable (sorting the item tuples
            # raises TypeError for e.g. None vs int or dict vs dict).
            previous = (key, merged[key])
            current = (key, value)
            if previous != current:
                raise MergeConflict(previous, current)

    if not merged:
        return None

    try:
        ordered_keys = sorted(merged)
    except TypeError:
        # Keys of mixed, unorderable types: keep first-seen order.
        return merged
    return {key: merged[key] for key in ordered_keys}