def format_type(lst, string):
    """
    Format the word-type template line for a Wiktionary page.

    ``lst`` is the raw list of bracketed markers describing the word type
    (entries such as "['m']" or "['svrš.']") and ``string`` is the plain
    word type name without any further qualification.

    Two word types receive extra template arguments:

    * 'Именица' (noun): every gender marker found in ``lst`` ("['m']",
      "['ž']", "['s']") is emitted, in that fixed order, as a single
      space-separated ``|род=`` argument.
    * 'Глагол' (verb): a "['svrš.']" / "['nesvrš.']" marker is emitted as
      ``|вид=`` and a "['prel.']" / "['neprel.']" marker as ``|род=``,
      both stripped of brackets, quotes and commas and then passed through
      ``transliterate``.

    Returns the formatted word type block as a string ending in ``}}`` and
    a newline.
    """
    parts = ['{{српски-', string.lower()]

    if string == 'Именица':
        # Table order fixes the output order: masculine, feminine, neuter.
        gender_letters = (("['m']", 'м'), ("['ž']", 'ж'), ("['s']", 'с'))
        found = [letter for marker, letter in gender_letters if marker in lst]
        if found:
            parts.append('|род=' + ' '.join(found))
    elif string == 'Глагол':
        # Raw string: the previous non-raw pattern relied on invalid escape
        # sequences (\[ \] \,), which newer Python versions warn about.
        markup = r"[\[\]',]"

        asp = ''.join(x for x in lst if x in ("['svrš.']", "['nesvrš.']"))
        asp = transliterate(re.sub(markup, '', asp))

        gen = ''.join(x for x in lst if x in ("['prel.']", "['neprel.']"))
        gen = transliterate(re.sub(markup, '', gen))

        if asp:
            parts.append('|вид=' + asp)
        if gen:
            parts.append('|род=' + gen)

    parts.append('}}\n')
    return ''.join(parts)
def __init__(self, name, lat=False):
    """
    Set up a fresh object named ``name``.

    When ``lat`` equals True the stored name is the result of
    ``transliterate(name, lat)`` and the script is recorded as 'lat';
    otherwise the name is stored untouched and the script is 'cyr'.
    The name exactly as given is always kept in ``original_name``.
    """
    # Equality rather than truthiness: only values equal to True
    # select the transliterated form.
    use_latin = lat == True  # noqa: E712
    self.name = transliterate(name, lat) if use_latin else name
    self.script = 'lat' if use_latin else 'cyr'
    self.original_name = name

    # Initial state; presumably filled in later by other methods of the
    # class (not visible from here).
    self.type = []
    self.keys = 0
    self.unique = True
    self.syn = {}
    self.des = {}
    self.asc = {}
def __init__(self, name, lat=False):
    """
    Initialise the object from ``name``.

    ``lat == True`` stores ``transliterate(name, lat)`` as the name and
    marks the script as 'lat'; any other value stores ``name`` unchanged
    and marks the script as 'cyr'. ``original_name`` always holds the
    name as it was passed in.
    """
    # Deliberately an equality test, so only values equal to True switch
    # to the transliterated name.
    if lat == True:  # noqa: E712
        shown_name, script = transliterate(name, lat), 'lat'
    else:
        shown_name, script = name, 'cyr'

    self.name = shown_name
    self.script = script
    self.original_name = name

    # Starting values for the remaining attributes.
    self.type = []
    self.keys = 0
    self.unique = True
    self.syn, self.des, self.asc = {}, {}, {}
def format_syn_asc(word_list, lat=False):
    """
    Format the synonyms and associations block of an entry.

    Each item of ``word_list`` is split on whitespace. Tokens ending in
    '.' are treated as category tags and the remaining tokens as the word
    itself. For every item the output contains:

    * ``LINKS[tag] + ' '`` the first time an item's last token is a tag
      present in ``LINKS`` (each such tag is introduced only once),
    * the word, transliterated token by token with
      ``transliterate(token, lat)``, wrapped in ``[[...]]``,
    * every tag of the item, each wrapped in ``[[...]]``.

    Items are separated by commas. Items that are empty or contain only
    whitespace are skipped instead of raising IndexError.

    Returns the formatted block as a single string ('' for an empty list).
    """
    introduced = set()  # tags whose LINKS reference was already emitted
    chunks = []

    for item in word_list:
        tokens = item.split()
        if not tokens:
            # Nothing to link; indexing tokens[-1] would fail.
            continue

        pieces = []
        last = tokens[-1]
        if last.endswith('.') and last in LINKS and last not in introduced:
            introduced.add(last)
            pieces.append(LINKS[last] + ' ')

        words = [transliterate(t, lat) for t in tokens if not t.endswith('.')]
        pieces.append(' [[' + ' '.join(words) + ']]')
        pieces.extend(' [[' + t + ']]' for t in tokens if t.endswith('.'))

        chunks.append(''.join(pieces))

    # Joining by position rather than comparing each item with the last
    # one by value: a repeated item equal to the final item used to lose
    # its separating comma.
    return ','.join(chunks)
def format_syn_asc(word_list, lat=False):
    """
    Format the synonyms and associations block of an entry.

    Each item of ``word_list`` is split on whitespace. Tokens ending in
    '.' are treated as category tags and the remaining tokens as the word
    itself. For every item the output contains:

    * ``LINKS[tag] + ' '`` the first time an item's last token is a tag
      present in ``LINKS`` (each such tag is introduced only once),
    * the word, transliterated token by token with
      ``transliterate(token, lat)``, wrapped in ``[[...]]``,
    * every tag of the item, each wrapped in ``[[...]]``.

    Items are separated by commas. Items that are empty or contain only
    whitespace are skipped instead of raising IndexError.

    Returns the formatted block as a single string ('' for an empty list).
    """
    introduced = set()  # tags whose LINKS reference was already emitted
    chunks = []

    for item in word_list:
        tokens = item.split()
        if not tokens:
            # Nothing to link; indexing tokens[-1] would fail.
            continue

        pieces = []
        last = tokens[-1]
        if last.endswith('.') and last in LINKS and last not in introduced:
            introduced.add(last)
            pieces.append(LINKS[last] + ' ')

        words = [transliterate(t, lat) for t in tokens if not t.endswith('.')]
        pieces.append(' [[' + ' '.join(words) + ']]')
        pieces.extend(' [[' + t + ']]' for t in tokens if t.endswith('.'))

        chunks.append(''.join(pieces))

    # Joining by position rather than comparing each item with the last
    # one by value: a repeated item equal to the final item used to lose
    # its separating comma.
    return ','.join(chunks)
def make_entries(sinonimi, to_text, to_pickle, debug, breakpoint, lat=False):
    """
    Build Entry objects from the synonym dictionary and optionally export them.

    Iterates over ``sinonimi["Rečnik sinonima"]`` in order. The entry name
    is the key with any parenthesised part removed. Names reported by
    ``find_duplicate_keys`` are collected in a list under the cleaned name;
    all other entries are stored directly under their original key.

    Parameters:
        sinonimi:   mapping holding the dictionary under "Rečnik sinonima".
        to_text:    if true, write all entries to 'out/test0.txt' (UTF-8).
        to_pickle:  if true, pickle the entries mapping to 'out/synonymsX'.
        debug:      if true, print a Counter of the per-entry type keys and
                    call ``debug()`` on every entry.
        breakpoint: if truthy, stop after the entry with this index.
        lat:        forwarded to ``Entry`` and ``transliterate``.

    Returns None; prints 'Finish' when done.
    """
    dictionary = sinonimi["Rečnik sinonima"]
    names = []  # only filled and reported when debug is set
    entries = {}
    duplicates = find_duplicate_keys(sinonimi, len(dictionary))

    for index, key in enumerate(dictionary):
        record = dictionary[key]
        if debug:
            names.append(" ".join(record))

        entry_name = re.sub(r'\([^)]*\)', '', key).strip()
        entry = Entry(entry_name, lat)
        if entry_name in duplicates:
            entry.not_unique()
            entries.setdefault(entry_name, []).append(entry)
        else:
            entries[key] = entry

        # The word type is the first key of the record.
        typ = list(record.keys())[0]
        entry.set_type(typ)

        # Only the 'reference' sub-dictionary is not ordered, which is how
        # meanings are told apart from it; references carry no data and
        # are ignored.
        for body in record[typ]:
            if isinstance(body, OrderedDict):
                # NOTE(review): the lookup uses index 0 rather than ``body``
                # itself; kept as in the original - confirm the meanings
                # dict is always the first element.
                meanings = record[typ][0]
                for meaning in meanings:
                    entry.increase_key()
                    extract_meaning(meanings[meaning], entry, sinonimi,
                                    skey=meaning)

        # Optional early stop for debugging.
        if breakpoint and index == breakpoint:
            break

    # Print the entries to a text file.
    if to_text:
        # Context manager so the file is flushed and closed even on error.
        with codecs.open('out/test0.txt', 'w', encoding='utf8') as out:
            for key, value in entries.items():
                out.write('\n{{-start-}}\n')
                out.write("'''%s'''\n" % (transliterate(key, lat)))
                if isinstance(value, list):
                    wiki = []
                    final = len(value) - 1
                    for pos, item in enumerate(value):
                        if pos == 0:
                            wiki.extend(item.to_wiki(True, False))
                        elif pos == final:
                            wiki.extend(item.to_wiki(False, True))
                        else:
                            wiki.extend(item.to_wiki())
                    out.write(concat_entry(wiki))
                else:
                    out.write(concat_entry(value.to_wiki(True, True)))
                out.write('\n{{-stop-}}\n')

    # Print the entries to the console.
    if debug:
        print(Counter(names))
        for value in entries.values():
            if isinstance(value, list):
                for item in value:
                    item.debug()
            else:
                value.debug()

    # Pickle the entries.
    if to_pickle:
        with open('out/synonymsX', 'wb') as handle:
            pickle.dump(entries, handle)

    print('Finish')
def make_entries(sinonimi, to_text, to_pickle, debug, breakpoint, lat=False):
    """
    Build Entry objects from the synonym dictionary and optionally export them.

    Iterates over ``sinonimi["Rečnik sinonima"]`` in order. The entry name
    is the key with any parenthesised part removed. Names reported by
    ``find_duplicate_keys`` are collected in a list under the cleaned name;
    all other entries are stored directly under their original key.

    Parameters:
        sinonimi:   mapping holding the dictionary under "Rečnik sinonima".
        to_text:    if true, write all entries to 'out/test0.txt' (UTF-8).
        to_pickle:  if true, pickle the entries mapping to 'out/synonymsX'.
        debug:      if true, print a Counter of the per-entry type keys and
                    call ``debug()`` on every entry.
        breakpoint: if truthy, stop after the entry with this index.
        lat:        forwarded to ``Entry`` and ``transliterate``.

    Returns None; prints 'Finish' when done.
    """
    dictionary = sinonimi["Rečnik sinonima"]
    names = []  # only filled and reported when debug is set
    entries = {}
    duplicates = find_duplicate_keys(sinonimi, len(dictionary))

    for index, key in enumerate(dictionary):
        record = dictionary[key]
        if debug:
            names.append(" ".join(record))

        entry_name = re.sub(r'\([^)]*\)', '', key).strip()
        entry = Entry(entry_name, lat)
        if entry_name in duplicates:
            entry.not_unique()
            entries.setdefault(entry_name, []).append(entry)
        else:
            entries[key] = entry

        # The word type is the first key of the record.
        typ = list(record.keys())[0]
        entry.set_type(typ)

        # Only the 'reference' sub-dictionary is not ordered, which is how
        # meanings are told apart from it; references carry no data and
        # are ignored.
        for body in record[typ]:
            if isinstance(body, OrderedDict):
                # NOTE(review): the lookup uses index 0 rather than ``body``
                # itself; kept as in the original - confirm the meanings
                # dict is always the first element.
                meanings = record[typ][0]
                for meaning in meanings:
                    entry.increase_key()
                    extract_meaning(meanings[meaning], entry, sinonimi,
                                    skey=meaning)

        # Optional early stop for debugging.
        if breakpoint and index == breakpoint:
            break

    # Print the entries to a text file.
    if to_text:
        # Context manager so the file is flushed and closed even on error.
        with codecs.open('out/test0.txt', 'w', encoding='utf8') as out:
            for key, value in entries.items():
                out.write('\n{{-start-}}\n')
                out.write("'''%s'''\n" % (transliterate(key, lat)))
                if isinstance(value, list):
                    wiki = []
                    final = len(value) - 1
                    for pos, item in enumerate(value):
                        if pos == 0:
                            wiki.extend(item.to_wiki(True, False))
                        elif pos == final:
                            wiki.extend(item.to_wiki(False, True))
                        else:
                            wiki.extend(item.to_wiki())
                    out.write(concat_entry(wiki))
                else:
                    out.write(concat_entry(value.to_wiki(True, True)))
                out.write('\n{{-stop-}}\n')

    # Print the entries to the console.
    if debug:
        print(Counter(names))
        for value in entries.values():
            if isinstance(value, list):
                for item in value:
                    item.debug()
            else:
                value.debug()

    # Pickle the entries.
    if to_pickle:
        with open('out/synonymsX', 'wb') as handle:
            pickle.dump(entries, handle)

    print('Finish')