Exemplo n.º 1
0
def format_type(lst, string):
    """
    Format the word-type template block for a Wiktionary page.

    lst is the raw, unformatted collection of qualifiers describing the word
    type; its items are matched in their raw spelling (for example "['m']"
    or "['svrš.']"). string is the plain word type without qualifications.

    For 'Именица' the genders found in lst are added as '|род=...', always
    in the order м, ж, с and separated by single spaces. For 'Глагол' the
    aspect markers ('svrš.' / 'nesvrš.') and the verb-kind markers
    ('prel.' / 'neprel.') found in lst are stripped of brackets, quotes and
    commas, passed through transliterate() and appended as '|вид=...' and
    '|род=...' when non-empty. Any other word type gets no extra fields.

    Returns the formatted block as a string ending in '}}' and a newline.
    """
    # Raw marker -> Cyrillic abbreviation, listed in output order.
    gender_markers = (("['m']", 'м'), ("['ž']", 'ж'), ("['s']", 'с'))
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ("\[" and "\,"), which newer Python versions warn about.
    markup = r"[\[\]'\,]"

    formatted = ['{{српски-', string.lower()]
    if string == 'Именица':
        genders = [abbr for marker, abbr in gender_markers if marker in lst]
        if genders:
            formatted.append('|род=' + ' '.join(genders))
    elif string == 'Глагол':
        asp = ''.join([x for x in lst if x in ("['svrš.']", "['nesvrš.']")])
        asp = transliterate(re.sub(markup, "", asp))
        gen = ''.join([x for x in lst if x in ("['prel.']", "['neprel.']")])
        gen = transliterate(re.sub(markup, "", gen))
        if asp:
            formatted.append('|вид=' + asp)
        if gen:
            formatted.append('|род=' + gen)
    formatted.append('}}\n')
    return ''.join(formatted)
Exemplo n.º 2
0
def format_type(lst, string):
    """
    Build the '{{српски-...}}' word-type block for a Wiktionary page.

    lst holds the raw qualifiers of the word type (items such as "['m']" or
    "['nesvrš.']"); string is the plain word type without qualifications.

    Nouns ('Именица') get a '|род=' field listing every gender marker found
    in lst, in the fixed order м, ж, с, separated by spaces. Verbs
    ('Глагол') get '|вид=' built from the 'svrš.' / 'nesvrš.' markers and
    '|род=' built from the 'prel.' / 'neprel.' markers; each value is
    cleaned of brackets, quotes and commas and run through transliterate(),
    and is only added when the result is non-empty.

    Returns the block as a single string terminated by '}}' and a newline.
    """
    def _qualifier(options):
        # Concatenate the raw markers present in lst, then drop the list
        # punctuation. The pattern is a raw string because the earlier
        # non-raw literal contained invalid escapes ("\[" and "\,").
        raw = ''.join([x for x in lst if x in options])
        return transliterate(re.sub(r"[\[\]'\,]", "", raw))

    formatted = ['{{српски-' + string.lower()]
    if string == 'Именица':
        found = []
        for raw_marker, letter in (("['m']", 'м'),
                                   ("['ž']", 'ж'),
                                   ("['s']", 'с')):
            if raw_marker in lst:
                found.append(letter)
        if found:
            formatted.append('|род=' + ' '.join(found))
    elif string == 'Глагол':
        asp = _qualifier(("['svrš.']", "['nesvrš.']"))
        gen = _qualifier(("['prel.']", "['neprel.']"))
        if asp:
            formatted.append('|вид=' + asp)
        if gen:
            formatted.append('|род=' + gen)
    formatted.append('}}\n')
    return ''.join(formatted)
Exemplo n.º 3
0
 def __init__(self, name, lat=False):
     """
     Initialise an entry with its name and empty bookkeeping fields.

     name is stored unchanged in original_name. When lat compares equal to
     True, the displayed name is transliterate(name, lat) and the script is
     'lat'; otherwise the name is kept as given and the script is 'cyr'.
     """
     # Equality with True (not plain truthiness) is what selects Latin output.
     use_latin = lat == True
     self.name = transliterate(name, lat) if use_latin else name
     self.script = 'lat' if use_latin else 'cyr'
     self.original_name = name
     # Everything below starts empty and is filled in by other methods.
     self.type = []
     self.keys = 0
     self.unique = True
     self.syn = {}
     self.des = {}
     self.asc = {}
Exemplo n.º 4
0
 def __init__(self, name, lat=False):
     """
     Set up a new entry called name.

     The untouched name is always kept in original_name. If lat compares
     equal to True the visible name is produced by transliterate(name, lat)
     and script is set to 'lat'; in every other case the name is stored as
     is and script is 'cyr'. The remaining attributes start out empty.
     """
     shown_name, script = name, 'cyr'
     # The check is an equality test against True, not a truthiness test.
     if lat == True:
         shown_name, script = transliterate(name, lat), 'lat'
     self.name = shown_name
     self.script = script
     self.original_name = name
     self.type = []
     self.keys = 0
     self.unique = True
     self.syn = {}
     self.des = {}
     self.asc = {}
Exemplo n.º 5
0
def format_syn_asc(word_list, lat=False):
    """
    Format the synonyms / associations block of an entry.

    Each item of word_list is split on whitespace. Tokens ending in '.' are
    treated as category labels and linked as they are; the remaining tokens
    are transliterated with transliterate(token, lat), joined by spaces and
    wrapped in one '[[...]]' link. The first time an item's last token is a
    label present in the module-level LINKS mapping, the text LINKS[label]
    followed by a space is emitted in front of that item.

    Items are separated by ','. Blank items are skipped (they used to raise
    IndexError), and the separator is placed by position, so a word that
    happens to equal the last word of the list no longer loses its comma.

    Returns the formatted block as a single string ('' for an empty list).
    """
    pieces = []
    announced = set()  # labels whose LINKS text has already been emitted
    for word in word_list:
        tokens = word.split()
        if not tokens:
            continue
        chunk = []
        label = tokens[-1]
        if label.endswith('.') and label in LINKS and label not in announced:
            announced.add(label)
            chunk.append(LINKS[label] + ' ')
        plain = [transliterate(t, lat) for t in tokens if not t.endswith('.')]
        chunk.append(' [[' + ' '.join(plain) + ']]')
        # Category labels follow the word, each in its own link.
        chunk.extend(' [[' + t + ']]' for t in tokens if t.endswith('.'))
        pieces.append(''.join(chunk))
    return ','.join(pieces)
Exemplo n.º 6
0
def format_syn_asc(word_list, lat=False):
    """
    Build the synonyms / associations text of an entry.

    Every item of word_list is split into whitespace-separated tokens.
    Tokens that end in '.' are category labels; all other tokens are passed
    through transliterate(token, lat), joined with spaces and emitted as one
    '[[...]]' link, followed by one '[[label]]' link per category label.
    When the last token of an item is a label found in the module-level
    LINKS mapping, LINKS[label] plus a space is written before the item,
    but only the first time that label is encountered.

    A ',' is written after every item except the final one. The position in
    the list decides this, not the item's text, so repeated words keep their
    separators. Items that contain no tokens are ignored instead of raising
    IndexError.

    Returns the resulting string ('' when there is nothing to format).
    """
    # Drop blank items up front so the "last item" position is meaningful.
    split_items = [tokens for tokens in (k.split() for k in word_list) if tokens]
    final_pos = len(split_items) - 1
    string = []
    used_labels = set()
    for pos, s in enumerate(split_items):
        tail = s[-1]
        if tail.endswith('.') and tail in LINKS and tail not in used_labels:
            used_labels.add(tail)
            string.append(LINKS[tail] + ' ')
        words = ' '.join(
            [transliterate(x, lat) for x in s if not x.endswith('.')])
        string.append(' [[' + words + ']]')
        for token in s:
            if token.endswith('.'):
                string.append(' [[' + token + ']]')
        if pos != final_pos:
            string.append(',')
    return ''.join(string)
Exemplo n.º 7
0
def make_entries(sinonimi, to_text, to_pickle, debug, breakpoint, lat=False):
    """
    Build Entry objects from the synonym dictionary and optionally export them.

    Iterates over sinonimi["Rečnik sinonima"] in order. The entry name is the
    key with any parenthesised part removed. Names reported by
    find_duplicate_keys() are marked not unique and collected in a list
    stored under the shared name; every other entry is stored on its own
    under the original key.

    Parameters:
        sinonimi: mapping whose "Rečnik sinonima" item holds the dictionary.
        to_text: if truthy, write all entries to 'out/test0.txt'.
        to_pickle: if truthy, pickle the entries mapping to 'out/synonymsX'.
        debug: if truthy, print a Counter of the per-entry key names and
            call debug() on every entry.
        breakpoint: if truthy, stop after the entry with this index. A value
            of 0 is falsy and therefore never stops the loop.
        lat: forwarded to Entry() and transliterate().

    Returns None. Prints 'Finish' when done.
    """
    names = []
    entries = {}
    dictionary = sinonimi["Rečnik sinonima"]
    duplicates = find_duplicate_keys(sinonimi, len(dictionary))
    for i, s in enumerate(dictionary.keys()):
        record = dictionary[s]
        if debug:
            names.append(" ".join(record.keys()))
        entry_name = re.sub(r'\([^)]*\)', '', s).strip()
        entry = Entry(entry_name, lat)
        if entry_name in duplicates:
            entry.not_unique()
            entries.setdefault(entry_name, []).append(entry)
        else:
            entries[s] = entry

        # The first key of the record is the type of the word.
        typ = list(record.keys())[0]
        entry.set_type(typ)

        # Iterate over the meanings of the dictionary entry. Only the
        # 'reference' sub-dictionary is not ordered, which is how it is told
        # apart; it carries no data and is skipped.
        for body in record[typ]:
            if isinstance(body, OrderedDict):
                # NOTE(review): element [0] is read here rather than `body`;
                # kept as in the original - confirm whether `body` was meant.
                meanings = record[typ][0]
                for meaning in meanings:
                    entry.increase_key()
                    extract_meaning(
                        meanings[meaning],
                        entry,
                        sinonimi,
                        skey=meaning)

        # Early exit used while debugging.
        if breakpoint and i == breakpoint:
            break

    # Print the entries to a text file. The context manager closes the file
    # even if writing fails (the handle used to be left open).
    if to_text:
        with codecs.open('out/test0.txt', 'w', encoding='utf8') as out:
            for k, value in entries.items():
                out.write('\n{{-start-}}\n')
                out.write('\'\'\'%s\'\'\'\n' % (transliterate(k, lat)))
                if isinstance(value, list):
                    string = []
                    last = len(value) - 1
                    for pos, e in enumerate(value):
                        if pos == 0:
                            string.extend(e.to_wiki(True, False))
                        elif pos == last:
                            string.extend(e.to_wiki(False, True))
                        else:
                            string.extend(e.to_wiki())
                    out.write(concat_entry(string))
                else:
                    out.write(concat_entry(value.to_wiki(True, True)))
                out.write('\n{{-stop-}}\n')

    # Print the entries to the console.
    if debug:
        print(Counter(names))
        for value in entries.values():
            if isinstance(value, list):
                for e in value:
                    e.debug()
            else:
                value.debug()

    # Pickle the entries; the file is closed as soon as the dump completes.
    if to_pickle:
        with open('out/synonymsX', 'wb') as handle:
            pickle.dump(entries, handle)

    print('Finish')
Exemplo n.º 8
0
def make_entries(sinonimi, to_text, to_pickle, debug, breakpoint, lat=False):
    """
    Turn the synonym dictionary into Entry objects and export them on request.

    Walks the keys of sinonimi["Rečnik sinonima"] in order. For each key the
    parenthesised part is stripped to obtain the entry name and an Entry is
    created. If find_duplicate_keys() lists that name, the entry is flagged
    with not_unique() and appended to the list kept under the name;
    otherwise the entry is stored directly under the original key.

    Parameters:
        sinonimi: mapping containing the dictionary under "Rečnik sinonima".
        to_text: when truthy, the entries are written to 'out/test0.txt'.
        to_pickle: when truthy, the entries are pickled to 'out/synonymsX'.
        debug: when truthy, a Counter of the per-entry key names is printed
            and debug() is called on each entry.
        breakpoint: when truthy, processing stops after the entry at this
            index (0 is falsy, so it never triggers).
        lat: passed on to Entry() and transliterate().

    Returns None and prints 'Finish' at the end.
    """
    if debug:
        names = []
    entries = {}
    source = sinonimi["Rečnik sinonima"]
    duplicates = find_duplicate_keys(sinonimi, len(source))
    for i, s in enumerate(source.keys()):
        if debug:
            names.append(" ".join(source[s].keys()))
        entry_name = re.sub(r'\([^)]*\)', '', s).strip()
        entry = Entry(entry_name, lat)
        if entry_name not in duplicates:
            entries[s] = entry
        else:
            entry.not_unique()
            if entry_name in entries:
                entries[entry_name].append(entry)
            else:
                entries[entry_name] = [entry]

        # Getting the type of the word: it is the first key of the record.
        typ = list(source[s].keys())[0]
        entry.set_type(typ)

        # Only the 'reference' sub-dictionary is not ordered, so ordered
        # bodies are the ones holding meanings; references carry no data.
        for body in source[s][typ]:
            if not isinstance(body, OrderedDict):
                continue
            # NOTE(review): the meanings are always taken from element [0]
            # of the list, not from `body`; behaviour kept - please confirm.
            for meaning in source[s][typ][0]:
                entry.increase_key()
                extract_meaning(source[s][typ][0][meaning], entry, sinonimi,
                                skey=meaning)

        # Enable for debugging: stop once the requested index is reached.
        if breakpoint and i == breakpoint:
            break

    # To print the entries to a text file. `with` guarantees the file is
    # flushed and closed, which the previous bare open() never did.
    if to_text:
        with codecs.open('out/test0.txt', 'w', encoding='utf8') as out:
            for k in entries:
                item = entries[k]
                out.write('\n{{-start-}}\n')
                out.write('\'\'\'%s\'\'\'\n' % (transliterate(k, lat)))
                if isinstance(item, list):
                    string = []
                    for n, e in enumerate(item):
                        if n == 0:
                            string.extend(e.to_wiki(True, False))
                        elif n == (len(item) - 1):
                            string.extend(e.to_wiki(False, True))
                        else:
                            string.extend(e.to_wiki())
                    string = concat_entry(string)
                else:
                    string = concat_entry(item.to_wiki(True, True))
                out.write(string)
                out.write('\n{{-stop-}}\n')

    # To print the entries to the console.
    if debug:
        print(Counter(names))
        for k in entries:
            if isinstance(entries[k], list):
                for e in entries[k]:
                    e.debug()
            else:
                entries[k].debug()

    # To pickle: the output file is closed right after the dump.
    if to_pickle:
        with open('out/synonymsX', 'wb') as pickle_file:
            pickle.dump(entries, pickle_file)

    print('Finish')