Exemplo n.º 1
0
def make_files():
    """Create every derived resource file (.pkl and .conllu) from the
    "SG. Combined Data" spreadsheet, printing each creator's status message.

    Returns an empty string so the caller can print() the result harmlessly.
    """
    # Ensure the combined spreadsheet exists before loading it; the original
    # duplicated the list_xlsx() call in both branches — load exactly once.
    if not path.exists("SG. Combined Data.xlsx"):
        print(create_data_combo())
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")

    # Create 'Clean_GlossDict.pkl' and 'Clean_WordDict.pkl'
    print(create_clean_glossdict())
    print(create_clean_worddict())

    # Create 'A1 List.pkl', 'A2 List.pkl', 'A3 List.pkl', 'Active_Passive List.pkl',
    # 'Relative Options List.pkl', and 'Translations List.pkl'
    print(save_sorted_tags(sort_tag_levels(list_tag_levels(analyses))))

    # Create 'All POS Combos Used.pkl' and 'POS_taglist.pkl'
    print(save_all_pos_combos_list(analyses))
    print(create_pos_taglist())

    # Create 'Gloss_List.pkl' and 'Words_List.pkl'
    print(create_glosslist(analyses))
    print(create_wordlist(analyses))

    # Create 'SG POS-tagged combined.pkl' and 'SG POS-tagged separated.pkl'
    print(save_poslist(True))
    print(save_poslist(False))

    # Create 'sga_dipsgg-ud-test1.conllu' and 'sga_dipsgg-ud-test2.conllu'
    print(compile_SGG(open_obj("SG POS-tagged combined.pkl"), True))
    print(compile_SGG(open_obj("SG POS-tagged separated.pkl"), False))

    return ""
Exemplo n.º 2
0
def create_clean_worddict():
    """Build a {word: cleaned word} mapping from the "SG. Combined Data"
    spreadsheet and pickle it as 'Clean_WordDict.pkl'.

    Returns a status string describing the file created.
    """
    sg_data = list_xlsx("SG. Combined Data", "Sheet 1")
    # Build the dict in a single pass. Dicts preserve insertion order
    # (Python 3.7+), so the key order matches the original two-pass,
    # list-membership version, but without its O(n^2) `in list` scans.
    worddict = {}
    for row in sg_data:
        word = row[2]  # assumes column 2 holds the word form — TODO confirm
        if word and word not in worddict:
            worddict[word] = clean_word(word)
    save_obj("Clean_WordDict", worddict)
    return "Created file: 'Clean_WordDict.pkl'"
Exemplo n.º 3
0
def create_clean_glossdict():
    """Build a {gloss: cleaned gloss} mapping from the "SG. Combined Data"
    spreadsheet and pickle it as 'Clean_GlossDict.pkl'.

    Returns a status string describing the file created.
    """
    rows = list_xlsx("SG. Combined Data", "Sheet 1")
    # Collect glosses, skipping immediate repeats (rows for the same gloss
    # are adjacent, so comparing against the previous row deduplicates runs).
    glosses = []
    previous = ""
    for row in rows:
        gloss = row[10]  # presumably column 10 is the gloss text — verify
        if gloss != previous:
            glosses.append(gloss)
            previous = gloss
    glossdict = {gloss: clean_gloss(gloss) for gloss in glosses}
    save_obj("Clean_GlossDict", glossdict)
    return "Created file: 'Clean_GlossDict.pkl'"
Exemplo n.º 4
0
   Match tokenisation of OI material in gloss-lists to words

   Implement appropriate spacing in gloss-lists based on tokenisation

   Sequence gloss-lists
"""

from Clean_ExcelLists import create_data_combo
from Pickle import open_obj
from OpenXlsx import list_xlsx
from Clean_Glosses import clean_gloss, clean_word, clean_lemma
from Reassign_POS import clean_analysis, clean_onetag, create_glosslist, create_wordlist

# Load the combined spreadsheet, building it first if it is missing.
try:
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")
except FileNotFoundError:
    # Spreadsheet absent: create it, then retry the load.
    print(create_data_combo())
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")
# Load the pickled gloss/word lists, creating the .pkl files from the
# "SG. Combined Data" spreadsheet first if they do not exist yet.
try:
    glosslist = open_obj("Gloss_List.pkl")
    wordslist = open_obj("Words_List.pkl")
except FileNotFoundError:
    print(create_glosslist(analyses))
    print(create_wordlist(analyses))
    glosslist = open_obj("Gloss_List.pkl")
    wordslist = open_obj("Words_List.pkl")


# Map a word-separated gloss from the Hofman corpus to a list of POS-tagged words from the Bauer corpus
Exemplo n.º 5
0
    for i in somelist:
        tempi = []
        for j in i:
            try:
                if math.isnan(j):
                    tempi.append(False)
                else:
                    tempi.append(j)
            except TypeError:
                tempi.append(j)
        templist.append(tempi)
    return templist


# Gets only the required fields from the gloss spreadsheet, puts them in a
# preferable order, and replaces NaN instances with False via clean_nan().
glosslist = [gloss_keeplist] + list_xlsx("glosses_full", "glosses",
                                         gloss_droptup)
glosslist = [[g[0], g[1], g[3], g[2], g[5], g[4], g[6]] for g in glosslist]
glosslist = clean_nan(glosslist)
# Substitute a placeholder for any gloss whose translation field (index 4)
# is empty/falsy. The original appended in both branches of an if/else;
# a single unconditional append after the fix-up is equivalent.
fix_trans_list = list()
for i in glosslist:
    if not i[4]:
        i[4] = "* no translation available *"
    fix_trans_list.append(i)
glosslist = fix_trans_list
# Gets only the required fields from the analysis spreadsheet, puts them in a
# preferable order, and replaces NaN instances with False via clean_nan().
wordlist = [word_keeplist] + list_xlsx("glosses_words", "words", word_droptup)
wordlist = [[w[0], w[1], w[8], w[4], w[2], w[3], w[5], w[6], w[7]]
            for w in wordlist]
wordlist = clean_nan(wordlist)