Example No. 1
def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Iterates through the IGC, outputting a list of lemmas
    and their frequencies as well as all wordforms that appear
    alongside the lemma in the corpus. Useful for detecting whether
    a word only appears in certain contexts (e.g. fixed expressions)
    or whether a certain wordform never appears. Can be modified to 
    fit the user's need.
    """
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.lemma):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.lemma) < 3:  # Ignore very short words, likely to be particles
                continue
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.lemma,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                if database == 'NMO':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DCI_ELEMENT',
                        cursor=dci.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DCI_ELEMENT',
                                              cursor=dci.cursor)
                elif database == 'BIN':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DIM_ELEMENT',
                        cursor=dim.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DIM_ELEMENT',
                                              cursor=dim.cursor)
                if not query.exists and not query_lower.exists:
                    if word.lemma in freqdic:
                        if word.word_form not in freqdic[word.lemma]['orðmyndir']:
                            freqdic[word.lemma]['orðmyndir'].append(word.word_form)
                        freqdic[word.lemma]['tíðni'] += 1
                    else:
                        freqdic[word.lemma] = {'tíðni': 1, 'orðmyndir': [word.word_form]}
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)

    if IGC_folder == "malheildir/RMH/":
        with open(f'uttak/{database}/RMH_lemmur_med_ordmyndum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið RMH_lemmur_med_ordmyndum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/CC_BY/":
        with open(f'uttak/{database}/CC_BY_lemmur_med_ordmyndum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið CC_BY_lemmur_med_ordmyndum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/MIM/":
        with open(f'uttak/{database}/MIM_lemmur_med_ordmyndum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið MIM_lemmur_med_ordmyndum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/    
    ============================================================
        """)
    else:
        namefolder = IGC_folder.split("/")[3]
        with open(f'uttak/{database}/' + namefolder +
                  '_lemmur_med_ordmyndum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_lemmur_med_ordmyndum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
Example No. 2
def texttype_freqs(database, folder, prop_names):
    """
    Used to collect lemmas by the types of text they appear in and sort
    them by frequency. Filters the IGC in order to retrieve the desired
    results. The script can be modified according to the user's need 
    and to fit another corpus.  
    """
    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    filters = SQLDatabase(db_name='databases/IGC_filters.db') # Predefined stop-word list based on the IGC

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    xml_files = glob.glob(folder+'/**/*.xml', recursive=True)

    alltexttypes = []
    freqdic1 = {}
    freqdic2 = {}
    filebar = IncrementalBar('Progress', max = len(xml_files))
    for file in xml_files:
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                root = tree.getroot()
                textClass = root[0][2][0][0][0][0] # Retrieve the texttype tag from the XML file
                texttype = textClass.text 
                if texttype not in alltexttypes:
                    alltexttypes.append(texttype) # Collect all unique texttypes
                pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'] # The POS tags that should not be displayed in the results
                for word in tree.iter():
                    pos = word.attrib.get('type')
                    if pos is not None:
                        if not prop_names:
                            if pos.startswith('n') and pos.endswith('s'): # Ignore proper names
                                continue
                        if pos in pos_to_ignore:
                            continue
                        if (not all(i.isalpha() or i == '-' for i in word.text)): # Ignore all that are not alphabetic letters or hyphen 
                            continue
                        if len(word.text) < 3: # Ignore very short words, likely to be particles
                            continue
                        if word.text[-1] == '-': # Ignore words starting or ending with a hyphen (likely OCR errors)
                            continue
                        if word.text[0] == '-':
                            continue
                        if word.attrib.get('lemma') is not None:
                            lemma = word.attrib.get('lemma')
                            filter_query = SQLiteQuery(lemma,'filter','FILTER_WORD_FORMS', cursor=filters.cursor) # Ignore stop words
                            if filter_query.exists:
                                continue
                            else:
                                if database == 'DCI':
                                    query = SQLiteQuery(lemma, 'lemma','DCI_ELEMENT', cursor = dci.cursor) # Capitalized words included
                                    query_lower = SQLiteQuery(lemma.lower(),'lemma','DCI_ELEMENT', cursor = dci.cursor)
                                elif database == 'DIM':
                                    query = SQLiteQuery(lemma, 'lemma','DIM_ELEMENT', cursor = dim.cursor) # Capitalized words included
                                    query_lower = SQLiteQuery(lemma.lower(),'lemma','DIM_ELEMENT', cursor = dim.cursor)
                                if not query.exists and not query_lower.exists: # If the word is not found in the DIM or the stopwords
                                    if lemma not in freqdic1: # Collect total freqs
                                        freqdic1[lemma] = 1
                                    else:
                                        freqdic1[lemma] += 1
                                    if (lemma,texttype) not in freqdic2: # Collect texttype freqs
                                        freqdic2[(lemma,texttype)] = 1
                                    else:
                                        freqdic2[(lemma,texttype)] += 1
            except IndexError:
                continue
            except ET.ParseError:
                continue

        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    print("""
    ============================================================
    Sorting frequencies by text types. 
    ============================================================
    """)

    tempfinal = []
    bar1 = IncrementalBar('Progress', max = len(freqdic1))
    for key, value in sorted(freqdic1.items()): # Lemma, total freq
        tempf = []
        tempf.append(key)
        temp = []
        for k, v in freqdic2.items(): 
            if k[0] == key:
                temp.append((k[1], v)) # A list of all possible texttypes that appear with the lemma
        for tt in alltexttypes:
            if tt in [item[0] for item in temp]:
                continue
            else:
                temp.append((tt, 0)) 
        tempf.append(value)
        for tup in sorted(temp):
            tempf.append(tup[1]) 
        tempfinal.append(tempf) # The format of this list is [lemma, totalfreq, texttype_a freq, texttype_b freq...]
        bar1.next()
        sys.stdout.flush()
    bar1.finish()

    header = ['Word', 'Total freq'] + sorted(alltexttypes)

    if folder == "corpora/IGC/":
        with open(f"output/{database}/IGC_texttypes.csv", mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Output file IGC_texttypes.csv is ready and can be found
    in the output/{database}/ directory.
    ============================================================
        """)
    elif folder == "corpora/IGC/CC_BY/":
        with open(f'output/{database}/CC_BY_texttypes.csv', mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Output file CC_BY_texttypes.csv is ready and can be found
    in the output/{database}/ directory.
    ============================================================
        """)
    elif folder == "corpora/IGC/TIC/":
        with open(f'output/{database}/TIC_texttypes.csv', mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)
        print(f"""
    ============================================================
    Output file TIC_texttypes.csv is ready and can be found
    in the output/{database}/ directory.
    ============================================================
        """)
    else:
        namefolder = folder.split("/")[3]
        with open(f'output/{database}/'+namefolder+"_texttypes.csv", mode='w+') as outputfile:
            csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for i in tempfinal:
                csvwriter.writerow(i)

        print(f"""
    ============================================================
    Output file {namefolder}_texttypes.csv is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
Example No. 3
def wordform_output(IGC_folder, prop_names):
    """
    Iterates over the input corpus and returns the word forms not found 
    in the DIM, ordered by frequency. Can be altered for other 
    databases or corpora. 
    """
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    for word in IGC.extract(forms=True, lemmas=False, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.word_form):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.word_form) < 3:
                continue
            if '-' in [word.word_form[0], word.word_form[1], word.word_form[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.word_form,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                query = SQLiteQuery(
                    word.word_form,
                    'word_form',
                    'DIM_ELEMENT',
                    cursor=dim.cursor)  # Capitalized words included
                query_lower = SQLiteQuery(word.word_form.lower(),
                                          'word_form',
                                          'DIM_ELEMENT',
                                          cursor=dim.cursor)
                if not query.exists and not query_lower.exists:  # If the word is not found in the DIM or the stopwords
                    if word.word_form in freqdic:
                        freqdic[word.word_form] += 1
                    else:
                        freqdic[word.word_form] = 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)

    if IGC_folder == "corpora/IGC/":
        with open('output/DIM/IGC_wordform.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file IGC_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
    elif IGC_folder == "corpora/IGC/CC_BY/":
        with open('output/DIM/CC_BY_wordform.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file CC_BY_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)

    elif IGC_folder == "corpora/IGC/TIC/":
        with open('output/DIM/TIC_wordform.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file TIC_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)

    else:
        namefolder = IGC_folder.split("/")[3]
        with open('output/DIM/' + namefolder + '_wordform.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Output file {namefolder}_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
Example No. 4
def lemma_output(database, IGC_folder, prop_names):
    """
    Iterates over the input corpus and returns the lemmas not found 
    in the input database, ordered by frequency. Also includes 
    information on the grammatical number of nouns, indicating if a noun only 
    exists in either the singular or the plural form (and whether 
    the automatic lemmatization/pos tagging is off). Can be altered 
    for other databases or corpora. 
    """
    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    for word in IGC.extract(forms=False, lemmas=True, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.lemma):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.lemma) < 3:  # Ignore very short words, likely to be particles
                continue
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.lemma,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                if database == 'DCI':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DCI_ELEMENT',
                        cursor=dci.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DCI_ELEMENT',
                                              cursor=dci.cursor)
                elif database == 'DIM':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DIM_ELEMENT',
                        cursor=dim.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DIM_ELEMENT',
                                              cursor=dim.cursor)

                if not query.exists and not query_lower.exists:  # If the word is not found in the DIM or the stopwords
                    if word.lemma in freqdic:
                        if word.pos[0] == 'n':  # if word is a noun
                            freqdic[word.lemma]['freq'] += 1
                            if word.pos[2] == 'e':  # if the noun is singular (eintala)
                                freqdic[word.lemma]['number']['sing'] += 1
                            elif word.pos[2] == 'f':  # if the noun is plural (fleirtala)
                                freqdic[word.lemma]['number']['plur'] += 1
                        else:
                            freqdic[word.lemma]['freq'] += 1  # Necessary for proper names, nouns with no number
                            freqdic[word.lemma]['number']['no_number'] += 1
                    else:
                        if word.pos[0] == 'n':
                            if word.pos[2] == 'e':
                                freqdic[word.lemma] = {
                                    'freq': 0,
                                    'number': {
                                        'sing': 1,
                                        'plur': 0,
                                        'no_number': 0
                                    }
                                }
                            elif word.pos[2] == 'f':
                                freqdic[word.lemma] = {
                                    'freq': 0,
                                    'number': {
                                        'sing': 0,
                                        'plur': 1,
                                        'no_number': 0
                                    }
                                }
                            else:
                                freqdic[word.lemma] = {
                                    'freq': 0,
                                    'number': {
                                        'sing': 0,
                                        'plur': 0,
                                        'no_number': 1
                                    }
                                }
                        else:
                            freqdic[word.lemma] = {
                                'freq': 0,
                                'number': {
                                    'sing': 0,
                                    'plur': 0,
                                    'no_number': 1
                                }
                            }
                        freqdic[word.lemma]['freq'] = 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)
    if IGC_folder == "corpora/IGC/":
        with open(f'output/{database}/IGC_lemma.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Output file IGC_lemma.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
    elif IGC_folder == "corpora/IGC/CC_BY/":
        with open(f'output/{database}/CC_BY_lemma.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Output file CC_BY_lemma.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
    elif IGC_folder == "corpora/IGC/TIC/":
        with open(f'output/{database}/TIC_lemma.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Output file TIC_lemma.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
    else:
        namefolder = IGC_folder.split("/")[3]
        with open(f'output/{database}/' + namefolder + '_lemma.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Output file {namefolder}_lemma.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
Example No. 5
def lemma_output(database, IGC_folder, prop_names):
    """
    Iterates over the input corpus and returns the lemmas not found 
    in the input database, ordered by frequency. Also includes 
    information on the grammatical number (tala) of nouns, indicating if a noun only 
    exists in either the singular or the plural form (and whether 
    the automatic lemmatization/pos tagging is off). Can be altered 
    for other databases or corpora. 
    """
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    for word in RMH.extract(forms=False, lemmas=True, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.lemma):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.lemma) < 3:  # Ignore very short words, likely to be particles
                continue
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.lemma,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                if database == 'NMO':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DCI_ELEMENT',
                        cursor=dci.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DCI_ELEMENT',
                                              cursor=dci.cursor)
                elif database == 'BIN':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DIM_ELEMENT',
                        cursor=dim.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DIM_ELEMENT',
                                              cursor=dim.cursor)

                if not query.exists and not query_lower.exists:  # If the word is not found in the BIN or the stopwords
                    if word.lemma in freqdic:
                        if word.pos[0] == 'n':  # if word is a noun
                            freqdic[word.lemma]['tíðni'] += 1
                            if word.pos[2] == 'e':  # if the noun is singular (eintala)
                                freqdic[word.lemma]['tala']['eintala'] += 1
                            elif word.pos[2] == 'f':  # if the noun is plural (fleirtala)
                                freqdic[word.lemma]['tala']['fleirtala'] += 1
                        else:
                            freqdic[word.lemma]['tíðni'] += 1  # Necessary for proper names, nouns with no number
                            freqdic[word.lemma]['tala']['engin_tala'] += 1
                    else:
                        if word.pos[0] == 'n':
                            if word.pos[2] == 'e':
                                freqdic[word.lemma] = {
                                    'tíðni': 0,
                                    'tala': {
                                        'eintala': 1,
                                        'fleirtala': 0,
                                        'engin_tala': 0
                                    }
                                }
                            elif word.pos[2] == 'f':
                                freqdic[word.lemma] = {
                                    'tíðni': 0,
                                    'tala': {
                                        'eintala': 0,
                                        'fleirtala': 1,
                                        'engin_tala': 0
                                    }
                                }
                            else:
                                freqdic[word.lemma] = {
                                    'tíðni': 0,
                                    'tala': {
                                        'eintala': 0,
                                        'fleirtala': 0,
                                        'engin_tala': 1
                                    }
                                }
                        else:
                            freqdic[word.lemma] = {
                                'tíðni': 0,
                                'tala': {
                                    'eintala': 0,
                                    'fleirtala': 0,
                                    'engin_tala': 1
                                }
                            }
                        freqdic[word.lemma]['tíðni'] = 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)
    if IGC_folder == "malheildir/RMH/":
        with open(f'uttak/{database}/RMH_lemmur.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið RMH_lemmur.freq er tilbúið og er að finna í 
    undirmöppunni uttak/{database}/
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/CC_BY/":
        with open(f'uttak/{database}/CC_BY_lemmur.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið CC_BY_lemmur.freq er tilbúið og er að finna í 
    undirmöppunni uttak/{database}/    
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/MIM/":
        with open(f'uttak/{database}/MIM_lemmur.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print(f"""
    ============================================================
    Úttaksskjalið MIM_lemmur.freq er tilbúið og er að finna í 
    undirmöppunni uttak/{database}/    
    ============================================================
        """)
    else:
        namefolder = IGC_folder.split("/")[3]
        with open(f'uttak/{database}/' + namefolder + '_lemmur.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_lemmur.freq er tilbúið og er að finna í 
    undirmöppunni uttak/{database}/    
    ============================================================
        """)
Example No. 6
def user_defined_freqlist(database,filterbase,corpus):
    """
    Iterates through a user-defined corpus and compares
    the results to a user-defined database, filtering out
    stopwords if the user has defined a stopword database.
    Returns a frequency word list.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus+'/**/*.txt', recursive=True)
    if filterbase != 'None':
        filters = SQLDatabase(db_name=filterbase)
    else:
        pass

    outdict = {}
    
    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    filebar = IncrementalBar('Progress', max = len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            f = content.read()
            words = f.split()
            for w in words:
                if w[-1] == '-': # if a word starts or ends in a hyphen, ignore it (likely OCR error)
                    continue
                if w[0] == '-':
                    continue
                if (not all(i.isalpha() or i == '-' for i in w)): # if a word contains anything but an alphabetic letter or hyphen, ignore it
                    continue
                if filterbase != 'None': # if a stopword database has been defined, filter the results
                    filter_query = SQLiteQuery(w,'filter','FILTER_WORD_FORMS', cursor=filters.cursor) 
                    if filter_query.exists:
                        continue
                    else:
                        query = SQLiteQuery(w,'word','LEXICON_WORD', cursor = db.cursor) # parameters must be updated if the database format is changed                 
                        query_lower = SQLiteQuery(w.lower(),'word','LEXICON_WORD', cursor = db.cursor) 
                        if not query.exists and not query_lower.exists: 
                            if len(w) >= 3:
                                if w in outdict:
                                    outdict[w] += 1
                                else:
                                    outdict[w] = 1
                else:
                    query = SQLiteQuery(w,'word','LEXICON_WORD', cursor = db.cursor)                 
                    query_lower = SQLiteQuery(w.lower(),'word','LEXICON_WORD', cursor = db.cursor) 
                    if not query.exists and not query_lower.exists: 
                        if len(w) > 1:
                            if w in outdict:
                                outdict[w] += 1
                            else:
                                outdict[w] = 1
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    output_file = input("""
    ============================================================
    Please indicate what your output file should be called,
    followed by .freq

    Example: lexicon_frequencylist.freq
    ============================================================
    """)

    with open('output/user_defined/'+output_file, mode='w+') as outputfile:
        candidates = {k: v for k, v in sorted(outdict.items(),
                        key=lambda item: item[1], reverse=True)}
        for key, value in candidates.items():
            outputfile.write(key+': '+str(value)+ '\n')

    print(f"""
    ============================================================
    Output file {output_file} is ready and can be 
    found at the output/user_defined/ directory.
    ============================================================
    """)
Example No. 7
def wordform_output(IGC_folder, prop_names):
    """
    Iterates over the input corpus and returns the word forms not found 
    in the BIN, ordered by frequency. Can be altered for other 
    databases or corpora. 
    """
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    for word in RMH.extract(forms=True, lemmas=False, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.word_form):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.word_form) < 3:
                continue
            if '-' in [word.word_form[0], word.word_form[1], word.word_form[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.word_form,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                query = SQLiteQuery(
                    word.word_form,
                    'word_form',
                    'DIM_ELEMENT',
                    cursor=dim.cursor)  # Capitalized words included
                query_lower = SQLiteQuery(word.word_form.lower(),
                                          'word_form',
                                          'DIM_ELEMENT',
                                          cursor=dim.cursor)
                if not query.exists and not query_lower.exists:  # If the word is not found in the BIN or the stopwords
                    if word.word_form in freqdic:
                        freqdic[word.word_form] += 1
                    else:
                        freqdic[word.word_form] = 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)

    if IGC_folder == "malheildir/RMH/":
        with open('uttak/BIN/RMH_ordmyndir.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Úttaksskjalið RMH_ordmyndir.freq er tilbúið og er að finna í 
    undirmöppunni uttak/BIN/
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/CC_BY/":
        with open('uttak/BIN/CC_BY_ordmyndir.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Úttaksskjalið CC_BY_ordmyndir.freq er tilbúið og er að finna í 
    undirmöppunni uttak/BIN/
    ============================================================
        """)

    elif IGC_folder == "malheildir/RMH/MIM/":
        with open('uttak/BIN/MIM_ordmyndir.freq', mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Úttaksskjalið MIM_ordmyndir.freq er tilbúið og er að finna í 
    undirmöppunni uttak/BIN/
    ============================================================
        """)

    else:
        namefolder = IGC_folder.split("/")[3]
        with open('uttak/BIN/' + namefolder + '_ordmyndir.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(
                    freqdic.items(), key=lambda item: item[1], reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_ordmyndir.freq er tilbúið og er að finna í 
    undirmöppunni uttak/BIN/
    ============================================================
        """)
Example No. 8
def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Iterates through the IGC, outputting a list of lemmas
    and their frequencies as well as all wordforms that appear
    alongside the lemma in the corpus. Useful for detecting whether
    a word only appears in certain contexts (e.g. fixed expressions)
    or whether a certain wordform never appears. Can be modified to 
    fit the user's need.
    """
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    dci = SQLDatabase(db_name='databases/dci.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            if not prop_names:
                if word.pos.startswith('n') and word.pos.endswith('s'):  # Ignore proper names
                    continue
            if word.pos in pos_to_ignore:
                continue
            if not all(i.isalpha() or i == '-' for i in word.lemma):  # Ignore if not only letters or letters and hyphen
                continue
            if len(word.lemma) < 3:  # Ignore very short words, likely to be particles
                continue
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:  # Ignore words that start with '[anyLetter?]-' or end with '-'
                continue
            # Ignore unwanted words, such as names, foreign words, stopwords, abbreviations
            filter_query = SQLiteQuery(word.lemma,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            else:
                if database == 'DCI':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DCI_ELEMENT',
                        cursor=dci.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DCI_ELEMENT',
                                              cursor=dci.cursor)
                elif database == 'DIM':
                    query = SQLiteQuery(
                        word.lemma, 'lemma', 'DIM_ELEMENT',
                        cursor=dim.cursor)  # Capitalized words included
                    query_lower = SQLiteQuery(word.lemma.lower(),
                                              'lemma',
                                              'DIM_ELEMENT',
                                              cursor=dim.cursor)
                if not query.exists and not query_lower.exists:
                    if word.lemma in freqdic:
                        if word.word_form not in freqdic[word.lemma]['wordforms']:
                            freqdic[word.lemma]['wordforms'].append(word.word_form)
                        freqdic[word.lemma]['freq'] += 1
                    else:
                        freqdic[word.lemma] = {'freq': 1, 'wordforms': [word.word_form]}
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)

    if IGC_folder == "corpora/IGC/":
        with open(f'output/{database}/IGC_lemma_plus_wordform.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file IGC_lemma_plus_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
    elif IGC_folder == "corpora/IGC/CC_BY/":
        with open(f'output/{database}/CC_BY_lemma_plus_wordform.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file CC_BY_lemma_plus_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
    elif IGC_folder == "corpora/IGC/TIC/":
        with open(f'output/{database}/TIC_lemma_plus_wordform.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')
        print("""
    ============================================================
    Output file TIC_lemma_plus_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
    else:
        namefolder = IGC_folder.split("/")[3]
        with open(f'output/{database}/' + namefolder +
                  '_lemma_plus_wordform.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(freqdic.items(),
                                   key=lambda item: item[1]['freq'],
                                   reverse=True)
            }
            for key, value in candidates.items():
                outputfile.write(key + ': ' + str(value) + '\n')

        print(f"""
    ============================================================
    Output file {namefolder}_lemma_plus_wordform.freq is ready 
    and can be found in the output/{database}/ directory.
    ============================================================
        """)
Example No. 9
def user_defined_collocations(database, filterbase, corpus):
    """
    Iterates through the corpus and retrieves the words that do 
    not appear in the database. Collects 5 word collocations on
    every word, two words before and after the candidate word. 
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    if filterbase not in ['n', 'N']:
        filters = SQLDatabase(db_name=filterbase)
    else:
        pass  # if there is no filterbase, ignore this step

    outdict = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    filebar = IncrementalBar('Framvinda', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            f = content.read()
            words = f.split()
            for i, w in enumerate(words):
                if w[-1] == '-':  # if a word starts or ends with a hyphen, ignore it (likely an OCR error)
                    continue
                if w[0] == '-':
                    continue
                # ignore words containing anything other than letters or a hyphen
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                if filterbase not in [
                        'n', 'N'
                ]:  # if a stopword database has been defined, filter the results
                    filter_query = SQLiteQuery(w,
                                               'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    else:
                        query = SQLiteQuery(
                            w, 'word', 'LEXICON_WORD', cursor=db.cursor
                        )  # parameters must be updated if the database format is changed
                        query_lower = SQLiteQuery(w.lower(),
                                                  'word',
                                                  'LEXICON_WORD',
                                                  cursor=db.cursor)
                        if not query.exists and not query_lower.exists:  # if the word is found neither in the database nor in the filters
                            if len(w) > 1:
                                if i - 2 < 0:  # collects 2 words before and after the candidate
                                    w1 = ""
                                else:
                                    w1 = str(words[i - 2])
                                if i - 1 < 0:
                                    w2 = ""
                                else:
                                    w2 = str(words[i - 1])
                                if i + 1 > len(words) - 1:
                                    w4 = ""
                                else:
                                    w4 = str(words[i + 1])
                                if i + 2 > len(words) - 1:
                                    w5 = ""
                                else:
                                    w5 = str(words[i + 2])
                                if w in outdict:
                                    if str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 +
                                           ' ' +
                                           w5) not in outdict[w]['orðstaða']:
                                        outdict[w]['orðstaða'][str(w1 + ' ' +
                                                                   w2 + ' ' +
                                                                   w + ' ' +
                                                                   w4 + ' ' +
                                                                   w5)] = 1
                                    else:
                                        outdict[w]['orðstaða'][str(w1 + ' ' +
                                                                   w2 + ' ' +
                                                                   w + ' ' +
                                                                   w4 + ' ' +
                                                                   w5)] += 1
                                    outdict[w]['tíðni'] += 1
                                else:
                                    outdict[w] = {}
                                    outdict[w]['tíðni'] = 1
                                    outdict[w]['orðstaða'] = {
                                        str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5):
                                        1
                                    }

                else:
                    query = SQLiteQuery(w,
                                        'word',
                                        'LEXICON_WORD',
                                        cursor=db.cursor)
                    query_lower = SQLiteQuery(w.lower(),
                                              'word',
                                              'LEXICON_WORD',
                                              cursor=db.cursor)
                    if not query.exists and not query_lower.exists:
                        if len(w) > 1:
                            if i - 2 < 0:
                                w1 = ""
                            else:
                                w1 = str(words[i - 2])
                            if i - 1 < 0:
                                w2 = ""
                            else:
                                w2 = str(words[i - 1])
                            if i + 1 > len(words) - 1:
                                w4 = ""
                            else:
                                w4 = str(words[i + 1])
                            if i + 2 > len(words) - 1:
                                w5 = ""
                            else:
                                w5 = str(words[i + 2])
                            if w in outdict:
                                if str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 +
                                       ' ' + w5) not in outdict[w]['orðstaða']:
                                    outdict[w]['orðstaða'][str(w1 + ' ' + w2 +
                                                               ' ' + w + ' ' +
                                                               w4 + ' ' +
                                                               w5)] = 1
                                else:
                                    outdict[w]['orðstaða'][str(w1 + ' ' + w2 +
                                                               ' ' + w + ' ' +
                                                               w4 + ' ' +
                                                               w5)] += 1
                                outdict[w]['tíðni'] += 1
                            else:
                                outdict[w] = {}
                                outdict[w]['tíðni'] = 1
                                outdict[w]['orðstaða'] = {
                                    str(w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5):
                                    1
                                }

        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    output_file = input("""
    ============================================================
    Skrifaðu það sem þú vilt að úttaksskjalið heiti með 
    endingunni .freq
 
    Dæmi: ordasafn_ordstodulyklar.freq
    ============================================================
    """)

    with open('uttak/notendagogn/' + output_file, mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(outdict.items(),
                               key=lambda item: item[1]['tíðni'],
                               reverse=True)
        }  # Sort the candidates by their total frequencies
        for key, item in candidates.items():
            for counter, dictitem in enumerate(item.items()):
                if counter % 2 == 0:
                    freq = dictitem[1]
                elif counter % 2 != 0:
                    # Sort the sentence examples by their frequencies
                    sorted_sents = {
                        k: v
                        for k, v in sorted(dictitem[1].items(),
                                           key=lambda item: item[1],
                                           reverse=True)
                    }
                    # Keep only the five most frequent examples; the cutoff can be changed
                    sents = list(sorted_sents)[:5]
                    outputfile.write(
                        key + ' : ' + str(freq) + '. ' + str(sents) + '\n'
                    )  # word: freq. [sent example 1, sent example 2...]

    print(f"""
    ============================================================
    Úttaksskjalið {output_file} er tilbúið og má finna í 
    undirmöppunni uttak/notendagogn/
    ============================================================
    """)
Exemplo n.º 10
0
def lemmas_collocations(database, IGC_folder, prop_names):
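    """
    Iterates through the IGC and retrieves the lemmas that do not
    appear in the chosen database (NMO or BIN). Collects five-word
    collocations for every candidate lemma, two words before and
    two after it, and writes the most frequent examples to a file.
    """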
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db'
                          )  # Predefined stop-word list based on the RMH
    pos_to_ignore = [
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    ]  # The POS tags that should not be displayed in the results
    outdict = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni. 
    ============================================================
    """)
    xml_files = glob.glob(IGC_folder + '/**/*.xml', recursive=True)

    filebar = IncrementalBar('Framvinda', max=len(xml_files))
    for file in xml_files:
        colloc = []
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                for word in tree.iter():
                    if word.text is not None:
                        if word.attrib.get('lemma') is not None:
                            pos = word.attrib.get('type')
                            lemma = word.attrib.get('lemma')
                            word_form = word.text
                            colloc.append((word_form, lemma, pos))
                        elif word.text in punctuation:
                            colloc.append((word.text, ' ', ' '))

                for i, w in enumerate(colloc):
                    if prop_names == False:
                        if w[2].startswith('n') and w[2].endswith(
                                's'):  # Ignore proper names
                            continue
                    if w[2] in pos_to_ignore:
                        continue
                    if w[1][-1] == '-':  # if the lemma starts or ends with a hyphen, ignore it (likely an OCR error)
                        continue
                    if w[1][0] == '-':
                        continue
                    # ignore lemmas containing anything other than letters or a hyphen
                    if not all(ch.isalpha() or ch == '-' for ch in w[1]):
                        continue
                    filter_query = SQLiteQuery(w[1],
                                               'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    else:
                        if database == 'NMO':
                            query = SQLiteQuery(w[1],
                                                'lemma',
                                                'DCI_ELEMENT',
                                                cursor=dci.cursor
                                                )  # Capitalized words included
                            query_lower = SQLiteQuery(w[1].lower(),
                                                      'lemma',
                                                      'DCI_ELEMENT',
                                                      cursor=dci.cursor)
                        elif database == 'BIN':
                            query = SQLiteQuery(w[1],
                                                'lemma',
                                                'DIM_ELEMENT',
                                                cursor=dim.cursor
                                                )  # Capitalized words included
                            query_lower = SQLiteQuery(w[1].lower(),
                                                      'lemma',
                                                      'DIM_ELEMENT',
                                                      cursor=dim.cursor)
                        if not query.exists and not query_lower.exists:  # if the lemma is found neither in the database nor in the filters
                            if len(w[1]) > 1:
                                if i - 2 < 0:  # collects 2 words before and after the candidate
                                    w1 = ""
                                else:
                                    w1 = str(colloc[i - 2][0])
                                if i - 1 < 0:
                                    w2 = ""
                                else:
                                    w2 = str(colloc[i - 1][0])
                                if i + 1 > len(colloc) - 1:
                                    w4 = ""
                                else:
                                    w4 = str(colloc[i + 1][0])
                                if i + 2 > len(colloc) - 1:
                                    w5 = ""
                                else:
                                    w5 = str(colloc[i + 2][0])
                                if w[1] in outdict:
                                    if str(w1 + ' ' + w2 + ' ' + w[0] + ' ' +
                                           w4 + ' ' + w5) not in outdict[
                                               w[1]]['orðstaða']:
                                        outdict[w[1]]['orðstaða'][
                                            str(w1 + ' ' + w2 + ' ' + w[0] +
                                                ' ' + w4 + ' ' + w5)] = 1
                                    else:
                                        outdict[w[1]]['orðstaða'][
                                            str(w1 + ' ' + w2 + ' ' + w[0] +
                                                ' ' + w4 + ' ' + w5)] += 1
                                    outdict[w[1]]['tíðni'] += 1
                                else:
                                    outdict[w[1]] = {}
                                    outdict[w[1]]['tíðni'] = 1
                                    outdict[w[1]]['orðstaða'] = {
                                        str(w1 + ' ' + w2 + ' ' + w[0] + ' ' + w4 + ' ' + w5):
                                        1
                                    }
            except sqlite3.OperationalError:
                pass
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    if IGC_folder == "malheildir/RMH/":
        with open(f'uttak/{database}/RMH_lemmur_med_orstodulyklum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(outdict.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }  # Sort the candidates by their total frequencies
            for key, item in candidates.items():
                for counter, dictitem in enumerate(item.items()):
                    if counter % 2 == 0:
                        freq = dictitem[1]
                    elif counter % 2 != 0:
                        # Sort the sentence examples by their frequencies
                        sorted_sents = {
                            k: v
                            for k, v in sorted(dictitem[1].items(),
                                               key=lambda item: item[1],
                                               reverse=True)
                        }
                        # Keep only the five most frequent examples; the cutoff can be changed
                        sents = list(sorted_sents)[:5]
                        outputfile.write(
                            key + ' : ' + str(freq) + '. ' + str(sents) + '\n'
                        )  # word: freq. [sent example 1, sent example 2...]

        print(f"""
    ============================================================
    Úttaksskjalið RMH_lemmur_med_orstodulyklum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)

    elif IGC_folder == "malheildir/RMH/CC_BY/":
        with open(f'uttak/{database}/CC_BY_lemmur_med_orstodulyklum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(outdict.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }  # Sort the candidates by their total frequencies
            for key, item in candidates.items():
                for counter, dictitem in enumerate(item.items()):
                    if counter % 2 == 0:
                        freq = dictitem[1]
                    elif counter % 2 != 0:
                        # Sort the sentence examples by their frequencies
                        sorted_sents = {
                            k: v
                            for k, v in sorted(dictitem[1].items(),
                                               key=lambda item: item[1],
                                               reverse=True)
                        }
                        # Keep only the five most frequent examples; the cutoff can be changed
                        sents = list(sorted_sents)[:5]
                        outputfile.write(
                            key + ' : ' + str(freq) + '. ' + str(sents) + '\n'
                        )  # word: freq. [sent example 1, sent example 2...]

        print(f"""
    ============================================================
    Úttaksskjalið CC_BY_lemmur_med_orstodulyklum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
    elif IGC_folder == "malheildir/RMH/MIM/":
        with open(f'uttak/{database}/MIM_lemmur_med_orstodulyklum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(outdict.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }  # Sort the candidates by their total frequencies
            for key, item in candidates.items():
                for counter, dictitem in enumerate(item.items()):
                    if counter % 2 == 0:
                        freq = dictitem[1]
                    elif counter % 2 != 0:
                        # Sort the sentence examples by their frequencies
                        sorted_sents = {
                            k: v
                            for k, v in sorted(dictitem[1].items(),
                                               key=lambda item: item[1],
                                               reverse=True)
                        }
                        # Keep only the five most frequent examples; the cutoff can be changed
                        sents = list(sorted_sents)[:5]
                        outputfile.write(
                            key + ' : ' + str(freq) + '. ' + str(sents) + '\n'
                        )  # word: freq. [sent example 1, sent example 2...]

        print(f"""
    ============================================================
    Úttaksskjalið MIM_lemmur_med_orstodulyklum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)

    else:
        namefolder = IGC_folder.split("/")[3]
        with open(f'uttak/{database}/' + namefolder +
                  '_lemmur_med_orstodulyklum.freq',
                  mode='w+') as outputfile:
            candidates = {
                k: v
                for k, v in sorted(outdict.items(),
                                   key=lambda item: item[1]['tíðni'],
                                   reverse=True)
            }  # Sort the candidates by their total frequencies
            for key, item in candidates.items():
                for counter, dictitem in enumerate(item.items()):
                    if counter % 2 == 0:
                        freq = dictitem[1]
                    elif counter % 2 != 0:
                        # Sort the sentence examples by their frequencies
                        sorted_sents = {
                            k: v
                            for k, v in sorted(dictitem[1].items(),
                                               key=lambda item: item[1],
                                               reverse=True)
                        }
                        # Keep only the five most frequent examples; the cutoff can be changed
                        sents = list(sorted_sents)[:5]
                        outputfile.write(
                            key + ' : ' + str(freq) + '. ' + str(sents) + '\n'
                        )  # word: freq. [sent example 1, sent example 2...]

        print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_lemmur_med_orstodulyklum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
Exemplo n.º 11
0
def user_defined_freqlist(database, filterbase, corpus):
    """
    Iterates through a user-defined corpus and compares
    the results to a user-defined database, filtering out
    stopwords if the user has defined a stopword database.
    Returns a frequency word list.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    if filterbase not in ['n', 'N']:
        filters = SQLDatabase(db_name=filterbase)
    else:
        pass  # if there is no filterbase, ignore this step

    outdict = {}
    
    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    filebar = IncrementalBar('Framvinda', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            f = content.read()
            words = f.split()
            for w in words:
                if w[-1] == '-':  # if a word starts or ends with a hyphen, ignore it (likely an OCR error)
                    continue
                if w[0] == '-':
                    continue
                if not all(i.isalpha() or i == '-' for i in w):  # if a word contains anything other than letters or a hyphen, ignore it
                    continue
                if filterbase not in ['n', 'N']:  # if a stop-word database has been defined, filter the results
                    filter_query = SQLiteQuery(w, 'filter', 'FILTER_WORD_FORMS', cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    else:
                        query = SQLiteQuery(w, 'word', 'LEXICON_WORD', cursor=db.cursor)  # parameters must be updated if the database format is changed
                        query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD', cursor=db.cursor)
                        if not query.exists and not query_lower.exists: 
                            if len(w) > 1:
                                if w in outdict:
                                    outdict[w] += 1
                                else:
                                    outdict[w] = 1
                else:
                    query = SQLiteQuery(w, 'word', 'LEXICON_WORD', cursor=db.cursor)
                    query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD', cursor=db.cursor)
                    if not query.exists and not query_lower.exists: 
                        if len(w) > 1:
                            if w in outdict:
                                outdict[w] += 1
                            else:
                                outdict[w] = 1
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    output_file = input("""
    ============================================================
    Skrifaðu það sem þú vilt að úttaksskjalið heiti með 
    endingunni .freq
 
    Dæmi: ordasafn.freq
    ============================================================
    """)

    with open('uttak/notendagogn/' + output_file, mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(outdict.items(),
                               key=lambda item: item[1],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')

    print(f"""
    ============================================================
    Úttaksskjalið {output_file} er tilbúið og má finna í 
    undirmöppunni uttak/notendagogn/
    ============================================================
    """)