    def morph_counts_old_version(self, words):
        #Word List to list of all morphisms
        print("len words: ")
        print(len(words))
        print("len unique words: ")
        print(len(set(words)))
        frog = Frog(
            FrogOptions(tok=True,
                        lemma=True,
                        morph=True,
                        daringmorph=False,
                        mwu=False,
                        chunking=False,
                        ner=False,
                        parser=False))
        morphisms = []
        print_counter = 1
        t0 = time.time()
        for word in words:
            output = frog.process(word)
            morphisms_word = output[0].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            # Currently whole (single-morpheme) words are NOT included in the count
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list
            total_length = len(words)
            print(str(print_counter) + " of " + str(total_length))
            print_counter += 1
        print("Frog Processing Time:")
        print(self.format_time(time.time() - t0))

        morphisms = list(filter(None, morphisms))
        morph_counts = Counter(morphisms)
        return morph_counts
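The timing printouts above rely on a self.format_time helper that is not part of this snippet; a minimal sketch of such a helper (hypothetical, simply rendering a duration in seconds as H:MM:SS) could be:

    def format_time(self, seconds):
        # Hypothetical helper assumed by the snippets in this section:
        # format a duration in seconds as H:MM:SS for the timing printouts.
        hours, remainder = divmod(int(seconds), 3600)
        minutes, secs = divmod(remainder, 60)
        return "{:d}:{:02d}:{:02d}".format(hours, minutes, secs)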
Example no. 2
    def morph_counts_new_version(self, words):
        #Word List to list of all morphisms
        frog = Frog(
            FrogOptions(tok=True,
                        lemma=True,
                        morph=True,
                        daringmorph=False,
                        mwu=False,
                        chunking=False,
                        ner=False,
                        parser=False))
        words_string = ' '.join(words)
        morphisms = []
        print_counter = 1
        t0 = time.time()
        print("Starting Frog Processing..")
        output = frog.process(words_string)
        print("Process time:")
        process_time = self.format_time(time.time() - t0)
        print(process_time)
        t1 = time.time()
        for i in range(len(output)):
            morphisms_word = output[i].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            # Currently whole (single-morpheme) words are NOT included in the count
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list
            total_length = len(words)
            print(str(print_counter) + " of " + str(total_length))
            print_counter += 1
        print("Process Time:")
        print(process_time)
        print("Getting Morphisms Time:")
        print(self.format_time(time.time() - t1))
        print("Total Time:")
        print(self.format_time(time.time() - t0))



        morphisms = list(filter(None, morphisms))
        morph_counts = Counter(morphisms)
        return morph_counts
Example no. 3
def prep_nl(df, filename):
    import re

    from frog import Frog, FrogOptions

    print("Tokenizing, POS tagging, and lemmatizing the Dutch data...")

    # Create 'frog' instance. Turn off various options to save time.
    frog = Frog(
        FrogOptions(parser=False, morph=False, chunking=False, ner=False))

    # Define set of possible answers
    if not "STAT_C" in str(filename):
        answers = ['Answer']
    elif "STAT_C" in str(filename):
        answers = ['Answer4a', 'Answer2aDec', 'Answer2aCaus']

    # Loop through answers
    for question_type in answers:

        for index in df.index:
            ans = df.loc[index, question_type]

            # Logging
            if index % 20 == 0:
                print(index, "/", df.index[-1], question_type[6:])

            # Remove numbers
            ans = re.sub("\d+", "", ans)

            # Remove tags in spelling-corrected data
            ans = ans.replace("_abbreviation", "")

            # Remove non-Dutch and illegible words
            ans = re.sub("\w+_nonexistent", "", ans)
            ans = re.sub("\w+_nonexisting", "", ans)
            ans = re.sub("\w+_english", "", ans)
            ans = re.sub("\w+_german", "", ans)
            ans = re.sub("\?+_illegible", "", ans)

            # Preprocess the data with Frog
            ans_dict = frog.process(ans)

            tok_answer = []
            lem_answer = []
            pos_tags = []

            # Append outcomes to list
            for word_index in range(len(ans_dict)):
                if ans_dict[word_index]['pos'] != "LET()":  # Exclude punctuation
                    tok_answer.append(ans_dict[word_index]['text'].lower())
                    lem_answer.append(ans_dict[word_index]['lemma'])
                    pos_tags.append(ans_dict[word_index]['pos'])

            # Fill in the dataframe
            df.at[index, 'Tokenized{}'.format(question_type[6:])] = tok_answer
            df.at[index, 'Lemmatized{}'.format(question_type[6:])] = lem_answer
            df.at[index, 'POS{}'.format(question_type[6:])] = pos_tags

    return df
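A minimal usage sketch for prep_nl (the file name and answer text are made up; the Tokenized/Lemmatized/POS target columns are pre-created with object dtype because lists are assigned into them via df.at):

import pandas as pd

# Hypothetical example data; 'Answer' is the column prep_nl expects for
# files whose name does not contain "STAT_C".
df = pd.DataFrame({"Answer": ["De kat zat op de mat.", "Dit is een tweede antwoord."]})
for col in ("Tokenized", "Lemmatized", "POS"):
    df[col] = pd.Series([[] for _ in range(len(df))], dtype=object)
df = prep_nl(df, "exam_answers.csv")
print(df.loc[0, "Lemmatized"])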
Example no. 4
    def morph_counts_faster_version(self, words):
        #Word List to list of all morphisms

        frog = Frog(
            FrogOptions(tok=True,
                        lemma=True,
                        morph=True,
                        daringmorph=False,
                        mwu=False,
                        chunking=False,
                        ner=False,
                        parser=False))
        batch_size = 400
        morphisms = []
        print_batch_number = 1
        start_time = time.time()
        total_batch_number = math.ceil(len(words)/batch_size)
        total_process_time = 0
        total_getting_morphisms_time = 0
        for i in range(0, len(words), batch_size):
            t0 = time.time()
            #print_counter = 1
            words_batch = words[i:i + batch_size]
            words_batch_string = ' '.join(words_batch)
            #print("Starting Frog Processing.. for batch = " + str(print_batch_number))
            output = frog.process(words_batch_string)
            #print("Process time:")
            process_time = time.time() - t0
            #print(self.format_time(process_time))
            #print(process_time)
            t1 = time.time()
            for j in range(len(output)):
                morphisms_word = output[j].get("morph")
                morphisms_word_list = morphisms_word.replace('[', '').split(']')
                # Currently whole (single-morpheme) words are NOT included in the count
                if len(morphisms_word_list) > 2:
                    morphisms += morphisms_word_list
                total_batch_length = len(words_batch)
                #print(str(print_counter) + " of " + str(total_batch_length) + " -- of batch -- " + str(print_batch_number) + " of " + str(total_batch_number) )
                #print("batch" + " (batch_size: " + str(batch_size) + " words):    " +  str(print_batch_number) + " of " + str(total_batch_number))
                #print_counter += 1
            print_batch_number += 1
            getting_morphisms_time = time.time() - t1
            total_process_time += process_time
            total_getting_morphisms_time += getting_morphisms_time

        print("Total number of words: ")
        print(len(words))
        print("")
        print("Unique number words: ")
        print(len(set(words)))
        print("")
        print("Total Process Time:")
        print(self.format_time(total_process_time))
        print("")
        print("Total Getting Morphisms Time: ")
        print(self.format_time(total_getting_morphisms_time))
        print("")
        print("Total Time:")
        print(self.format_time(time.time() - start_time))
        print("")

        morphisms = list(filter(None, morphisms))
        morph_counts = Counter(morphisms)
        return morph_counts
Example no. 5
def change_text_to_morphs(sentences,
                          frog_merge=False,
                          save=False,
                          filename=None):
    # sentence list to sentence list in frog morphism form
    morphSentences = []

    frog = Frog(
        FrogOptions(tok=True,
                    lemma=True,
                    morph=True,
                    daringmorph=False,
                    mwu=False,
                    chunking=False,
                    ner=False,
                    parser=False))

    for sentenceNumber in range(0, len(sentences)):
        print(sentenceNumber)
        print("of")
        print(len(sentences))
        sentenceToBeProcessed = sentences[sentenceNumber]
        sentenceToBeProcessed = sentenceToBeProcessed.replace("\n", " ")
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)
        for i in range(0, len(output)):
            morphisms_word = output[i].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list,
                                                  "insertmergetoken")
                #print(morphisms_word_list)
            #print("EVET")
            #print(morphisms_word_list)
            morphSentence += morphisms_word_list
        #print("MORPHSENTENCE")
        #print(morphSentence)
        # Remove the empty strings
        morphSentence = list(filter(None, morphSentence))
        #print("ok")
        #print(morphSentence)
        morphSentence = ' '.join(morphSentence)
        #print("HERE")
        #print(morphSentence)
        morphSentences.append(morphSentence)

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)
    return morphSentences
def change_text_to_morphs(sentences,
                          frog_merge=False,
                          save=False,
                          filename=None):
    # sentence list to sentence list in frog morphism form
    morphSentences = []

    frog = Frog(
        FrogOptions(tok=True,
                    lemma=True,
                    morph=True,
                    daringmorph=False,
                    mwu=False,
                    chunking=False,
                    ner=False,
                    parser=False))
    j = 0
    for sentenceToBeProcessed in sentences:

        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))

        j += 1
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)

        for i in range(0, len(output)):
            morphisms_word = output[i].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list,
                                                  "__add_merge__")

            morphSentence += morphisms_word_list

        # Remove the empty strings
        morphSentence = list(filter(None, morphSentence))

        morphSentence = ' '.join(morphSentence)
        morphSentences.append(morphSentence)

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)
    return morphSentences
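Both versions of change_text_to_morphs call an intersperse helper that is not shown in these snippets; assuming it simply puts a separator token between consecutive list elements, a minimal sketch would be:

def intersperse(items, separator):
    # Hypothetical helper: ["ver", "eenvoudig", "d"] with separator "x"
    # becomes ["ver", "x", "eenvoudig", "x", "d"].
    result = []
    for i, item in enumerate(items):
        if i > 0:
            result.append(separator)
        result.append(item)
    return result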
    def morph_counts_fastest_version(self, words):
        # Word List to list of all morphisms

        word_counts = Counter(
            word for word in toolz.concat(map(self.word_tokenizer, words)))

        print("words_counts: ")
        print(word_counts)
        print("")
        print("Unique number words: " + str(len(set(words))))
        print("Total number of words: " + str(len(words)))
        print("")
        unique_words_set = set(words)
        unique_words = list(unique_words_set)

        frog = Frog(
            FrogOptions(tok=True,
                        lemma=True,
                        morph=True,
                        daringmorph=False,
                        mwu=False,
                        chunking=False,
                        ner=False,
                        parser=False))
        batch_size = 400
        morphisms = []
        print_batch_number = 1
        start_time = time.time()
        total_batch_number = math.ceil(len(unique_words) / batch_size)
        total_process_time = 0
        total_getting_morphisms_time = 0

        for i in range(0, len(unique_words), batch_size):
            t0 = time.time()
            words_batch = unique_words[i:i + batch_size]
            words_batch_string = ' '.join(words_batch)
            output = frog.process(words_batch_string)
            process_time = time.time() - t0
            t1 = time.time()

            for j in range(len(output)):
                current_word = output[j].get("text")
                morphisms_word = output[j].get("morph")
                morphisms_word_list = morphisms_word.replace('[',
                                                             '').split(']')
                current_word_count = word_counts[current_word]

                # Currently whole (single-morpheme) words are NOT included in the count
                if len(morphisms_word_list) > 2:
                    morphisms += morphisms_word_list * current_word_count

                total_batch_length = len(words_batch)
            print("batch" + " (batch_size: " + str(batch_size) +
                  " words):    " + str(print_batch_number) + " of " +
                  str(total_batch_number))

            print_batch_number += 1
            getting_morphisms_time = time.time() - t1
            total_process_time += process_time
            total_getting_morphisms_time += getting_morphisms_time

        print("Total number of words: ")
        print(len(words))
        print("")
        print("Unique number words: ")
        print(len(set(words)))
        print("")
        print("Total Process Time:")
        print(self.format_time(total_process_time))
        print("")
        print("Total Getting Morphisms Time: ")
        print(self.format_time(total_getting_morphisms_time))
        print("")
        print("Total Time:")
        print(self.format_time(time.time() - start_time))
        print("")

        # Remove the empty strings
        morphisms = list(filter(None, morphisms))
        #Make a counter of all morphisms
        morph_counts = Counter(morphisms)
        return morph_counts
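morph_counts_fastest_version additionally assumes a self.word_tokenizer callable (used with toolz.concat to build the word counts); a minimal sketch, assuming simple regex-based tokenisation, might be:

    def word_tokenizer(self, text):
        # Hypothetical helper assumed above: split a string into word tokens
        # using a simple regex; punctuation-only tokens are dropped.
        import re
        return re.findall(r"\w+", text)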
Example no. 8
 def __init__(self, **kwargs):
     self._frog = Frog(FrogOptions(parser=False, mwu=False, tok=False, xmlIn=True, **kwargs))
Example no. 9
def main():
    global have_frog

    greekHDfile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "list_proiel_word_lemma_POS_freq")
    ghd_words = {}
    nofreqfile = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "list_proiel_perseus_merged_word_lemma_POS_nofreq")
    filenames = []  #list of globbed files
    filename = None  # test file
    extrafile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "extra-wlt.txt")
    frog_words = {}
    lookup_w = None  #specific word to look up
    lookup_l = None  #specific lemma to look up
    verbose = False
    wltmode = False  #if true, assume test file is columns; only first token is used
    frog_cfg = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "pretrained_models/herodotus/frog.cfg.template")
    remove_root = True  # default is to remove ROOT from brat files, -R to disable
    suffix = ".lastrun"
    stats = False

    callstr = " ".join(sys.argv)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:f:l:L:s:vw:DE:FM:RWS", [])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)
    for o, a in opts:
        if o in ("-f"):
            filenames = sorted(glob.glob(a))
        elif o in ("-c"):  #alternative frog config
            frog_cfg = a
        elif o in ("-l"):  #lookup a specific lemma, print to screen
            lookup_l = a
        elif o in ("-L"):  #choose another lexicon file
            greekHDfile = a
        elif o in ("-M"):  #choose another merged (wlt) file
            nofreqfile = a
        elif o in ("-E"):  #choose another extra-wlt (wlt) file
            extrafile = a
        elif o in ("-s"):
            suffix = "." + a
        elif o in ("-v"):
            verbose = True
        elif o in ("-w"):  #lookup a specific word, print to screen
            lookup_w = a
        elif o in ("-D"):
            debug = True
        elif o in ("-F"):  # disables Frog, use also when Frog not available
            have_frog = False  #force ignore frog
            frog_cfg = None
        elif o in ("-R"):
            remove_root = not remove_root
        elif o in ("-W"):
            wltmode = True
        elif o in ("-S"):
            stats = True
        else:
            assert False, "unhandled option"

    logfile = "glem" + suffix + ".log"
    lgf = open(logfile, "w")  #or append?
    print(callstr, file=lgf, flush=True)

    # Sanity checks, aborts if specified lexicon files not found.
    files_found = True
    for f in [greekHDfile, filename, nofreqfile, extrafile, frog_cfg]:
        if f and not os.path.exists(f):
            print("ERROR: FILE NOT FOUND:", f, file=lgf, flush=True)
            print("ERROR: FILE NOT FOUND:", f, file=sys.stderr)
            files_found = False
    if not files_found:
        print("ABORT: Necessary files not found", file=sys.stderr)
        print("ABORT: Necessary files nt found", file=lgf, flush=True)
        lgf.close()
        sys.exit(1)

    # Initialise Frog.
    if have_frog:
        print("INITIALISE FROG", file=sys.stderr)
        frog = Frog(
            FrogOptions(parser=True,
                        tok=False,
                        morph=False,
                        mwu=False,
                        chunking=False,
                        ner=False), frog_cfg)

    # Statistics on lexicon files.
    line_count = 0
    new_entries = 0
    zero_freq = 0
    doubles = 0
    conflicts = 0

    print("READING", greekHDfile, file=sys.stderr)
    print("READING", greekHDfile, file=lgf, flush=True)
    with open(greekHDfile, 'r') as f:
        '''
        WORD            LEMMA       TAG             COUNT
        ἀλλήλοις            ἀλλήλων Pc-p---md--i    5
        ἀλλήλοις            ἀλλήλων Pc-p---nd--i    2
        ἀλλήλοισι           ἀλλήλων Pc-p---md--i    9
        '''
        for l in f:
            l = l.strip()
            if len(l) > 0 and l[0] == "#":
                print("SKIP COMMENT", l, file=lgf, flush=True)
                continue
            bits = l.split()
            if len(bits) != 4:
                print("SKIP NOT 4 FIELDS", l, file=lgf, flush=True)
                continue
            line_count += 1
            word = normalize('NFC', bits[0])
            lemma = normalize('NFC', bits[1])
            tag = bits[2]
            try:
                freq = int(bits[3])
            except ValueError:
                print("SKIP FREQUENCY ERROR", l, file=lgf, flush=True)
                continue
            if freq == 0:
                #print( "HAS 0 FREQUENCY", l, file=lgf, flush=True )
                zero_freq += 1
            DBG(word, lemma, tag, freq)
            # Store it.
            if word in ghd_words.keys():
                word_entry = ghd_words[word]
                new_lemma = Lemma(word, lemma, tag, freq)
                new_lemma.src = "greek_Haudag"  #proiel
                # Note we assume unique word-tag combinations.
                if tag in word_entry.lemmas:
                    # WHAT
                    # τοσόνδε, τοσόσδε, Pd-s---na-,     5
                    # τοσόνδε τοσόσδε Pd-s---na- 0
                    # Normally, if the second one has a lower count, it is ignored.
                    if True or freq > word_entry.lemmas[tag].freq:
                        if lemma != word_entry.lemmas[tag].lemma:
                            print("CONFLICTING DOUBLE ENTRY",
                                  file=lgf,
                                  flush=True)
                            conflicts += 1
                        else:
                            print("DOUBLE ENTRY", file=lgf, flush=True)
                        print("STORED",
                              word_entry.lemmas[tag],
                              file=lgf,
                              flush=True)
                        print("   NEW", new_lemma, file=lgf, flush=True)
                        doubles += 1
                word_entry.lemmas[tag] = new_lemma
                DBG("append entry", word)
            else:
                word_entry = Word(word)
                new_lemma = Lemma(word, lemma, tag, freq)
                new_lemma.src = "greek_Haudag"  #"proiel"
                word_entry.lemmas[tag] = new_lemma
                ghd_words[word] = word_entry
                new_entries += 1
                DBG("new entry", word)
    print("Added", new_entries, "new entries.", file=lgf, flush=True)
    print("Counted",
          zero_freq,
          "entries with frequency 0.",
          file=lgf,
          flush=True)
    print("Ignored",
          doubles,
          "double entries, of which",
          conflicts,
          "conflicts.",
          file=lgf,
          flush=True)

    new_entries = 0
    if nofreqfile:
        print("READING", nofreqfile, file=sys.stderr)
        print("READING", nofreqfile, file=lgf, flush=True)
        with open(nofreqfile, 'r') as f:
            for l in f:
                l = l.strip()
                if len(l) > 0 and l[0] == "#":
                    print("SKIP", l, file=lgf, flush=True)
                    continue
                bits = l.split()
                if len(bits) != 3:
                    print("SKIP", l, file=lgf, flush=True)
                    continue
                line_count += 1
                word = normalize('NFC', bits[0])
                lemma = normalize('NFC', bits[1])
                tag = bits[2]
                freq = 0  #unknown
                DBG(word, lemma, tag)
                if word in ghd_words.keys():
                    word_entry = ghd_words[word]
                    if tag in word_entry.lemmas:  # if already present, do nothing, because
                        # we have it from first list
                        DBG("TAG ALREADY PRESENT", word, lemma, tag)
                    else:
                        new_lemma = Lemma(word, lemma, tag, freq)
                        new_lemma.src = "merged"  #"nofreq"
                        word_entry.lemmas[tag] = new_lemma
                        DBG("append entry", word)
                    DBG("skip existing entry", word)
                else:
                    word_entry = Word(word)
                    new_lemma = Lemma(word, lemma, tag, freq)
                    new_lemma.src = "merged"  #"nofreq"
                    word_entry.lemmas[tag] = new_lemma
                    ghd_words[word] = word_entry
                    new_entries += 1
                    DBG("new entry", word)
    print("Added", new_entries, "new entries.", file=lgf, flush=True)
    new_entries = 0

    # At the moment we have punctuation here.
    # format is word-lemma-tag
    #
    if extrafile:
        print("READING", extrafile, file=sys.stderr)
        print("READING", extrafile, file=lgf, flush=True)
        with open(extrafile, 'r') as f:
            for l in f:
                l = l.strip()
                if len(l) > 0 and l[0] == "#":
                    print("SKIP COMMENT", l, file=lgf, flush=True)
                    continue
                bits = l.split()
                if len(bits) != 3:
                    print("SKIP NOT 3 FIELDS", l, file=lgf, flush=True)
                    continue
                line_count += 1
                word = normalize('NFC', bits[0])
                lemma = normalize('NFC', bits[1])
                tag = bits[2]
                if word in ghd_words.keys():
                    word_entry = ghd_words[word]
                    if tag in word_entry.lemmas:  #indexed by tag
                        word_entry.lemmas[tag].freq += 1
                    else:
                        new_lemma = Lemma(word, lemma, tag, 1)
                        new_lemma.src = "extra"
                        word_entry.lemmas[tag] = new_lemma
                else:
                    word_entry = Word(word)
                    new_lemma = Lemma(word, lemma, tag, 1)
                    new_lemma.src = "extra"
                    word_entry.lemmas[tag] = new_lemma
                    ghd_words[word] = word_entry
                    new_entries += 1
    print("Added", new_entries, "new entries.\n", file=lgf, flush=True)
    new_entries = 0

    # Print top-5 most frequent words, with top-5 lemmas
    if verbose:
        sorted_words = sorted(ghd_words,
                              key=lambda k: len(ghd_words[k].lemmas),
                              reverse=True)
        for x in sorted_words[0:5]:
            print(ghd_words[x], file=sys.stderr)
            # print top-5 frequent lemmas
            for l in sorted(sorted(ghd_words[x].lemmas.values(),
                                   key=attrgetter('tag'),
                                   reverse=False),
                            key=attrgetter('freq'),
                            reverse=True)[0:5]:
                print(" ", l, file=sys.stderr)

    # Count lemmatisation stats
    lemmatiser_stats = Counter()

    # Possible lemmatiser "strategies"
    strategies = {
        "MLDTHF":
        "multi lemmas, no pos tag match, highest frequency",  #DT=different tag
        "MLNTHF": "multi lemmas, no tag, highest frequency",
        "MLSTHF": "multi lemmas, pos tag match, and highest frequency",
        "MLNTHF": "multi lemmas, no tag, highest frequency",
        "MLSTOF": "multi lemmas, pos tag match, but other frequency",
        "MLNTOF": "multi lemmas, no tag, other frequency",
        "OLDT": "one lemma, but different pos tag",
        "OLST": "one lemma, same pos tag",
        "OLNT": "one lemma, no tag",
        "FROG": "Frog lemma",
        "UNKNOWN": "unknown"
    }

    # Prefill Counters
    lemmatiser_stats["unknown"] = 0
    for s in strategies:
        lemmatiser_stats[strategies[s]] = 0
    '''
    Lemmatiser strategy:

    Check if word in dictionary.

    If it is:
      1) If it has only one tag/lemma entry, return it.
         ("one lemma, same pos tag" / "one lemma, different pos tag")
      2) More than one tag/lemma entry: go through the tag/lemmas:
         a) if a lemma with a similar pos tag is found, return it.
            ("multiple lemmas, same pos tag, highest frequency" / "multi lemmas, same pos tag, other frequency")
         b) otherwise, return the most frequent tag/lemma.
            ("multi lemmas, different pos tag, highest frequency")
         *) sorting was non-deterministic if same count?

    If it is not:
      1) Take Frog entry, and return it.
         ("Frog" / "Frog list")
      2) If this fails:
      return None.
      ("unknown")
    '''

    # ---------------------------------
    # Process testfile(s)
    # ---------------------------------

    # Look up a single word from the lexicon, this is mostly for debugging
    # and/or introspective purposes.
    if lookup_w:
        print("\nLOOKUP WORD", lookup_w)
        if lookup_w in ghd_words:
            print("  ", ghd_words[lookup_w])
            for l in sorted(ghd_words[lookup_w].lemmas.values(),
                            key=attrgetter('freq'),
                            reverse=True):
                print("    ", l)
        print("\nLEMMATISER", lemmatise(lookup_w, "", ghd_words, verbose))

    # Look up a single lemma in all words
    if lookup_l:
        print("\nLOOKUP LEMMA", lookup_l)
        for x in ghd_words:
            output = []
            for l in sorted(ghd_words[x].lemmas.values(),
                            key=attrgetter('freq'),
                            reverse=True):
                if l.lemma == lookup_l:
                    output.append(l)
            if output:
                print(x)
                for o in output:
                    print("  ", o)

    # Test file format:
    # Lines of Greek text
    #
    if not filenames:
        print("\nNOTHING TO DO...", file=sys.stderr)
        lgf.close()
        sys.exit(0)

    for filename in filenames:
        # Check for my own output, a bit crude but prevents the worse mistakes.
        if filename.endswith(".stats.txt") or filename.endswith(".wlt.txt"):
            continue

        print("\nLEMMATISING", filename, file=sys.stderr)
        print("LEMMATISING", filename, file=lgf, flush=True)

        # Reset Counters
        lemmatiser_stats["unknown"] = 0
        for s in strategies:
            lemmatiser_stats[strategies[s]] = 0

        # Output is put into these (two) files.
        outprefix = filename
        if stats:
            outfile = outprefix + suffix + ".stats.txt"
        else:
            outfile = outprefix + suffix + ".wlt.txt"

        # Process test file.
        lcount = 0
        hcount = 0  #count hash lemmas "foo#1"
        wcount = 0  #words processed
        if filename:
            with open(filename, 'r') as f:
                with open(outfile, 'w') as of:
                    for l in f:
                        l = l.strip()
                        if not l:
                            continue
                        words = l.split()
                        # we need a "wlt" mode for hdt text. and check results
                        if wltmode:
                            words = [words[0]]
                        if verbose:
                            print("words", words)
                        if remove_root and words and words[0] == "ROOT":
                            words.pop(0)
                        words = [normalize('NFC', w) for w in words]
                        if have_frog:
                            frog_out = query_frog_sentence(
                                frog, " ".join(words), verbose)
                        for word in words:
                            if verbose:
                                print("\n", word, lcount, wcount)
                            # first frog for POS, then lemmatiser
                            if have_frog:
                                try:
                                    frog_word = frog_out.pop(0)
                                except IndexError:
                                    print("ABORT. FROG OUTPUT EMPTY")
                                    sys.exit(1)
                                if verbose:
                                    print(frog_word)
                                frog_w = normalize('NFC', frog_word["text"])
                                frog_l = normalize('NFC', frog_word["lemma"])
                                frog_t = frog_word["pos"]
                                if verbose:
                                    print("frog(" + str(word) + "):", frog_w,
                                          frog_l, frog_t)
                            else:
                                frog_t = None
                            # try our lemmatiser, with Frog pos tag
                            the_lemma, ltype = lemmatise(
                                word, frog_t, ghd_words, verbose)
                            if verbose:
                                print("lemmatiser:", word, frog_t, the_lemma,
                                      ltype)
                            # we possibly get (NONE, "UNKNOWN")
                            if not the_lemma:
                                #Use frog output for lemma as well
                                if have_frog and frog_w:
                                    the_lemma = Lemma(word, frog_l, frog_t, 0)
                                    the_lemma.src = "frog"
                                    ltype = "FROG"
                                else:
                                    the_lemma = None
                                    ltype = "UNKNOWN"
                            ltype = strategies[ltype]
                            lemmatiser_stats[ltype] += 1
                            if the_lemma:
                                # Note that the POS tag here is the one from the lexica,
                                # and not the one supplied by Frog.
                                if verbose:
                                    print("lemma =", the_lemma)
                                    print(ltype)
                                #
                                if stats:
                                    of.write(word + "\t" + the_lemma.lemma +
                                             "\t" + the_lemma.tag + "\t" +
                                             repr(the_lemma) + "\t" + ltype +
                                             "\n")
                                else:
                                    of.write(word + "\t" + the_lemma.lemma +
                                             "\t" + the_lemma.tag + "\n")
                            else:  #not the_lemma
                                if stats:
                                    of.write(word +
                                             "\tUNKNOWN\tUNKNOWN\tNONE\t" +
                                             ltype + "\n")
                                else:
                                    of.write(word + "\tNONE\tNONE\n")
                            wcount += 1
                        lcount += 1

        with open(outfile, 'a') as of:
            print("#", callstr, "[" + VERSION + "]", file=of, flush=True)
            print("#\n# line count",
                  lcount,
                  "word count",
                  wcount,
                  file=of,
                  flush=True)

            for stat, count in sorted(lemmatiser_stats.items()):
                #for stat, count in lemmatiser_stats.most_common():
                print("# {0:<60} {1:5n}".format(stat, count),
                      file=of,
                      flush=True)

        print("\nOutput in", file=lgf, flush=True)
        print(" ", outfile, file=lgf, flush=True)
        print("\nOutput in", file=sys.stderr)
        print(" ", outfile, file=sys.stderr)
Example no. 10
                    line_txt = re.sub(r'\\-', ' ', line_txt)  # spelling
                    line_txt = re.sub(r'"', '', line_txt)
                    line_txt = re.sub(r'[ ]+(?=[.,:;?!])', "", line_txt)
                    #                    line_txt = re.sub(r'[\.!]*[!]+[\.!]*', '!', line_txt)   # replace combos including at least 1 '!'
                    #                    line_txt = re.sub(r'[\.!?]*[?]+[\.!?]*', '?', line_txt)  # replace combos including at least 1 '?'
                    #                    line_txt = re.sub(r'\.+', '.', line_txt)   # replace clusters of '.' with a single '.'
                    line_txt = re.sub(r'[!.?]+', '!', line_txt)
                    line_txt = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]',
                                      '', line_txt)
                    line_txt = re.sub(r" 's ", " ''s ", line_txt)
                    line_txt = re.sub(r"^'s ", "''s ", line_txt)
                    #                    if re.search(r'"".+""', line_txt):
                    #                        print(re.search(r'"".+""', line_txt).group())
                    txt_dict[pair][part][spkr] += line_txt + " "

frog = Frog(FrogOptions(mwu=False, ner=False))

for pair in txt_dict:
    for part in txt_dict[pair]:
        with open("{}pos/{}/{}_{}.pos".format(ecsd_path, pair, pair, part),
                  "w",
                  encoding="utf-8") as g:
            for spkr in txt_dict[pair][part]:
                print(pair, part, spkr)
                text = txt_dict[pair][part][spkr]
                word_list = frog.process(text)
                s_counter = 0
                w_counter = 0
                for word in word_list:
                    if word["index"] == "1":
                        s_counter += 1
Example no. 11
from frog import Frog, FrogOptions

frog = Frog(FrogOptions(chunking=False, parser=False))


def get_words(s, debug=False):
    result = frog.process(s)

    kept, thrown = set(), set()
    for word in result:
        if all(not c.isalpha() for c in word['lemma']):
            thrown.add(word['lemma'])
        elif word['ner'] == 'O':
            kept.add(word['lemma'])
        else:
            if '_' in word['ner']:
                # sometimes phrases are returned instead of words,
                # e.g. "zeg maar" will be returned as "zeggen_maar"
                # with the NER value being "O_O", so two types are
                # actually returned separated by an underscore. We
                # split these phrases again and add the "O" types to
                # the "kept" list.
                words = word['lemma'].split('_')
                types = word['ner'].split('_')
                if 'O' in types:
                    for w, t in zip(words, types):
                        if t == 'O':
                            kept.add(w)
                        else:
                            thrown.add(_format_thrown_word(w, t, debug))
                else:
Example no. 12
import itertools
import json

from frog import Frog, FrogOptions
from flask import Flask, request

frog = Frog(FrogOptions())
app = Flask(__name__)


@app.route('/process', methods=['GET', 'POST'])
def process():
    if request.method == 'POST':
        return json.dumps(frog.process(request.form.get('document')))
    else:
        return 'Perform a POST request with a sentence parameter to get the FROG-tokenized and annotated sentence back'


@app.route('/organisations', methods=['GET', 'POST'])
def organisations():
    organisations = []
    if request.method == 'POST':
        processed_document = frog.process(request.form.get('document'))
        for is_organisation, organisation in itertools.groupby(
                processed_document, key=lambda x: x['ner'].endswith('ORG')):
            if is_organisation:
                organisations.append(" ".join(x['text'] for x in organisation))
        return json.dumps(organisations)
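A quick way to exercise the /process endpoint from a client (hypothetical usage, assuming the app runs on Flask's default http://localhost:5000):

import requests

# Post a document as form data, matching request.form.get('document') above.
resp = requests.post("http://localhost:5000/process",
                     data={"document": "Philips is een bedrijf in Eindhoven."})
print(resp.json())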


@app.route('/persons', methods=['GET', 'POST'])
Example no. 13
def main():
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-s',
        '--nerset',
        type=str,
        help="NER FoLiA Set",
        action='store',
        default=
        "https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/namedentities.foliaset.ttl"
    )
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        help="Frog configuration",
                        action='store',
                        required=True)
    parser.add_argument('--notexact',
                        dest='exact',
                        help="Loose evaluation",
                        action='store_false',
                        default=True)
    parser.add_argument('files', nargs='+', help='bar help')
    args = parser.parse_args()

    frog = Frog(FrogOptions(ner=True, parser=False, xmlout=True), args.config)

    sentence = []
    entities = []
    precisions = []
    recalls = []
    entity = None
    entity_cls = None
    doc = None
    classeval = defaultdict(lambda: defaultdict(int))
    for filename in args.files:
        for token, tag in readdata(
                filename):  #extracttrain also works on test gold standard
            if token is None:  #end of sentence
                if entity:
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                if sentence:
                    print("Processing: ", " ".join(sentence), file=sys.stderr)
                    print("    Reference entities:", entities, file=sys.stderr)
                    doc = frog.process(" ".join(sentence))
                    precision, recall = evaluate(doc, entities, args.nerset,
                                                 classeval, args.exact)
                    print("     precision=",
                          precision,
                          " recall=",
                          recall,
                          file=sys.stderr)
                    if precision is not None:
                        precisions.append(precision)
                    if recall is not None:
                        recalls.append(recall)
                    sentence = []
                    entities = []
            else:
                if tag[0] == 'B':
                    if entity:
                        entities.append((" ".join(entity), entity_cls))
                    entity = []
                    entity_cls = tag[2:]
                    entity.append(token)
                elif tag[0] == 'I':
                    entity.append(token)
                elif entity:
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                sentence.append(token)

    print("overall precision (macroav):\t", sum(precisions) / len(precisions))
    print("overall recall (macroav):\t", sum(recalls) / len(recalls))

    for cls, evaldata in classeval.items():
        try:
            print(cls + " precision (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fp']))
        except ZeroDivisionError:
            print(cls + " precision (microav):\tn/a")
        try:
            print(cls + " recall (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fn']))
        except ZeroDivisionError:
            print(cls + " recall (microav):\tn/a")
Example no. 14
from frog import Frog, FrogOptions
from polyglot.downloader import downloader
from polyglot.text import Text, Word
import morfessor
import pickle
import re

Processed_Sentence = " lezen optimaal liep europese unie gekregen spellen rugzak super allesinds boomhut ontwikkelende gemeenschappen vermeenigvuldigde getallen Vereenvoudigd. ....... is werken lopen een kleine test gewoon om te zien of het wel werkt."
#Processed_Sentence = "Ik spring wat rond in het rond"
#frog = Frog(FrogOptions(tok=True, lemma=True, morph = True, daringmorph=False, mwu=False, chunking=True, ner=True, parser=False))
frog = Frog(
    FrogOptions(tok=True,
                lemma=True,
                morph=True,
                daringmorph=False,
                mwu=False,
                chunking=False,
                ner=False,
                parser=False))
output = frog.process(Processed_Sentence)

print("")
print("RAW OUTPUT")
print(output)
print("length")
print(len(output))
print(output[0])
print(output[1])
print(output[2])
print(output[3])
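For reference, frog.process() returns a list with one dict per token; based on the fields accessed throughout these examples, each element looks roughly like this (values are illustrative, not actual Frog output):

example_token = {
    "index": "1",                    # token position within the sentence
    "text": "lezen",
    "lemma": "lezen",
    "morph": "[lezen]",
    "pos": "WW(inf,vrij,zonder)",
    "posprob": 0.9,
}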
Example no. 15
 def __init__(self, **kwargs):
     # Disable multiword recognition, which is performed by the chunker
     options = FrogOptions(parser=False, mwu=False, xmlIn=True, **kwargs)
     self.__frog = Frog(options)
Example no. 16
#!/usr/bin/env python3

from __future__ import print_function, unicode_literals

from frog import Frog, FrogOptions
import folia.main as folia

frog = Frog(FrogOptions(parser=True))
output = frog.process_raw("Dit is een test")
print("RAW OUTPUT=", output)

output = frog.process("Dit is nog een test.")
print("PARSED OUTPUT=", output)

frog = Frog(FrogOptions(parser=True, xmlout=True))
output = frog.process("Dit is een FoLiA test.")
assert isinstance(output, folia.Document)
assert len(output.data) == 1
assert next(output.select(folia.Sentence)).text() == "Dit is een FoLiA test."
# output is now no longer a string but an instance of folia.Document, provided by the foliapy library (folia.main)
print("FOLIA OUTPUT=")
print(output.xmlstring())

print("Inspecting FoLiA output (example):")
for word in output.words():
    print(word.text() + " " + word.pos() + " " + word.lemma())
assert len(output.words()) == 5
def change_text_to_lemma_POS(sentences, save=False, filename=None):
    # sentence list to sentence list in frog lemma + pos
    lemmapos_sentences = []

    frog = Frog(
        FrogOptions(tok=True,
                    lemma=True,
                    morph=False,
                    daringmorph=False,
                    mwu=False,
                    chunking=False,
                    ner=False,
                    parser=False))

    j = 0
    for sentenceToBeProcessed in sentences:
        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))

        j += 1
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        output = frog.process(sentenceToBeProcessed)
        lemmapos_sentence = ""
        for i in range(0, len(output)):
            pos = str(output[i].get("pos"))
            lemma = str(output[i].get("lemma"))
            #posprob = str(output[i].get("posprob"))
            #print(posprob)

            # print("pos:      " + pos)
            # print("lemma:    " + lemma)

            pos = "<" + pos
            pos = pos.replace("(", "><")
            pos = pos.replace(")", ">")
            pos = pos.replace(",", "><")
            pos = pos.replace("<>", "")

            # print(pos)

            lemmapos_word = lemma + " " + "**" + pos + "**"

            #word = str(output[i].get("text"))
            #print(f"{word}: {lemmapos_word}")

            lemmapos_sentence = lemmapos_sentence + " " + lemmapos_word

        # Remove the first empty string
        #print(lemmapos_sentence)

        lemmapos_sentence = lemmapos_sentence[1:]
        #print("")
        #print("")
        #print("")
        #print("")
        #print(lemmapos_sentence)
        #print("")
        #print("")
        #print("")
        #print("")
        lemmapos_sentences.append(lemmapos_sentence)
        #print("")
        #print(lemmapos_sentences)
        #print("")

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(lemmapos_sentences, outputfile)
    return lemmapos_sentences
Example no. 18
                                  line_txt)  # deal with d'rbij
                line_txt = re.sub(r'^\.', '', line_txt)
                line_txt = re.sub(
                    r'[!?\.,:;]', lambda m: " " + m.group(),
                    line_txt)  # prevent from being interpreted as SPEC(afk)
                #                    if re.search(r'"".+""', line_txt):
                #                        print(re.search(r'"".+""', line_txt).group())
                #                if len(line_txt) > 0:
                #                    if line_txt[-1] not in [".", ",", "!", "?", ":", ";"]:  # add . if chunk does not end in punctuation
                #                            if re.search(r' [A-Za-z]$', line_txt):  # prevent from being interpreted as SPEC(afk)
                #                                line_txt += "!"
                #                            else:
                #                        line_txt += " ."
                txt_dict[file_n][speakers[file_n][spkr]] += line_txt + " "

frog = Frog(FrogOptions(parser=True))


def tag_files(files):
    for fl in files:
        with open(tens_path + "Annotations/pos/" + fl + ".pos", "w") as g:
            for spkr in txt_dict[fl]:
                print(fl, spkr)
                text = txt_dict[fl][spkr]
                #                print(text)
                word_list = frog.process(text)
                print("BLA")
                s_counter = 0
                for word in word_list:
                    if word["index"] == "1":
                        s_counter += 1
Example no. 19
import sys
import re
from frog import Frog, FrogOptions
import multiprocessing

frog = Frog(FrogOptions(parser=True, numThreads=1))

f_path = "/Volumes/tensusers/timzee/other/" if sys.platform == "darwin" else "/vol/tensusers/timzee/other/"

trials = [
    "administrateur", "admiraal", "alarm", "balkon", "ballon", "bar", "baron",
    "bretel", "broer", "cabriolet", "champignon", "commandant", "compagnie",
    "crediteur", "dessert", "directeur", "donateur", "doorn", "duel",
    "dynastie", "epidemie", "expert", "galerie", "gazon", "gel", "generaal",
    "genie", "hoorn", "idee", "interieur", "journaal", "kanaal", "kapitein",
    "kopie", "luitenant", "magnetron", "majoor", "meneer", "mevrouw",
    "microfoon", "militair", "miljonair", "model", "monarch", "monogram",
    "monteur", "mortier", "officier", "perron", "pion", "pistool", "protocol",
    "redacteur", "regisseur", "reptiel", "residu", "restaurant", "saxofoon",
    "sergeant", "sjaal", "strategie", "telegram", "theorie", "trofee",
    "vampier"
]

print("Loading CELEX")
celex = {}
with open(f_path + "DPW3.CD", "r") as f:
    for line in f:
        l_list = line[:-1].split("\\")
        word = l_list[1]
        syls = l_list[4].split("-")
        syl_struc = [i.strip("[]") for i in l_list[5].split("[") if i != ""]