示例#1
0
def frog_process_files(files, verbose=True):
    """Run Frog over every file in *files* and store the raw output.

    Each input file is read whole, pushed through Frog, and the result is
    written as ``<name>.frog.out`` inside OUTPUT_FOLDER.  When *verbose* is
    true, per-document progress and a running time estimate are printed.
    """
    processed = []
    started = time.time()

    options = frog.FrogOptions(parser=False,mwu=False,ner=False,morph=False,chunking=False, numThreads=8)
    frogger = frog.Frog(options, '/etc/frog/frog.cfg')

    for filename in files:
        with open(filename, 'r') as in_file:
            output = frogger.process_raw(in_file.read())

        if verbose:
            print('> PROCESSING', filename, '%d/%d' % (len(processed), len(files)))

            processed.append(filename)

            # Time estimate: average per-document cost so far, extrapolated
            # over the documents that remain.
            elapsed = time.time() - started
            per_doc = elapsed / len(processed)
            left = (len(files) - len(processed)) * per_doc
            total = left + elapsed

            print("RUNTIME", duration_to_string(elapsed),
                  "(" + duration_to_string(per_doc) + ")",
                  'REMAINING', duration_to_string(left),
                  'TOTAL', duration_to_string(total))

        frogged_filename = util.filename_without_extension(filename, '.txt')

        with open(OUTPUT_FOLDER + frogged_filename + '.frog.out', 'w') as out_file:
            out_file.write(output)
示例#2
0
 def process_data(self, X):
     """Replace every document in X with the space-joined POS tags Frog assigns."""
     import frog
     tagger = frog.Frog(frog.FrogOptions(lemma=False, morph=False))
     tagged = []
     for doc in X:
         tokens = tagger.process(doc)
         tagged.append(' '.join(token['pos'] for token in tokens))
     return tagged
示例#3
0
 def __init__(self, lmdir, sleep=False):
     """Start the Frog tagger unless *sleep* turns it off.

     When active, ``self.frogger`` holds a Frog instance configured from
     the twitter config file found under *lmdir*.
     """
     if sleep:
         return
     import frog
     config_path = lmdir + "LaMachine/lamachine/etc/frog/frog-twitter.cfg"
     self.frogger = frog.Frog(frog.FrogOptions(parser=False, ner=False), config_path)
示例#4
0
 def process_data(self, X):
     """Return X with every document replaced by its space-joined Frog lemmas."""
     options = frog.FrogOptions(morph=False, mwu=False, chunking=False, ner=False)
     lemmatizer = frog.Frog(options)
     result = []
     for doc in X:
         lemmas = [token['lemma'] for token in lemmatizer.process(doc)]
         result.append(' '.join(lemmas))
     return result
示例#5
0
def preprocess(files):
    """
    Preprocess a list of XML-files.
    The cleaned files will be saved in the output folder.

    Removes the XML-tags and cleans the remaining raw text
    to have one sentence per line with lemmatized words.
    Files whose output already exists are skipped; files that fail to
    decode are reported and skipped.
    """
    frog_options = frog.FrogOptions(tok=False,
                                    morph=False,
                                    mwu=True,
                                    chunking=False,
                                    ner=False,
                                    numThreads=8)
    frogger = frog.Frog(frog_options,
                        '/vol/customopt/lamachine/etc/frog/frog.cfg')

    # Compile the patterns once, outside the per-file / per-line loops.
    tag_re = re.compile('<[^>]*>')
    sentence_re = re.compile(r'(\w)\. ([A-Z])')
    punct_re = re.compile(r'[\.,:;/\(\)\[\]\'\"]')

    start_time = time.time()
    for i, file_name in enumerate(files):
        # splitext strips whatever extension is present instead of blindly
        # cutting four characters (the old `[:-4]`), so inputs that are not
        # `*.xml` no longer yield mangled output names.
        outfile = os.path.splitext(ntpath.basename(file_name))[0] + '.txt'
        out_name = os.path.join(OUTPUT_FOLDER, outfile)
        if os.path.isfile(out_name):
            print('Already done:', out_name)
            continue
        with open(file_name, 'r', encoding='utf-8') as file:
            try:
                text = file.read()
                # Remove all XML tags
                text = tag_re.sub('', text)
                lines = text.splitlines()
                # Remove abundant whitespace
                lines = [line.strip() for line in lines]
                # One sentence per line
                lines = [sentence_re.sub('\\1.\n\\2', line) for line in lines]
                # Remove punctuation
                lines = [punct_re.sub('', line) for line in lines]
                # Remove empty lines and make lower case
                lines = [line.lower() for line in lines if line != '']
                # Convert each word to its lemma
                lemmas = [lemmatize(line, frogger) for line in lines]
                # Change extension to .txt
                with open(out_name, 'w', encoding='utf-8') as out:
                    out.write('\n'.join(lemmas))
                # Progress report every 49 documents, with a remaining-time
                # estimate from the average per-document cost so far.
                if i % 49 == 0 and i != 0:
                    print('Done {}/{}'.format(i, len(files)))
                    time_per_doc = (time.time() - start_time) / i
                    print('Average time/document:',
                          sec_to_string(time_per_doc))
                    time_remaining = time_per_doc * (len(files) - i)
                    print('Time remaining:', sec_to_string(time_remaining))
            except UnicodeError:
                print('Skipping {}, UnicodeError'.format(file_name))
示例#6
0
def activate_lemmatizers():
    """Initialise the module-level lemmatizer state.

    Sets up the WordNet lemmatizer, loads the Dutch word->lemma cache from
    CSV, and — when the `frog` package is importable — creates a Frog
    lemmatizer and reopens the cache file in append mode so new entries
    can be written back.
    """
    global frog_installed, frog_lemmatizer, lemmas_nl, lemmas_nl_file, wn_lemmatizer

    wn_lemmatizer = WordNetLemmatizer()
    frog_installed = True
    with open("./data/lemmas_nl.csv", 'r') as lemmas_nl_file:
        nl_frame = pandas.read_csv(lemmas_nl_file, sep=",")
        lemmas_nl = dict(zip(nl_frame["word"], nl_frame["lemma"]))
    try:
        import frog
        frog_lemmatizer = frog.Frog(frog.FrogOptions(parser=False))
        # Rebind the global to an append handle for recording new lemmas.
        lemmas_nl_file = open("./data/lemmas_nl.csv", 'a')
    except ImportError:
        frog_installed = False
示例#7
0
def get_frog():
    """Return the singleton Frog interface, creating it on first call.

    A single instance is cached in the module global FROG, because every
    Frog object spawns a frog process that consumes a lot of RAM.
    """
    global FROG
    if FROG is None:
        options = frog.FrogOptions(
            tok=True, lemma=True, morph=False, daringmorph=False,
            mwu=True, chunking=False, ner=False, parser=False)
        FROG = frog.Frog(
            options, "/home/rahiel/hortiradar/venv/share/frog/nld/frog.cfg")
    return FROG
示例#8
0
def function_sents(X):
    """Reduce each document in X to its function words.

    Keeps articles (LID), pronouns (VNW), conjunctions (VG) and those
    verbs (WW) whose lemma appears in ``data/ww.txt``.  Matching verbs are
    kept as lemmas; the other kept categories as lower-cased surface text.

    Returns a list of token lists, one per input document.
    """
    import frog
    frogg = frog.Frog(frog.FrogOptions(morph=False, mwu=False, chunking=False))
    # The original leaked this file handle; the context manager closes it.
    # A set gives O(1) lemma membership tests with identical semantics.
    with open('data/ww.txt', 'r') as verb_file:
        aux = set(verb_file.read().splitlines())
    new_X = []
    for x in X:
        new_x = []
        output = frogg.process(x)
        for word in output:
            if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                continue
            if word['pos'][:2] == 'WW':
                # Verbs survive only when listed as auxiliaries, as lemmas.
                if word['lemma'] in aux:
                    new_x.append(word['lemma'])
                continue
            new_x.append(word['text'].lower())
        new_X.append(new_x)
    return new_X
示例#9
0
 def process_data(self, X):
     """Filter data. Leave only articles, pronouns, conjunctions and auxiliary verbs."""
     frogg = frog.Frog(
         frog.FrogOptions(morph=False, mwu=False, chunking=False))
     # Close the verb list after reading it (the original leaked the file
     # handle); a set gives O(1) lemma membership with identical semantics.
     with open(config.VERB_FILE, 'r') as verb_file:
         aux = set(verb_file.read().splitlines())
     new_X = []
     for x in X:
         new_x = []
         output = frogg.process(x)
         for word in output:
             # Keep articles (LID), pronouns (VNW), conjunctions (VG)
             # and verbs (WW) only.
             if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                 continue
             if word['pos'][:2] == 'WW':
                 # Verbs survive only when auxiliary, and then as lemmas.
                 if word['lemma'] in aux:
                     new_x.append(word['lemma'])
                 continue
             new_x.append(word['text'].lower())
         new_X.append(new_x)
     return new_X
示例#10
0
# Sanity checks, aborts if specified lexicon files not found.
files_found = True
for f in [greekHDfile, filename, nofreqfile, extrafile, frog_cfg]:
    # Empty/None entries are optional and skipped; only named-but-missing
    # files count as an error.
    if f and not os.path.exists(f):
        print("ERROR: FILE NOT FOUND:", f, file=sys.stderr)
        files_found = False
if not files_found:
    sys.exit(1)

# Initialise Frog.
if have_frog:
    print("INITIALISE FROG", file=sys.stderr)
    # NOTE(review): this rebinds the name `frog` from the imported module to
    # a Frog instance; `frog.FrogOptions` is unreachable after this line.
    frog = frog.Frog(
        frog.FrogOptions(parser=True,
                         tok=False,
                         morph=False,
                         mwu=False,
                         chunking=False,
                         ner=False), frog_cfg)

# Statistics on lexicon files: lines read, entries added, zero-frequency
# entries seen (updated by the reading loops below).
line_count = 0
new_entries = 0
zero_freq = 0

if greekHDfile:
    print("READING", greekHDfile, file=sys.stderr)
    with open(greekHDfile, 'r') as f:
        '''
        WORD            LEMMA       TAG             COUNT
        ἀλλήλοις            ἀλλήλων Pc-p---md--i    5
        ἀλλήλοις            ἀλλήλων Pc-p---nd--i    2
示例#11
0
import os
import frog
from nltk import pos_tag
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import csv

# NOTE(review): rebinds the name `frog` from the imported module to a Frog
# instance, so `frog.FrogOptions` is unreachable after this line.
frog = frog.Frog(frog.FrogOptions(parser=False))
lemmatizer = WordNetLemmatizer()  # WordNet-based lemmatizer for English words


def wn_lemmatizer(word):
    """Lemmatize *word* with WordNet, guided by its NLTK POS tag.

    The first letter of the Penn tag is mapped to a WordNet POS constant
    (noun/verb/adverb/adjective); anything unrecognised defaults to noun.
    """
    penn_tag = pos_tag([word])[0][1]  # Converting it to WordNet format.
    pos_map = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}
    wordnet_pos = pos_map.get(penn_tag[0], wn.NOUN)
    return lemmatizer.lemmatize(word, wordnet_pos)


def preprocess_word(w):
    # An auxiliary function to clean test files.
    # Strips annotation prefixes ("f-", "vk-", "vk"), drops the marker word
    # "geen", and cuts everything from the first "(" onward.
    # NOTE(review): no return statement is visible after the final branch —
    # the function may be truncated in this excerpt; as shown it returns
    # None for everything except w == "geen". Confirm against full source.
    if "f-" in w:
        w = w[2:]
    if w[:3] == "vk-":
        w = w[3:]
    if w[:2] == "vk":
        w = w[2:]
    if w == "geen":
        return ""
    if "(" in w:
        w = w[:w.index("(")]
示例#12
0
文件: tag_ask.py 项目: cmry/amica
import csv
import frog
from tqdm import tqdm

# Shared Frog annotator; parsing and NER disabled for speed.
frogger = frog.Frog(frog.FrogOptions(parser=False, ner=False))
# NOTE(review): these handles are never closed explicitly; they are only
# released when the process exits.
reader = csv.reader(open('dmad_a.csv'))
writer = csv.writer(open('dmad_a_tagged.csv', 'w'))

# Materialise the CSV so tqdm can display total progress.
corp = [x for x in reader]

for i, r in enumerate(tqdm(corp)):
    try:
        # Row 0 (i == 0) is the header and gets the column label 'frogs';
        # every other row gets one "text\tlemma\tpos" line per token of
        # column 4, joined by newlines. Rows missing column 4 are printed.
        r += ["\n".join(["\t".join([token["text"], token["lemma"], token["pos"]]) for token in frogger.process(r[4])])] if i else ['frogs']
        writer.writerow(r)
    except IndexError:
        print(r)
示例#13
0
import nltk.data
import frog
import codecs
import docopt

"""Usage:
    sample_file_builder <input_file> <output_file>
"""


# Dutch sentence splitter (not used in the loop below) and Frog annotator.
sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')
froggie = frog.Frog(frog.FrogOptions(parser=False), "/etc/frog/frog.cfg")

# Annotate the input file line by line, writing raw Frog output per line.
with codecs.open(output_file,"w","utf-8") as of:
    with codecs.open(input_file,"r","utf-8") as infile:
        for line in infile:
            # BUG FIX: the original passed the undefined name `s` here,
            # which raised NameError on the first line; the loop variable
            # `line` was clearly intended.
            of.write(froggie.process_raw(line)+"\n")
示例#14
0
def lemmatize_sents(X):
    """Turn every document in X into one string of its Frog lemmas."""
    import frog
    frogg = frog.Frog(
        frog.FrogOptions(morph=False, mwu=False, chunking=False, ner=False))
    lemmatized = []
    for sent in X:
        tokens = frogg.process(sent)
        lemmatized.append(' '.join(tok['lemma'] for tok in tokens))
    return lemmatized
# -*- coding: utf-8 -*-

import frog
import re


# Load the translated+tokenised corpora, one line per sentence
# (trailing newlines are kept).
with open("./data/pos.translated.tok", "r") as f_in:
    pos_trans_list = [l for l in f_in]
with open("./data/neg.translated.tok", "r") as f_in:
    neg_trans_list = [l for l in f_in]



# NOTE(review): rebinds the name `frog` from the module to a Frog instance.
frog = frog.Frog(frog.FrogOptions(parser=False, ner=False, tok=False))
# CGN-style tag pattern: coarse label followed by parenthesised features.
p = re.compile('(ADJ|BW|LID|N|SPEC|TSW|TW|VG|VNW|VZ|WW|LET)\((.*)\)')

def parse_pos(pos):
    """Split a CGN-style tag such as ``N(soort,ev)`` into its coarse label
    and the list of fine-grained features (matched via module regex ``p``)."""
    match = p.match(pos)
    coarse_tag = match.group(1)
    features = match.group(2)
    return coarse_tag, features.split(",")

# Coarse POS-tag sequence for every positive sentence.
X_pos = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    
    for sent in pos_trans_list
]
X_neg = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    
    for sent in neg_trans_list