from pattern.de import singularize, conjugate, predicative


def lemma_via_patternlib(token, pos):
    if pos == 'NP':  # singularize noun
        return singularize(token)
    elif pos.startswith('V'):  # get infinitive of verb
        return conjugate(token)
    elif pos.startswith('ADJ') or pos.startswith('ADV'):  # get base form of adjective or adverb
        return predicative(token)
    return token
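# A minimal usage sketch for lemma_via_patternlib(); the (token, pos) pairs
# below are hand-picked assumptions matching the tags the function dispatches
# on, not the output of a particular tagger.
for token, pos in [("Katzen", "NP"), ("gesehen", "VBN"), ("kleinen", "ADJA")]:
    print(token, "->", lemma_via_patternlib(token, pos))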
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if tag == "n":
            if de.singularize(pl) == sg:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.82)
    print("pattern.de.singularize()")
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
        if tag == "n":
            if de.singularize(pl) == sg:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.81)
    print("pattern.de.singularize()")
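# Quick illustration of the call the two accuracy tests above exercise; the
# word pair is the stock pattern.de example, not a row from the CELEX sheet.
from pattern.de import singularize

print(singularize("Katzen"))  # expected: "Katze"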
def _getSingularize(word, language):
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport
    if language == "es":
        return pattern_es.singularize(word)
    elif language == "en":
        return pattern_en.singularize(word)
    elif language == "it":
        return pattern_it.singularize(word)
    elif language == "fr":
        return pattern_fr.singularize(word)
    elif language == "de":
        return pattern_de.singularize(word)
    else:
        # fall back to English for unknown language codes
        return pattern_en.singularize(word)
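# A sketch of the same dispatch written as a lookup table; the module map and
# the English fallback mirror _getSingularize() above, and importlib is only
# an illustrative alternative, not part of the original code.
import importlib

def singularize_by_language(word, language):
    modules = {"en": "pattern.en", "es": "pattern.es", "fr": "pattern.fr",
               "de": "pattern.de", "it": "pattern.it"}
    module = importlib.import_module(modules.get(language, "pattern.en"))
    return module.singularize(word)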
# Fragment of a per-token feature-extraction loop; j indexes the current
# token, sentTagged holds (word, POS) pairs from the tagger.
if "no result for" in output:
    knownWord = 0

## Feature 4: no free pronouns (Stanford), yes(1)/no(0)
## go through the sentence and see if there are tags for pronouns
wordsPOS0 = sentTagged[j]
wordsPOS = wordsPOS0[1]
if wordsPOS in ("PPOS", "PDS", "PRELS", "PWS", "PIS", "PRF", "PPER"):
    pronouns = 1
elif wordsPOS in ("ADJA", "ADJD"):
    adj = adj + 1

## Feature 7: word frequencies (look up frequency classes in DeReWo)
rf = 0
countInList = 0
word2 = wordsLowWOPunc[j]
word2 = singularize(word2.decode("utf-8")).lower()
for i in range(0, len(freqWo)):
    if freqWo[i].decode("utf-8") == word2:
        rf = freqNo[i]  # read the word's DeReWo frequency class
        countInList = countInList + 1
        if rf > 14:
            rareFreq = rareFreq + 1
        break
## if the word is not in the list it must be rare, therefore increase the count
if countInList == 0:
    rareFreq = rareFreq + 1

## Feature 8: mean word length
wordLen = len(word2)
wordLenMean = wordLen + wordLenMean
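# Sketch of a constant-time alternative for the DeReWo lookup above, assuming
# freqWo/freqNo are parallel lists of words and frequency classes; decoding of
# the list entries is omitted here for brevity.
freq_class = dict(zip(freqWo, freqNo))

rf = freq_class.get(word2)
if rf is None or rf > 14:
    # unseen in DeReWo or in a high frequency class: count as a rare word
    rareFreq = rareFreq + 1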
from pattern.de import parsetree, tag, singularize

dict_name_nn = {}
dict_name_ne = {}

with open('testfile.txt', 'r') as openfile:
    read_text = openfile.read()

parsetree_text = parsetree(read_text)
# pprint(parsetree_text)
# read_text = read_text.rstrip('\n')
# print(re.findall(r'[\w]+|[.,!?;]', read_text))
# pprint(parse(read_text, tags=True, chunks=True, relations=True,
#        lemmata=True, encoding='utf-8', tagset='STTS'))

for word, pos in tag(read_text, tagset='STTS'):
    if pos == 'NN':
        singularForm = singularize(word)
        if word != singularForm:
            word = singularForm
        if word not in dict_name_nn.keys():
            dict_name_nn[word] = 1
        else:
            dict_name_nn[word] += 1
    elif pos == 'NE':
        # proper nouns are counted into dict_name_ne the same way
        singularForm = singularize(word)
        if word != singularForm:
            word = singularForm
        if word not in dict_name_ne.keys():
            dict_name_ne[word] = 1
        else:
            dict_name_ne[word] += 1
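# Design note as a sketch: collections.Counter expresses the counting loops
# above more compactly with the same result, assuming the same pattern.de
# tag() output.
from collections import Counter
from pattern.de import tag, singularize

counts_nn = Counter(singularize(word)
                    for word, pos in tag(read_text, tagset='STTS')
                    if pos == 'NN')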
import codecs
import re

# not used, but possibly interesting: http://www.nltk.org/
# http://www.clips.ua.ac.be/pages/pattern-de
from pattern.de import lemma, tag, predicative, singularize

# possible parts of speech:
# PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP,
# RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH
part_of_speech_command = {
    'PRP$': lambda word: predicative(word),  # pronouns
    'VBN': lambda word: lemma(word),         # verbs
    'DT': lambda word: predicative(word),    # pronouns
    'VB': lambda word: lemma(word),          # verbs
    'NN': lambda word: singularize(word),    # nouns
    'JJ': lambda word: predicative(word)     # adjectives
}

pattern_word = re.compile(r'[a-zA-Z]')
pattern_punctuation = re.compile(r'[—\-|«»…–<>]')


def transform(tagword):
    word = tagword[0]
    part = tagword[1]
    # if part == 'VBN':
    #     print tagword
    # the word must contain some letters
    if not bool(pattern_word.match(tagword[0])):
        pass
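# Hypothetical usage sketch of the part_of_speech_command table: pick the
# normalizer registered for a token's tag and fall back to the identity
# function for unlisted tags; the sample sentence is made up.
for word, part in tag("Die Kinder haben die Bücher gelesen"):
    normalize = part_of_speech_command.get(part, lambda w: w)
    print(word, part, normalize(word))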
from pattern.de import tag, singularize  # web2py supplies db and SQLFORM


def insertTagsToParsedDB(lastID, lastTitel, lastText):
    """Process the input, POS-tag it and write it to the DB dbParsedText."""
    parsedDatabase = SQLFORM(db.dbParsedText)
    inputDatabase = SQLFORM(db.dbInput)
    dictNN = {}
    dictNE = {}
    # SQL query to extract ID, title and text
    extractQueryInputDB = db.executesql('select id, inputTitle, inputText '
                                        'from dbInput')
    lastText = extractQueryInputDB[-1][-1]
    # POS-tag the text and count singularized nouns (NN) and proper nouns (NE)
    for word, postag in tag(lastText, tagset='STTS'):
        word = word.decode('utf-8')
        if postag == 'NN':
            singularFormNN = singularize(word)
            if word != singularFormNN:
                word = singularFormNN
            if word not in dictNN.keys():
                dictNN[word] = 1
            else:
                dictNN[word] += 1
        elif postag == 'NE':
            singularFormNE = singularize(word)
            if word != singularFormNE:
                word = singularFormNE
            if word not in dictNE.keys():
                dictNE[word] = 1
            else:
                dictNE[word] += 1
    listNN = dictNN.items()
    listNE = dictNE.items()
    return dictNE, dictNN
import fnmatch
import json
import os

from pattern.de import tag, singularize


def buildWordList():
    """
    Build lists and dictionaries.

    Exports JSON files with the dictionaries and lists from both DDC
    classes for the words tagged as NE and NN.
    """
    # Lists and dictionaries per DDC class and POS tag
    list330NE, list330NN, list330 = [], [], []
    list710NE, list710NN, list710 = [], [], []
    dict330NE, dict330NN, dict330, dict330WithoutCommons = {}, {}, {}, {}
    dict710NE, dict710NN, dict710, dict710WithoutCommons = {}, {}, {}, {}

    # Map (DDC class, POS tag) to the list/dict triple that collects the word
    targets = {
        ('330', 'NN'): (list330NN, list330, dict330NN),
        ('330', 'NE'): (list330NE, list330, dict330NE),
        ('710', 'NN'): (list710NN, list710, dict710NN),
        ('710', 'NE'): (list710NE, list710, dict710NE),
    }

    for dirpath, dirs, files in os.walk('../../collecting/temp/'):
        for filename in fnmatch.filter(files, '*.txt'):
            # os.walk already yields the full dirpath, so join it directly
            with open(os.path.join(dirpath, filename), 'r') as openfile:
                parsefile = openfile.read()
            ddcFromFilepath = dirpath[-3:]
            for word, postag in tag(parsefile, tagset='STTS'):
                word = word.decode('utf-8')
                if (ddcFromFilepath, postag) not in targets:
                    continue
                singularForm = singularize(word)
                if word != singularForm:
                    word = singularForm
                posList, ddcList, posDict = targets[(ddcFromFilepath, postag)]
                posList.append(word)
                ddcList.append(word)
                posDict[word] = posDict.get(word, 0) + 1

    # Words common to both DDCs
    listCommonWords = list(set(list330).intersection(list710))

    # New lists without the common words
    list330WithoutCommons = [w for w in list330 if w not in listCommonWords]
    list710WithoutCommons = [w for w in list710 if w not in listCommonWords]

    # New dictionaries without the common words
    for w in list330WithoutCommons:
        dict330WithoutCommons[w] = dict330WithoutCommons.get(w, 0) + 1
    for w in list710WithoutCommons:
        dict710WithoutCommons[w] = dict710WithoutCommons.get(w, 0) + 1

    # Merge the NE and NN dictionaries into one dictionary per DDC class
    dict330.update(dict330NE)
    dict330.update(dict330NN)
    dict710.update(dict710NE)
    dict710.update(dict710NN)

    # Dump the dictionaries into JSON files
    exports = {
        'dict330NE.json': dict330NE,
        'dict330NN.json': dict330NN,
        'dict330WithoutCommons.json': dict330WithoutCommons,
        'dict330All.json': dict330,
        'dict710NE.json': dict710NE,
        'dict710NN.json': dict710NN,
        'dict710WithoutCommons.json': dict710WithoutCommons,
        'dict710All.json': dict710,
    }
    for exportname, dictionary in exports.items():
        with open('../../collecting/' + exportname, 'w') as exportfile:
            json.dump(dictionary, exportfile, sort_keys=True, indent=4,
                      ensure_ascii=False, separators=(',', ': '))
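# Tiny worked example of the common-word filtering used in buildWordList();
# the words are made-up sample data.
sample330 = ["Steuer", "Markt", "Stadt"]
sample710 = ["Stadt", "Park", "Markt"]
common = set(sample330).intersection(sample710)   # {"Markt", "Stadt"}
print([w for w in sample330 if w not in common])  # ["Steuer"]
print([w for w in sample710 if w not in common])  # ["Park"]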