-
Notifications
You must be signed in to change notification settings - Fork 0
/
GermEvalReader.py
104 lines (70 loc) · 2.96 KB
/
GermEvalReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
import re
from nltk.compat import xrange
from unidecode import unidecode
import numpy as np
"""
Functions to read in the files from the GermEval contest and
create suitable numpy matrices for train/dev/test.
@author: Nils Reimers
"""
def readFile(filepath):
    """Read a tab-separated, GermEval-style file into a list of sentences.

    Empty lines and lines whose first character is '#' separate sentences.
    Every other line is split on tabs, and its second and third columns are
    kept as a two-element list (presumably the token and its label -- confirm
    against the data format).

    :param filepath: path of the file to read
    :return: list of sentences; each sentence is a list of
        ``[splits[1], splits[2]]`` entries, in file order
    :raises IndexError: if a data line has fewer than three columns
    """
    sentences = []
    sentence = []

    # The context manager closes the handle even when a malformed line raises.
    with open(filepath) as infile:
        for line in infile:
            line = line.strip()

            if not line or line[0] == '#':
                # Separator line: flush the sentence collected so far.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            splits = line.split('\t')
            sentence.append([splits[1], splits[2]])

    # Keep the last sentence when the file does not end with a separator line.
    if sentence:
        sentences.append(sentence)

    return sentences
def multiple_replacer(key_values):
    """Build a function that performs several string replacements in one pass.

    :param key_values: mapping (or iterable of ``(old, new)`` pairs) from the
        substrings to search for to their replacement strings
    :return: callable that takes a string and returns it with every occurrence
        of a key replaced by the corresponding value
    """
    replace_dict = dict(key_values)
    if not replace_dict:
        # An empty alternation would match the empty string at every position
        # and the lookup of '' would fail; with nothing to replace, hand the
        # input back unchanged.
        return lambda string: string

    # Longest keys first, so a key that is a prefix of another key cannot
    # shadow it in the alternation.
    keys = sorted(replace_dict, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys), re.M)

    def replacement_function(match):
        return replace_dict[match.group(0)]

    return lambda string: pattern.sub(replacement_function, string)
def multiple_replace(string, key_values):
    """Return ``string`` with the replacements described by ``key_values`` applied.

    Convenience wrapper: builds a replacer with ``multiple_replacer`` and
    calls it once on ``string``.
    """
    replacer = multiple_replacer(key_values)
    return replacer(string)
def normalizeWord(line):
    """Normalize a piece of text for lookup.

    The text is lower-cased, transliterated to ASCII with ``unidecode`` while
    keeping the German characters ß, ä, ü and ö, and every run that starts
    with a digit and continues with digits, '.' or ',' is replaced by '0'.

    :param line: text to normalize, either UTF-8 encoded bytes or already
        decoded text
    :return: the normalized text with surrounding whitespace stripped
    """
    # Decode only raw bytes. The former np.unicode(line, "utf-8") relied on a
    # NumPy alias of the builtin text type that recent NumPy releases removed.
    if isinstance(line, bytes):
        line = line.decode("utf-8")

    line = line.replace(u"„", u"\"")
    line = line.lower()

    # Shield ß/ä/ü/ö from unidecode by swapping them for upper-case ASCII
    # placeholders. The text was just lower-cased, so the placeholders are
    # not expected to collide with existing content.
    replacements = {u"ß": "SZ", u"ä": "AE", u"ü": "UE", u"ö": "OE"}
    replacementsInv = dict((placeholder, char) for char, placeholder in replacements.items())
    line = multiple_replace(line, replacements)
    line = unidecode(line)
    line = multiple_replace(line, replacementsInv)

    # unidecode may introduce upper-case output, e.g. "€" becomes "EUR".
    line = line.lower()

    # Collapse every number (digits with embedded '.' or ',') into a single '0'.
    line = re.sub(r"([0-9][0-9.,]*)", '0', line)

    return line.strip()
def createNumpyArray(sentences, windowsize, word2Idx, label2Idx):
    """Turn sentences into fixed-width context windows of word indices plus labels.

    For every position in every sentence a window of ``2 * windowsize + 1``
    entries is built around that position. Positions outside the sentence are
    filled with ``word2Idx['MASK']``, and tokens missing from ``word2Idx`` are
    mapped to ``word2Idx['UNK']``. The share of unknown tokens is printed.

    :param sentences: list of sentences, each a list of ``[token, label]``
        entries (element 0 is looked up in ``word2Idx``, element 1 in
        ``label2Idx``)
    :param windowsize: number of context entries on each side of the target
    :param word2Idx: mapping from token to index; must contain the keys
        'UNK' and 'MASK'
    :param label2Idx: mapping from label to integer index
    :return: tuple ``(xMatrix, yVector)`` of int32 numpy arrays; one row of
        window indices and one label index per token
    :raises KeyError: if 'UNK'/'MASK' is missing from ``word2Idx`` or a label
        is missing from ``label2Idx``
    """
    unknownIdx = word2Idx['UNK']
    paddingIdx = word2Idx['MASK']

    xMatrix = []
    yVector = []

    wordCount = 0
    unknownWordCount = 0

    for sentence in sentences:
        sentenceLength = len(sentence)

        for targetWordIdx in range(sentenceLength):
            # Map the context window around the target word to embedding indices.
            wordIndices = []
            for wordPosition in range(targetWordIdx - windowsize, targetWordIdx + windowsize + 1):
                if wordPosition < 0 or wordPosition >= sentenceLength:
                    wordIndices.append(paddingIdx)
                    continue

                token = sentence[wordPosition][0]
                wordCount += 1
                # NOTE(review): only an exact match is tried here; a fallback
                # to a lower-cased or normalized token may be wanted -- confirm.
                if token in word2Idx:
                    wordIdx = word2Idx[token]
                else:
                    wordIdx = unknownIdx
                    unknownWordCount += 1
                wordIndices.append(wordIdx)

            # Map the label of the target word to its integer index.
            labelIdx = label2Idx[sentence[targetWordIdx][1]]

            xMatrix.append(wordIndices)
            yVector.append(labelIdx)

    # Guard against empty input, where there is no token to take a share of.
    unknownPercent = 100.0 * unknownWordCount / wordCount if wordCount else 0.0
    print("Unknowns: %.2f%%" % unknownPercent)

    return (np.asarray(xMatrix, dtype='int32'), np.asarray(yVector, dtype='int32'))