def getAnglicismsList(url):
    """Collects the anglicism entries listed on a wiktionary page.

    Every ``<p>`` element of the page behind ``url`` is split at the dash
    separators. The pieces of all paragraphs are concatenated, the first
    three pieces and the last piece are dropped, and the separator pieces
    themselves are filtered out. The remaining strings are returned.
    """
    separators = [" - ", "- ", " -", " – ", "– "]

    page = BS(urllib2.urlopen(url))  # Parsed html-code of the page
    paragraphs = page.find_all("p")
    wil("Extracting anglicisms from wictionary.", 30)

    # The capturing group makes re.split() keep the separators as pieces
    # of their own; they are removed again further down.
    pieces = []
    for paragraph in paragraphs:
        pieces.extend(re.split("( - | – | -|- |– )", str(paragraph)))
    pieces = pieces[3:-1]  # Using only the relevant parts
    fl()

    wil("Extracting anglicisms from wictionary..")
    entries = [piece for piece in pieces if piece not in separators]
    fl()

    wil("Extracting anglicisms from wictionary...Complete!", 30, "\n")
    return entries
def readFile(filename, ignore_character="##########", onestring=False):
    """Reads a UTF-8 encoded text file line by line.

    Args:
        filename: Path of the file to read.
        ignore_character: Lines starting with this prefix are left out of
            the result (they still count towards the progress display).
        onestring: If true, the kept lines are returned as one string in
            which every line is followed by a single space.

    Returns:
        A list of the kept lines (line endings included), or one string
        if ``onestring`` is true.
    """
    wil("Reading file %s" %(filename))
    lines = []
    # The with-block guarantees the handle is closed, even if decoding
    # fails halfway through the file.
    with codecs.open(filename, "r", "utf-8") as infile:
        fl()
        for count, line in enumerate(infile):
            wil("Reading File %s - %i lines so far" %(filename, count), 20)
            if not line.startswith(ignore_character):
                lines.append(line)
            fl()
    wil("Reading file %s...Complete!" %(filename), 30, "\n")
    if onestring:
        # join() avoids the quadratic cost of repeated string concatenation
        return "".join(line + " " for line in lines)
    return lines
def readVectorFile(word_list, vectors_file, filter=True):
    """Reads a vector file into a dictionary.

    Each line after the first two is split on whitespace; the first field
    is used as the key and the remaining fields are converted to floats.

    Args:
        word_list: List of words whose vectors should be kept. Only used
            if ``filter`` is true. NOTE: every word for which a vector is
            stored is removed from this list (one occurrence per hit), so
            the caller's list is modified.
        vectors_file: Path of the UTF-8 encoded vector file.
        filter: If true, only vectors of words in ``word_list`` are kept;
            otherwise every vector is kept.

    Returns:
        A dict mapping each kept word to its list of floats.
    """
    w("Reading VectorFile %s..." %(vectors_file))
    D = {}
    # Set mirror of word_list so that the membership test per line of the
    # vector file does not have to scan the whole list.
    wanted = set(word_list) if filter else None
    fl()
    with codecs.open(vectors_file, "r", "utf-8") as fin:
        for i, line in enumerate(fin):
            wil("Reading VectorFile %s - %i lines so far" %(vectors_file, i+1), 20)
            fields = line.split()
            # The first two lines of a word2vec vector file can be ignored.
            # They only hold some information about the number of vectors
            # and the vector </s> (number of lines in the corpus).
            # Blank lines are skipped as well; they carry no word.
            if i > 1 and fields:
                word = fields[0]
                if not filter:
                    D[word] = [float(x) for x in fields[1:]]
                # Here the vectors that are not in the dictionary are
                # filtered out.
                elif word in wanted:
                    D[word] = [float(x) for x in fields[1:]]
                    word_list.remove(word)
                    # Keep the set in sync with the list (the list may
                    # contain the same word more than once).
                    if word not in word_list:
                        wanted.discard(word)
            fl()
    wil("Reading VectorFile %s...Complete!" %(vectors_file), 30, "\n")
    return D
def readTupleFile(input_file, separation_character="\t"):
    """Reads a tuple file.

    The file is read with readFile(); every line is stripped of its
    newline characters and split at ``separation_character``.

    Args:
        input_file: Path of the file to read.
        separation_character: String that separates the tuple elements
            within a line.

    Returns:
        A list with one tuple of strings per line.
    """
    lines = readFile(input_file)
    total = len(lines)
    tuples = []
    # enumerate() gives the true position of each line; looking the line
    # up with lines.index() is quadratic and wrong for duplicate lines.
    for position, line in enumerate(lines):
        percentage = position*1.0/total*100.0
        wil("Reading tuple file %s - %.2f%% complete" %(input_file, percentage), 30)
        parts = line.replace("\n", "").split(separation_character)
        tuples.append(tuple(parts))
        fl()
    wil("Reading tuple file %s...Complete!" %(input_file), 30, "\n")
    return tuples
def readTupleFileToDict(input_file, dicttype, separation_character="\t"):
    """Reads a tuple file into a dictionary.

    The file is read with readFile(); every line is stripped of its
    newline characters and split at ``separation_character``. The first
    part becomes the key, the second part the value. If a key occurs more
    than once, the first value is kept.

    Args:
        input_file: Path of the file to read.
        dicttype: Example value that selects the value type. If it is an
            int, values are converted with int(); if it is a string, keys
            and values are converted with unicode(). For any other type
            nothing is stored.
        separation_character: String that separates key and value.

    Returns:
        A defaultdict (without default factory) holding the entries.

    Raises:
        IndexError: If a line does not contain the separator.
        ValueError: If ``dicttype`` is an int and a value is not numeric.
    """
    lines = readFile(input_file)
    dict_ = defaultdict()
    LENGTH = len(lines)
    for i, raw_line in enumerate(lines):
        percentage = (i*1.0/LENGTH*100.0)
        # The format string must have exactly one placeholder per
        # argument, otherwise the % operator raises a TypeError.
        wil("Reading tuple file %s and creating dictionary -%.2f%% complete" %(input_file, percentage), 30)
        parts = raw_line.replace("\n", "").split(separation_character)
        if isinstance(dicttype, int):
            dict_.setdefault(parts[0], int(parts[1]))
        elif isinstance(dicttype, basestring):
            dict_.setdefault(unicode(parts[0]), unicode(parts[1]))
        fl()
    wil("Reading tuple file %s and creating dictionary...Complete!" %(input_file), 30, "\n")
    return dict_
def randomSubset(array, n, output=True):
    """Draws n random elements from a sequence or a dictionary.

    The elements are drawn independently of each other, so the same
    element can be picked more than once and ``n`` may be larger than the
    number of available elements.

    Args:
        array: Sequence to draw from. A dict is converted into a list of
            (key, value) tuples first.
        n: Number of elements to draw. For ``n <= 0`` an empty list is
            returned.
        output: If true, progress is reported while drawing.

    Returns:
        A list with the drawn elements.

    Raises:
        ValueError: If ``array`` is empty and ``n`` is greater than 0.
    """
    if output:
        w("Creating random subset...")
    if isinstance(array, dict):
        # conversion to array of tuples
        array = list(array.items())
    res = []
    # "<" instead of "!=" so that a negative n cannot loop forever
    while len(res) < n:
        if output:
            percentage = len(res)*1.0/n*100
            wil("Creating random subset - %.2f%% complete" %(percentage))
        ri = random.randint(0, len(array)-1)
        res.append(array[ri])
        if output:
            fl()
    if output:
        wil("Creating random subset...Complete!", 50, "\n")
    return res
def lookUpTranslations(list, printErrors=True):
    """Looks up the English translations of anglicisms.

    Args:
        list: Sequence of entries whose first element is the anglicism
            and whose second element is its wikilink ("" if there is
            none). Entries without a wikilink are skipped.
        printErrors: If true, the message of every exception raised while
            processing an entry is printed. The entry is skipped in
            either case.

    Returns:
        A list of tuples with format
        (anglicism, [translation 1, translation 2, ...]). The list of
        translations is empty if the page has no English section.
    """
    # Section of the page that holds the English translations
    section_pattern = r"/wiki/Englisch.+<\/li>"
    # A single piece of text between two tags
    text_pattern = ">[0-9a-zA-Z-. äöüÄÖÜßé]+<"
    redundant = ("> <", ">Englisch<")

    total = len(list)
    tuples = []
    # enumerate() gives the true position of each entry; looking it up
    # with list.index() is quadratic and wrong for duplicate entries.
    for position, e in enumerate(list):
        percentage = position*1.0/total*100
        wil("Looking up translations for %s - %.2f%% complete" %(e[0].replace("ä", "ae").replace("é", "e"), percentage), 20)
        if e[1] == "":  # If there is no wikilink
            fl()
            continue
        try:
            # Extracting the html-code of wiktionary-page
            r = urllib2.Request(e[1])
            html = BS(urllib2.urlopen(r))
            sections = re.findall(section_pattern, unicode(html))
            if sections:  # If there are English translations
                candidates = re.findall(text_pattern, sections[0])
                # Dropping redundant matches and stripping the brackets
                translations = [c.replace(">", "").replace("<", "")
                                for c in candidates if c not in redundant]
            else:
                translations = []  # Default
            tuples.append((e[0].decode('utf-8'), translations))
        except Exception as ex:
            # Best effort: one failing page must not abort the whole run
            if printErrors:
                print(str(ex))
        fl()
    return tuples
def extractFalseFriends(lines):
    """Groups the lines of a false friends .txt file into 4-tuples.

    Newline and tab characters are removed from every line. Four
    consecutive lines form one tuple. A line that consists of a newline
    only discards the lines collected so far and starts a new group.
    """
    wil("Extracting False Friends...")
    group_size = 4
    tuples = []
    fields = []
    for raw in lines:
        if raw == "\n":
            fields = []  # Reset
            continue
        fields.append(raw.replace("\n", "").replace("\t", ""))
        if len(fields) == group_size:
            tuples.append(tuple(fields))
            fields = []  # Reset
    fl()
    wil("Extracting False Friends...Complete!\n")
    return tuples
continue # Extracting the anglicisms anglicism = anglicism[0].replace("<", "").replace(">", "") wikilink = "" if "(Seite nicht vorhanden)" not in str(e): # Extracting the wikilink wikilink = re.findall('=".+"\s', e)[0].replace('="', "").replace('" ', "") wikilink = "http://de.wiktionary.org" + wikilink tuples.append((anglicism, wikilink)) except Exception, ex: errors.append((str(e), str(ex))) continue finally: fl() if printErrors == True: wil("The following errors occured:", 150, "\n") for error in errors: print "Error at entry: %s - %s" %(error[0], error[1]) wil("Creating tuples of anglicisms and their wikilinks...Complete!", 30, "\n") return tuples def lookUpTranslations(list, printErrors=True): """Looks up the English translation of an anglicism.""" # Array for tuples with format (anglzism, [translation 1, translation2]) tuples = []