Exemplo n.º 1
0
def generateEntries(list, printErrors=True):
    """Generates array of tuples (anglicism, wiktionary-link)."""
    tuples = []  # Array for tuples (anglicism, wiktionary-link)
    errors = []  
    
    for e in list:
        percentage = list.index(e)*1.0/len(list)*100
        wil("Creating tuples of anglicisms and their wikilink -" 
        	"%.2f%% complete" %(percentage), 60)
        try:
            anglicism = re.findall(">[0-9a-zA-Z-. äöüÄÖÜßé]+<", e)
            if anglicism == []:
                continue
            # Extracting the anglicisms
            anglicism = anglicism[0].replace("<", "").replace(">", "")
            wikilink = ""
            
            if "(Seite nicht vorhanden)" not in str(e):
            	# Extracting the wikilink
                wikilink = re.findall('=".+"\s', e)[0].replace('="', 
                				        "").replace('" ', "") 
                wikilink = "http://de.wiktionary.org" + wikilink
            tuples.append((anglicism, wikilink))
        except Exception, ex:
            errors.append((str(e), str(ex)))
            continue
        finally:
Exemplo n.º 2
0
def extractDictEntries(lines, printErrors=True):
    """Extracts dictionary entries and returns an array of tuples."""
    # You can find the entries in the dict.cc-file in following form:
    # Entry {specification} Additional Entry <Abbreviation> 
    # [Comment1] [Comment2] [...]   wordtype
    # Many of these parts are optional or depend on the word class.
    
    tuples = [] # Array of tuples in form of (German DictEntry object, 
                # english DictEntry object, word class)
    errors = []

    for i in xrange(len(lines)):
        percentage = i*1.0/len(lines)*100
        wil("Extracting and generating dictionary entries - "
                         "%.2f%% complete" %(percentage), 50)
        try:
            # Seperates german part, english part and word class
            _entries = re.split("\t", lines[i]) 
            german_parts = extractParts(_entries[0])
            english_parts = extractParts(_entries[1])
            germanEntry = (german_parts[0], german_parts[1], german_parts[2], 
                           german_parts[3], german_parts[4])
            englishEntry = (english_parts[0], english_parts[1], 
                            english_parts[2], english_parts[3], 
                            english_parts[4])
            tuples.append((germanEntry, englishEntry, _entries[2].
                           replace("\n", "")))

        except Exception, e:
            errors.append("%s with line %s" %(e, lines[i]))
            continue
        finally:
Exemplo n.º 3
0
def readVectorFile(word_list, vectors_file, filter=True):
    """Reads a vector file into a dictionary.

    Every line after the first two is split on whitespace; the first token
    becomes the key and the remaining tokens are converted to floats.

    Args:
        word_list: List of words to keep when ``filter`` is true. A word
            that is found in the file is removed from this list in place
            (first occurrence only), so later lines carrying the same word
            are ignored.
        vectors_file: Path of the UTF-8 encoded vector file.
        filter: If true, only vectors whose word is in ``word_list`` are
            kept; otherwise every vector is kept.

    Returns:
        A dict mapping each kept word to its list of floats.
    """
    w("Reading VectorFile %s..." % (vectors_file,))
    D = {}
    fl()
    with codecs.open(vectors_file, "r", "utf-8") as fin:
        for i, line in enumerate(fin):
            # The original format string had a surplus "%s" and raised a
            # TypeError ("not enough arguments") on the very first line.
            wil("Reading VectorFile %s - %i lines so far"
                % (vectors_file, i + 1), 20)
            vector = line.split()
            # The first two lines of a word2vec vector file can be ignored:
            # they only hold the number of vectors and the vector for </s>
            # (number of lines in the corpus). Blank lines are skipped too
            # instead of failing with an IndexError.
            if i > 1 and vector:
                word = vector[0]
                if not filter:
                    D[word] = [float(x) for x in vector[1:]]
                elif word in word_list:
                    # Keep only the vectors whose word is in the dictionary.
                    D[word] = [float(x) for x in vector[1:]]
                    word_list.remove(word)
            fl()

    wil("Reading VectorFile %s...Complete!\n" % (vectors_file,), 30)
    return D
Exemplo n.º 4
0
def readTupleFile(input_file, separation_character="\t"):
    """Reads a tuple file. Tuples are separated by separation_character.

    Args:
        input_file: Path of the file, read via ``readFile``.
        separation_character: String on which every line is split.

    Returns:
        A list with one tuple of strings per line (newline removed).
    """
    lines = readFile(input_file)
    total = len(lines)
    tuples = []
    # enumerate() supplies the position directly; looking it up with
    # lines.index(line) was quadratic and returned the first occurrence's
    # index for duplicate lines, so the progress display was wrong.
    for i, line in enumerate(lines):
        percentage = i*1.0/total*100.0
        wil("Reading tuple file %s - %.2f%% complete"
            % (input_file, percentage), 30)
        parts = line.replace("\n", "").split(separation_character)
        tuples.append(tuple(parts))
        fl()
    wil("Reading tuple file %s...Complete!" % (input_file,), 30, "\n")
    return tuples
Exemplo n.º 5
0
def writeTupleFile(tuples, output_file, separation_character="\t", 
                   printErrors=True):
    """Enhanced function for writing a tuple file."""
    file = codecs.open(output_file, "w", "utf-8")
    for i in xrange(len(tuples)):
        percentage = i*1.0/len(tuples)*100
        wil("Writing file %s - %.2f%% complete" 
            %(output_file, percentage), 50)
        try:
            file.write(unicode(tuples[i][0]) + separation_character +
                       unicode(tuples[i][1]) + "\n")
        except Exception, ex:
            if printErrors:
                w("%s: %s" %(str(ex), str(tuples[i])))
            continue
        finally:
Exemplo n.º 6
0
def main():
    """Runs the whole comparison pipeline.

    Loads the dictionary, the anglicisms, the pickled German/English vectors
    and the false-friend pairs, creates one transformation matrix per
    (model, alpha) combination, lets ``compareMatrices`` pick a model and
    finally repeats ``falseFriendsCheck`` 100 times, reporting how often it
    returned a false value.
    """
    cl()
    wh("\t\tCrOssinG: CompaRing Of AngliciSmS IN German", 75)

    dictionary = readDictionary("../res/dictEntries.txt")
    anglicisms = readTupleFile("../res/anglicisms.txt")
    # NOTE(review): pickle.load can execute arbitrary code; only load vector
    # files that come from a trusted source.
    # The files are opened in binary mode and closed deterministically.
    with open("../res/DE_VEC.bin", "rb") as vec_file:
        devectors = pickle.load(vec_file)
    with open("../res/EN_VEC.bin", "rb") as vec_file:
        envectors = pickle.load(vec_file)
    false_friends = readTupleFile("../res/false_friends.txt")

    alphas = [0.0001, 0.0002, 0.001, 0.002, 0.01, 0.02, 0.1, 0.2]
    models = ["ridge", "net", "Lasso"]
    model_paras = [(model, alpha) for model in models for alpha in alphas]
    w("Creating VectorTransformators...\n")

    vm = VectorManager.VectorTransformator()
    vm.Dictionary = dictionary
    vm.V = devectors
    vm.W = envectors

    for number, (model, alpha) in enumerate(model_paras, 1):
        vm.createTransformationMatrix(model, alpha)
        w("VectorTransformator Nr. %i with Model=%s and alpha=%g has been"
          " created\n" % (number, model, alpha))

    w("Creating VectorTransformators...Complete!\n\n")
    models = vm.Models
    top_model = compareMatrices(false_friends, vm, models, devectors,
                                envectors)

    w("\nChecking the quality of an evaluation with false-friend-pairs...\n")
    false_count = 0
    n_tests = 100
    for i in range(n_tests):
        wil("False friend test nr. %i" % (i + 1))
        res = falseFriendsCheck(false_friends, vm, top_model, devectors,
                                envectors, dictionary, 50, False)
        if not res:
            false_count += 1
    # The two literals were previously joined without a separating space
    # ("averagesimilarity").
    w("\nIn %i out of %i times, a random subset had a lower or equal average "
      "similarity than a random false friend subset.\n"
      % (false_count, n_tests))
Exemplo n.º 7
0
def readTupleFileToDict(input_file, dicttype, separation_character="\t"):
    """Reads a tuple file into a dictionary.

    Every line is split on ``separation_character``; the first part is the
    key and the second part the value. A key that occurs more than once
    keeps the value of its first occurrence (``setdefault``).

    Args:
        input_file: Path of the file, read via ``readFile``.
        dicttype: Sample value selecting the conversion. An ``int`` instance
            stores values as ``int``; a string instance stores keys and
            values as ``unicode``. For any other value no entry is stored.
        separation_character: String on which every line is split.

    Returns:
        A ``defaultdict`` without default factory holding the entries.
    """
    lines = readFile(input_file)
    dict_ = defaultdict()
    total = len(lines)
    # The selector does not change while reading, so test it only once.
    as_int = isinstance(dicttype, int)
    as_text = isinstance(dicttype, basestring)
    for i, raw_line in enumerate(lines):
        percentage = (i*1.0/total*100.0)
        # The original format strings carried a surplus "%s" and raised a
        # TypeError ("not enough arguments for format string").
        wil("Reading tuple file %s and creating dictionary -%.2f%% complete"
            % (input_file, percentage), 30)
        parts = raw_line.replace("\n", "").split(separation_character)
        if as_int:
            dict_.setdefault(parts[0], int(parts[1]))
        elif as_text:
            dict_.setdefault(unicode(parts[0]), unicode(parts[1]))
        fl()
    wil("Reading tuple file %s and creating dictionary...Complete!\n"
        % (input_file,), 30)
    return dict_
Exemplo n.º 8
0
def randomSubset(array, n, output=True):
    """Draws n random elements (with replacement) from array.

    Args:
        array: Sequence to draw from. A dict is first converted into a list
            of (key, value) tuples.
        n: Number of elements to draw. For n <= 0 an empty list is returned.
        output: If true, progress is reported via ``w``/``wil``/``fl``.

    Returns:
        A list with n elements of ``array``; the same element may occur
        more than once.

    Raises:
        ValueError: If ``array`` is empty and n > 0 (from random.randint).
    """
    if output:
        w("Creating random subset...")
    if isinstance(array, dict):
        # Conversion to an array of tuples. items() yields the same pairs
        # as indexing keys()/values() in parallel, without requiring the
        # views to be subscriptable.
        array = list(array.items())
    res = []
    # "<" instead of "!=" so that a negative or non-integer n cannot turn
    # this into an endless loop.
    while len(res) < n:
        if output:
            percentage = len(res)*1.0/n*100
            wil("Creating random subset - %.2f%% complete" % (percentage))
        ri = random.randint(0, len(array)-1)
        res.append(array[ri])
        if output:
            fl()
    if output:
        wil("Creating random subset...Complete!", 50, "\n")
    return res
Exemplo n.º 9
0
def getAnglicismsList(url):
    """Extracts a list of anglicisms from a wiktionary page.

    The page behind ``url`` is fetched and parsed, every <p> element is
    split at the dash separators, the pieces from index 3 up to (but not
    including) the last one are kept and the separator strings themselves
    are dropped.

    Returns:
        A list of strings (the remaining pieces).
    """
    page = BS(urllib2.urlopen(url))  # Parsed html-code of the page
    paragraphs = page.find_all("p")  # Every relevant section
    wil("Extracting anglicisms from wictionary.", 30)

    pieces = []
    for paragraph in paragraphs:
        # The pattern covers the many variants of separators; because of
        # the capturing group the separators end up in the result as well.
        pieces.extend(re.split("( - | – | -|- |– )", str(paragraph)))

    # Using only the relevant parts. This has to happen before the
    # separators are removed, since they still count as positions here.
    pieces = pieces[3:-1]
    fl()
    wil("Extracting anglicisms from wictionary..")

    separators = (" - ", "- ", " -", " – ", "– ")
    # Dropping the redundant separator matches
    pieces = [piece for piece in pieces if piece not in separators]

    fl()
    wil("Extracting anglicisms from wictionary...Complete!", 30, "\n")
    return pieces
Exemplo n.º 10
0
def readFile(filename, ignore_character="##########", onestring=False):
    """Reads a UTF-8 encoded file.

    Args:
        filename: Path of the file to read.
        ignore_character: Lines starting with this prefix are left out.
        onestring: If true, the kept lines are returned as one string in
            which every line is followed by a single space.

    Returns:
        A list of the kept lines (line endings included), or one string if
        ``onestring`` is true.
    """
    wil("Reading file %s" % (filename,))

    lines = []
    # The context manager closes the file even if reading fails; the handle
    # was never closed before.
    with codecs.open(filename, "r", "utf-8") as infile:
        fl()
        for count, line in enumerate(infile):
            wil("Reading File %s - %i lines so far" % (filename, count), 20)
            if not line.startswith(ignore_character):
                lines.append(line)
            fl()
    wil("Reading file %s...Complete!" % (filename,), 30, "\n")
    if onestring:
        # If result should be one string instead of an array of strings
        return "".join(line + " " for line in lines)
    return lines
Exemplo n.º 11
0
def lookUpTranslations(list, printErrors=True):
    """Looks up the English translation of an anglicism.

    Args:
        list: Sequence of entries; e[0] is the anglicism and e[1] its
            wikilink (an empty string if there is none).
        printErrors: If true, the message of any exception raised while
            processing an entry is printed.
    """
    # NOTE(review): no return statement is visible for this function here,
    # so the collected tuples are not handed back as written - the snippet
    # may be truncated.
    # Array for tuples with format (anglicism, [translation 1, translation 2])
    tuples = [] 
    
    for e in list:
        # Progress is derived from the position of the entry; list.index
        # returns the position of the first equal entry.
        percentage = list.index(e)*1.0/len(list)*100
        # "ä" and "é" are replaced for the progress message only
        # (presumably to avoid console encoding problems - TODO confirm).
        wil("Looking up translations for %s - %.2f%% complete" 
        	%(e[0].replace("ä", "ae").replace("é", "e"), percentage), 20)
        if e[1] == "":  # If there is no wikilink
            fl()
            continue
        try:
            # Extracting the html-code of wiktionary-page
            r = urllib2.Request(e[1])
            html = BS(urllib2.urlopen(r))
            # If there are English translations
            if len(re.findall("/wiki/Englisch.+<\/li>", str(html))) > 0: 
                # Only the first matching list item is examined
                translations = re.findall("/wiki/Englisch.+<\/li>", 
                						  unicode(html))[0]
                # Every piece of text enclosed by ">" and "<"
                translations = re.findall(">[0-9a-zA-Z-. äöüÄÖÜßé]+<", 
                						  translations)
                # Iterating backwards so that pop(i) does not shift the
                # indices that are still to be visited
                for i in range(len(translations)-1, -1, -1):
                    if translations[i] == "> <" or \
                       translations[i] == ">Englisch<":
                        translations.pop(i)  # Popping redundant matches...
                    else:
                    	# ...or just stripping the enclosing ">" and "<"
                        translations[i] = translations[i].replace(">", 
                        					"").replace("<", "") 
            else:
                translations = []  # Default
            tuples.append((e[0].decode('utf-8'), translations))
        except Exception, ex:
            # Any failure skips the entry; it is only reported on request
            if printErrors:
                print str(ex) 
            fl()
Exemplo n.º 12
0
def extractFalseFriends(lines):
    """Extracts false friends from the lines of a .txt file.

    Consecutive lines are collected (newlines and tabs removed) until four
    of them are gathered; these four form one tuple. A line consisting only
    of a newline discards whatever has been collected so far.

    Returns:
        A list of 4-tuples of strings.
    """
    wil("Extracting False Friends...")
    tuples = []
    fields = []  # Parts of the entry currently being collected

    for raw in lines:
        if raw == "\n":
            # Blank line: drop the incomplete entry and start over
            fields = []
            continue
        fields.append(raw.replace("\n", "").replace("\t", ""))
        if len(fields) == 4:
            # Entry is complete
            tuples.append(tuple(fields))
            fields = []
    fl()
    wil("Extracting False Friends...Complete!\n")
    return tuples
Exemplo n.º 13
0
            wikilink = ""
            
            if "(Seite nicht vorhanden)" not in str(e):
            	# Extracting the wikilink
                wikilink = re.findall('=".+"\s', e)[0].replace('="', 
                				        "").replace('" ', "") 
                wikilink = "http://de.wiktionary.org" + wikilink
            tuples.append((anglicism, wikilink))
        except Exception, ex:
            errors.append((str(e), str(ex)))
            continue
        finally:
            fl()
                    
    if printErrors == True:
        wil("The following errors occured:", 150, "\n")
        for error in errors:
            print "Error at entry: %s - %s" %(error[0], error[1])

    wil("Creating tuples of anglicisms and their wikilinks...Complete!", 
            30, "\n")
    return tuples

def lookUpTranslations(list, printErrors=True):
    """Looks up the English translation of an anglicism."""
    # Array for tuples with format (anglzism, [translation 1, translation2])
    tuples = [] 
    
    for e in list:
        percentage = list.index(e)*1.0/len(list)*100
        wil("Looking up translations for %s - %.2f%% complete" 
Exemplo n.º 14
0
            german_parts = extractParts(_entries[0])
            english_parts = extractParts(_entries[1])
            germanEntry = (german_parts[0], german_parts[1], german_parts[2], 
                           german_parts[3], german_parts[4])
            englishEntry = (english_parts[0], english_parts[1], 
                            english_parts[2], english_parts[3], 
                            english_parts[4])
            tuples.append((germanEntry, englishEntry, _entries[2].
                           replace("\n", "")))

        except Exception, e:
            errors.append("%s with line %s" %(e, lines[i]))
            continue
        finally:
            fl()
    wil("Extracting dictionary entries...Complete!", 90, "\n")
    if printErrors:
        w("The following errors occurred:\n")
        for error in errors:
            w(error)
    return tuples

def extractParts(dict_string):
    """Extracts the different parts of an entry."""
    entry_array = []  # Main entry
    specification = ""  # Specification, e.g. numerus or gender
    additional_entry_array = []  # Additional Entry
    abbr = ""  # Abbreviation
    comments = []  # Array of comments
    entry_end = False  # To determine whether the main entry ended already
    # Splitting with whitspaces; connecting parts in brackets
Exemplo n.º 15
0
    """Enhanced function for writing a tuple file."""
    file = codecs.open(output_file, "w", "utf-8")
    for i in xrange(len(tuples)):
        percentage = i*1.0/len(tuples)*100
        wil("Writing file %s - %.2f%% complete" 
            %(output_file, percentage), 50)
        try:
            file.write(unicode(tuples[i][0]) + separation_character +
                       unicode(tuples[i][1]) + "\n")
        except Exception, ex:
            if printErrors:
                w("%s: %s" %(str(ex), str(tuples[i])))
            continue
        finally:
            fl()
    wil("Writing file %s...Complete!" %(output_file), 50, "\n")
    file.close()

def dumpObject(self, obj, name=None):
        """Takes an object as an argument and dumps its content on disk using
        specified "name" as its file name. If no file name is specified,
        that object's __repr__ will be used instead.
        """
        try:
            with open(name, "wb") as output_file:
                pickle.dump(obj, output_file, -1)
            print "Successfully dumped " + obj + " into " + name + "."
        except IOError:
            with open(repr(obj), "wb") as output_file:
                pickle.dump(obj, output_file, -1)
            print "Successfully dumped " + obj + \