Пример #1
0
def getAnglicismsList(url):
    """Collect the anglicism entries listed on a wiktionary page.

    The page behind ``url`` is downloaded and parsed, the text of every
    ``<p>`` element is split on the dash separators used by the list, and
    the remaining fragments are returned as one flat list.
    """
    separators = [" - ", "- ", " -", " – ", "– "]

    # Download the page and keep only its paragraph elements.
    page = BS(urllib2.urlopen(url))
    paragraphs = page.find_all("p")
    wil("Extracting anglicisms from wictionary.", 30)

    # The capturing group makes re.split keep the separators in its result,
    # so they still count when the leading/trailing fragments are cut off.
    fragments = []
    for paragraph in paragraphs:
        fragments.extend(re.split("( - | – | -|- |– )", str(paragraph)))

    # Drop the first three fragments and the last one.
    fragments = fragments[3:-1]
    fl()
    wil("Extracting anglicisms from wictionary..")

    # Only now remove the separator matches themselves.
    entries = [piece for piece in fragments if piece not in separators]

    fl()
    wil("Extracting anglicisms from wictionary...Complete!", 30, "\n")
    return entries
Пример #2
0
def readFile(filename, ignore_character="##########", onestring=False):
    """Read a UTF-8 encoded text file line by line.

    Args:
        filename: Path of the file to read.
        ignore_character: Lines starting with this prefix are left out of
            the result.
        onestring: If true, return a single string in which every kept line
            is followed by one space, instead of a list of lines.

    Returns:
        The kept lines, exactly as read, as a list -- or their concatenation
        when ``onestring`` is set.
    """
    wil("Reading file %s" %(filename))

    lines = []
    # The context manager closes the handle even if reading fails midway.
    with codecs.open(filename, "r", "utf-8") as handle:
        fl()
        for count, line in enumerate(handle):
            wil("Reading File %s - %i lines so far" %(filename, count), 20)
            if not line.startswith(ignore_character):
                lines.append(line)
            fl()
    wil("Reading file %s...Complete!" %(filename), 30, "\n")

    if onestring:
        # Joined in one pass; the flag itself is no longer overwritten.
        return "".join(line + " " for line in lines)
    return lines
Пример #3
0
def readVectorFile(word_list, vectors_file, filter=True):
    """Read word vectors from a word2vec-style text file.

    Args:
        word_list: Words whose vectors should be kept when ``filter`` is
            true. Each word found in the file is removed from this list in
            place (first occurrence only).
        vectors_file: Path of the UTF-8 encoded vector file.
        filter: If true, keep only vectors of words in ``word_list``;
            otherwise keep every vector in the file.

    Returns:
        A dict mapping each kept word to its vector as a list of floats.
    """
    w("Reading VectorFile %s..." %(vectors_file))
    D = {}
    fl()
    with codecs.open(vectors_file, "r", "utf-8") as fin:
        for i, line in enumerate(fin):
            # Placeholders now match the two supplied arguments.
            wil("Reading VectorFile %s - %i lines so far"
                %(vectors_file, i+1), 20)
            vector = line.split()
            # The first two lines of a word2vec vector file can be ignored:
            # they only hold the vector count information and the </s>
            # vector (number of lines in the corpus). Blank lines are
            # skipped as well, since they carry no word.
            if i > 1 and vector:
                word = vector[0]
                if not filter:
                    D[word] = [float(x) for x in vector[1:]]
                elif word in word_list:
                    # Keep only vectors of words that are in the dictionary
                    # and tick the word off the caller's list.
                    D[word] = [float(x) for x in vector[1:]]
                    word_list.remove(word)
            fl()

    wil("Reading VectorFile %s...Complete!\n" %(vectors_file), 30)
    return D
Пример #4
0
def readTupleFile(input_file, separation_character="\t"):
    """Read a file of separated values into a list of tuples.

    Args:
        input_file: Path of the file; it is read through ``readFile``.
        separation_character: String that separates the fields of a line.

    Returns:
        One tuple of field strings per line, with newline characters
        removed before splitting.
    """
    lines = readFile(input_file)
    total = len(lines)
    tuples = []
    # enumerate gives the true position of each line; looking the line up
    # by value is quadratic and reports the wrong progress for duplicates.
    for position, line in enumerate(lines):
        percentage = position*100.0/total
        wil("Reading tuple file %s - %.2f%% complete"
            %(input_file, percentage), 30)
        parts = line.replace("\n", "").split(separation_character)
        tuples.append(tuple(parts))
        fl()
    wil("Reading tuple file %s...Complete!" %(input_file), 30, "\n")
    return tuples
Пример #5
0
def readTupleFileToDict(input_file, dicttype, separation_character="\t"):
    """Read a file of separated key/value pairs into a dictionary.

    Args:
        input_file: Path of the file; it is read through ``readFile``.
        dicttype: Sample value selecting the value type: an ``int``
            instance stores values as ints, a string instance stores keys
            and values as unicode. Any other value stores nothing.
        separation_character: String that separates key and value.

    Returns:
        A ``defaultdict`` without default factory. For a key that occurs
        several times, the first value is kept.
    """
    lines = readFile(input_file)
    dict_ = defaultdict()
    total = len(lines)
    for position, raw in enumerate(lines):
        percentage = position*100.0/total
        # Placeholders now match the two supplied arguments.
        wil("Reading tuple file %s and creating dictionary -%.2f%% complete"
            %(input_file, percentage), 30)
        parts = raw.replace("\n", "").split(separation_character)
        if isinstance(dicttype, int):
            dict_.setdefault(parts[0], int(parts[1]))
        elif isinstance(dicttype, basestring):
            dict_.setdefault(unicode(parts[0]), unicode(parts[1]))
        fl()
    wil("Reading tuple file %s and creating dictionary...Complete!\n"
        %(input_file), 30)
    return dict_
Пример #6
0
def randomSubset(array, n, output=True):
    """Draw ``n`` random elements from ``array``, with replacement.

    Args:
        array: A sequence, or a dict, which is sampled as ``(key, value)``
            tuples.
        n: Number of elements to draw.
        output: If true, report progress through ``w``/``wil``/``fl``.

    Returns:
        A list of ``n`` elements. Indices are drawn independently, so the
        same element may occur several times. Empty if ``n`` is not
        positive.

    Raises:
        ValueError: If ``array`` is empty while ``n`` is positive.
    """
    if output:
        w("Creating random subset...")
    if isinstance(array, dict):
        # Sample dictionaries as (key, value) pairs.
        array = list(array.items())
    if n > 0 and len(array) == 0:
        raise ValueError("cannot draw from an empty collection")
    res = []
    # '<' instead of '!=' so that a negative n cannot loop forever.
    while len(res) < n:
        if output:
            percentage = len(res)*1.0/n*100
            wil("Creating random subset - %.2f%% complete" %(percentage))
        ri = random.randint(0, len(array)-1)
        res.append(array[ri])
        if output:
            fl()
    if output:
        wil("Creating random subset...Complete!", 50, "\n")
    return res
Пример #7
0
def lookUpTranslations(list, printErrors=True):
    """Look up the English translations of anglicisms on wiktionary.

    Args:
        list: Sequence of ``(anglicism, wikilink)`` pairs. Entries whose
            wikilink is the empty string are skipped.
        printErrors: If true, print the message of any exception raised
            while processing an entry. The entry is skipped either way.

    Returns:
        A list of ``(anglicism, [translation 1, translation 2, ...])``
        tuples; the translation list is empty when the page shows no
        English translations.
    """
    english_item = r"/wiki/Englisch.+<\/li>"
    redundant = ("> <", ">Englisch<")
    tuples = []
    total = len(list)

    # enumerate gives the true position; looking the entry up by value is
    # quadratic and reports the wrong progress for duplicate entries.
    for position, e in enumerate(list):
        percentage = position*100.0/total
        wil("Looking up translations for %s - %.2f%% complete"
            %(e[0].replace("ä", "ae").replace("é", "e"), percentage), 20)
        if e[1] == "":  # If there is no wikilink
            fl()
            continue
        try:
            # Extracting the html-code of the wiktionary page
            r = urllib2.Request(e[1])
            html = BS(urllib2.urlopen(r))
            # Search the page once instead of once per representation.
            matches = re.findall(english_item, unicode(html))
            if matches:
                candidates = re.findall(">[0-9a-zA-Z-. äöüÄÖÜßé]+<",
                                        matches[0])
                # Drop redundant matches and strip the angle brackets.
                translations = [c.replace(">", "").replace("<", "")
                                for c in candidates if c not in redundant]
            else:
                translations = []  # Default
            tuples.append((e[0].decode('utf-8'), translations))
        except Exception as ex:
            # Best effort: a failing entry must not abort the whole run.
            if printErrors:
                print(str(ex))
        finally:
            # Close the progress line on success as well as on failure.
            fl()
    return tuples
Пример #8
0
def extractFalseFriends(lines):
    """Group the lines of a false-friends text file into 4-tuples.

    Four consecutive lines form one entry, with newline and tab characters
    stripped from each field. A line consisting of a bare newline discards
    the partially collected entry and starts a new one.
    """
    wil("Extracting False Friends...")
    fields_per_entry = 4
    tuples = []
    current = []

    for raw in lines:
        if raw == "\n":
            # Blank line: throw away whatever was collected so far.
            current = []
            continue
        current.append(raw.replace("\n", "").replace("\t", ""))
        if len(current) == fields_per_entry:
            # Entry complete: store it and start over.
            tuples.append(tuple(current))
            current = []
    fl()
    wil("Extracting False Friends...Complete!\n")
    return tuples
Пример #9
0
                continue
            # Extracting the anglicisms
            anglicism = anglicism[0].replace("<", "").replace(">", "")
            wikilink = ""
            
            if "(Seite nicht vorhanden)" not in str(e):
            	# Extracting the wikilink
                wikilink = re.findall('=".+"\s', e)[0].replace('="', 
                				        "").replace('" ', "") 
                wikilink = "http://de.wiktionary.org" + wikilink
            tuples.append((anglicism, wikilink))
        except Exception, ex:
            errors.append((str(e), str(ex)))
            continue
        finally:
            fl()
                    
    if printErrors == True:
        wil("The following errors occured:", 150, "\n")
        for error in errors:
            print "Error at entry: %s - %s" %(error[0], error[1])

    wil("Creating tuples of anglicisms and their wikilinks...Complete!", 
            30, "\n")
    return tuples

def lookUpTranslations(list, printErrors=True):
    """Looks up the English translation of an anglicism."""
    # Array for tuples with format (anglzism, [translation 1, translation2])
    tuples = []