Example No. 1
def fileToHits(filename):
    '''
    opens an output file from run() and returns it as a list of dicts
    Move to common.py
    '''
    lines = common.openFile(filename)
    wikiHits = []
    for l in lines:
        parts = l.split('|')
        unit = {}
        for p in parts:
            if p:
                key = p.split(':')[0]
                # cannot use a basic split since the parameter may contain additional colons
                value = p[len(key) + 1:]
                if key.endswith(u'_link') and value:
                    if ';' in value:
                        value = value.split(';')
                    else:
                        value = [value]
                unit[key] = value
        wikiHits.append(unit.copy())
    return wikiHits
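
A hedged usage sketch for fileToHits(): the file name, keys, and values below are hypothetical, and it assumes common.openFile() returns the file's lines with newlines stripped.

import codecs

# Write one sample line in the assumed key:value|key:value format
# ('hits-sample.txt' and the keys are made up for illustration).
with codecs.open('hits-sample.txt', 'w', 'utf8') as f:
    f.write(u'title:Some page|wiki_link:Foo;Bar|note:value with: a colon\n')

hits = fileToHits('hits-sample.txt')
print hits[0]['wiki_link']  # [u'Foo', u'Bar'] since '_link' values are split on ';'
print hits[0]['note']       # u'value with: a colon', the extra colon is kept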
Example No. 2
def openCSV(filename, prefix=''):
    '''
    prefix is needed in case idno doesn't already include the source
    This is not live (outputs to a local file)
    !!!BROKEN!!! needs to deal with added info, i.e. it will likely receive a dict
    '''
    outName = u'%s-artists.sql' % filename.replace(u'-tmp', '')[:-4]
    f = codecs.open(outName, 'w', 'utf8')
    lines = common.openFile(filename)
    for l in lines:
        cols = l.split('|')
        ids = cols[1].split(';')
        for i in range(0, len(ids)):
            ids[i] = u'%s%s' % (prefix, ids[i])
        if len(cols[2].split(';')) == 2:
            fName, lName = cols[2].split(';')
        elif len(cols[2].split(';')) == 1:
            lName = cols[2]
            fName = ''
        else:
            print u'Aborting: name column had too many parts for: %s' % l
            break
        wikidata = cols[3]
        comments = cols[4]
        inOdok = cols[5]
        if len(wikidata) > 0 and not inOdok:
            f.write(addArtist(fName, lName, wikidata, ids))
            f.write('\n')
        elif inOdok:
            f.write(addLink(inOdok, ids))
            f.write('\n')
            #add to artist_links
    f.close()
    print 'created %s' % outName
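
A hedged note on the column layout openCSV() expects, inferred only from the cols[] indices above; the sample line, file name, and prefix are hypothetical, and it assumes common.openFile() returns newline-stripped lines.

# Assumed pipe-separated layout (from the cols[] indices above):
#   cols[1]  id numbers, ';'-separated, each prefixed with `prefix`
#   cols[2]  name as 'firstName;lastName' (or just a last name)
#   cols[3]  wikidata id   cols[4]  comments   cols[5]  existing odok id
# so a hypothetical line such as
#   misc|123;456|Anna;Andersson|Q123456||
# would produce addArtist(u'Anna', u'Andersson', u'Q123456', [u'src:123', u'src:456'])
openCSV('artists-tmp.csv', prefix=u'src:')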
Example No. 3
def openCSV(filename, prefix=''):
    '''
    prefix is needed in case idno doesn't already include the source
    This is not live (outputs to a local file)
    !!!BROKEN!!! needs to deal with added info, i.e. it will likely receive a dict
    '''
    outName = u'%s-artists.sql' %filename.replace(u'-tmp','')[:-4]
    f = codecs.open(outName,'w','utf8')
    lines = common.openFile(filename)
    for l in lines:
        cols = l.split('|')
        ids = cols[1].split(';')
        for i in range(0,len(ids)): ids[i] = u'%s%s' %(prefix, ids[i])
        if len(cols[2].split(';'))==2:
            fName, lName = cols[2].split(';')
        elif len(cols[2].split(';'))==1:
            lName = cols[2]
            fName = ''
        else:
            print u'Aborting: name column had too many parts for: %s' %l
            break
        wikidata = cols[3]
        comments = cols[4]
        inOdok = cols[5]
        if len(wikidata)>0 and not inOdok:
            f.write(addArtist(fName, lName, wikidata, ids))
            f.write('\n')
        elif inOdok:
            f.write(addLink(inOdok, ids))
            f.write('\n')
            #add to artist_links
    f.close()
    print 'created %s'%outName
Example No. 4
def file_to_dict(filename, idcol=0, namecol=1, verbose=False):
    '''
    reads in a file and parses it into a dict where each row is in turn a dict
    lines starting with # are treated as comments. Semicolons in the namecol are treated as separate names.
    Square brackets are treated as the real value (i.e. the rest is ignored)
    listcols = isinstance(namecol, list)
    if listcols and len(namecol) != 2:
        print u'namecol must be a single integer or two integers'
    lines = common.openFile(filename)
    dDict = {}
    for l in lines:
        if len(l) == 0 or l.startswith(u'#'):
            continue
        col = l.split('|')
        idno = col[idcol]
        nameparts = {}
        # names can be constructed by two columns (first name, last name)
        if listcols:
            namesF = common.extractName(col[namecol[0]])
            namesL = common.extractName(col[namecol[1]])
            names = []
            for i in range(0, len(namesF)):
                name = u'%s %s' % (namesF[i], namesL[i])
                names.append(name.strip())
                nameparts[name.strip()] = u'%s;%s' % (namesF[i], namesL[i])
        else:
            names = common.extractName(col[namecol])
            #trying to identify the name parts
            for name in names:
                nameparts[name] = common.extractNameParts(name)
            #trying to identify the name parts. Define last name as last word
        for name in names:
            if name in dDict.keys():
                dDict[name][0].append(idno)
            else:
                npart = ''
                if name in nameparts.keys():
                    npart = nameparts[name]
                dDict[name] = ([idno], npart)
    if verbose:
        print 'read %s: from %r lines identified %r items.' % (
            filename, len(lines), len(dDict))
    return dDict
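
A hedged usage sketch: 'artists.csv' and the sample line are hypothetical, and it assumes common.extractName() returns a list of names and common.extractNameParts() returns a 'firstName;lastName' string.

# A hypothetical pipe-separated line, read with idcol=0 and namecol=1:
#   123|Anna Andersson|some other column
# would, under the assumptions above, produce an entry roughly like
#   {u'Anna Andersson': ([u'123'], u'Anna;Andersson')}
d = file_to_dict('artists.csv', idcol=0, namecol=1, verbose=True)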
Example No. 5
def file_to_dict(filename, idcol=0, namecol=1, verbose=False):
    '''
    reads in a file and parses it into a dict where each row is in turn a dict
    lines starting with # are treated as comments. Semicolons in the
    namecol are treated as separate names.
    Square brackets are treated as the real value (i.e. the rest is ignored)
    listcols = isinstance(namecol, list)
    if listcols and len(namecol) != 2:
        print u'namecol must be a single integer or two integers'
    lines = common.openFile(filename)
    dDict = {}
    for l in lines:
        if len(l) == 0 or l.startswith(u'#'):
            continue
        col = l.split('|')
        idno = col[idcol]
        nameparts = {}
        # names can be constructed by two columns (first name, last name)
        if listcols:
            namesF = common.extractName(col[namecol[0]])
            namesL = common.extractName(col[namecol[1]])
            names = []
            for i in range(0, len(namesF)):
                name = u'%s %s' % (namesF[i], namesL[i])
                names.append(name.strip())
                nameparts[name.strip()] = u'%s;%s' % (namesF[i], namesL[i])
        else:
            names = common.extractName(col[namecol])
            # trying to identify the name parts
            for name in names:
                nameparts[name] = common.extractNameParts(name)
            # trying to identify the name parts. Define last name as last word
        for name in names:
            if name in dDict.keys():
                dDict[name][0].append(idno)
            else:
                npart = ''
                if name in nameparts.keys():
                    npart = nameparts[name]
                dDict[name] = ([idno], npart)
    if verbose:
        print 'read %s: from %r lines identified %r items.' % (filename, len(lines), len(dDict))
    return dDict
Example No. 6
def fileToHits(filename):
    '''
    opens an output file from run() and returns it as a list of dicts
    Move to common.py
    '''
    lines = common.openFile(filename)
    wikiHits = []
    for l in lines:
        parts = l.split('|')
        unit = {}
        for p in parts:
            if p:
                key = p.split(':')[0]
                value = p[len(key)+1:]  # cannot use a basic split since the parameter may contain additional colons
                if key.endswith(u'_link') and value:
                    if ';' in value:
                        value = value.split(';')
                    else:
                        value = [value, ]
                unit[key] = value
        wikiHits.append(unit.copy())
    return wikiHits
Example No. 7
# assumed function header (missing from the snippet); checkWord() is called below
def checkWord(word):
    flag = False
    for tav in word:
        if not filterAlfaBeta(tav):
            flag = True
    if flag and not filterOtherTok(word):
        return False
    return True


# For all text train files
# For each sentence, check whether it can be used; if not, skip to the next sentence.
dirs = os.listdir(".")
for dirname in dirs:
    if '_train' in dirname and not os.path.isfile(pathForSaveFileCorpus +
                                                  dirname[:11] + ".train"):
        tokens = common.openFile(dirname)
        tokens_sens = common.partDataToSentences(tokens)

        good_sens = []
        for index, sen in enumerate(tokens_sens):
            flag = False
            for word in sen:
                if not checkWord(word):
                    flag = True
                    break
            if not flag:
                good_sens.append(sen)

        # Save to new train corpus file
        with open(pathForSaveFileCorpus + dirname[:11] + ".train",
                  "w") as text_file:
Example No. 8
import os
import common
import nltk
import numpy as np
import time

pathForTrainFiles = "./train/"
lib = os.listdir(pathForTrainFiles)
n = 2

bigram = []

for i, trainFile in enumerate(lib):
    start = time.time()
    print i

    array = common.openFile(pathForTrainFiles + trainFile)
    array = np.array(array)
    array = np.split(array, np.where(array == '\n')[0])
    array = [np.delete(arr, np.where(arr == '\n')[0]) for arr in array]
    print "part1", time.time() - start

    start = time.time()
    bigram.append([])
    for sen in array:
        ngram = nltk.ngrams(sen,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol='<s>',
                            right_pad_symbol='</s>')
        for gram in ngram:
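            # Assumed continuation (the snippet ends here): collect the n-grams
            # for the current training file.
            bigram[i].append(gram)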
Example No. 9
# coding: utf-8

import numpy as np
from itertools import permutations
import common
from random import randint

# const files
pathTrain = "./train/50000-59999.train"
pathVocabulary = "./vocabulary.lex"

# Sentence train files
dataTrain = common.openFile(pathTrain)
sensTrain = common.partDataToSentences(dataTrain)

# Vocabulary used to check whether new words are proper words
vocabulary = set(common.openFile(pathVocabulary))

# For each sentence:
# start at a random place in the sentence and search, from that random start
# word onwards, for a word whose letters can be reordered into another word.
# If one exists, swap it in and move to the next sentence; otherwise keep looking.
indexsForModifiedSentences = []
newSentences = []
for indexSourceSentence, sen in enumerate(sensTrain):
    newSen = sen[:]
    randomIndex = randint(0, len(sen))
    for i in range(randomIndex, len(sen)):
        tok = sen[i]
        if len(tok) > 3 and len(tok) < 12 and not common.filterOtherTok(tok):
            perms = np.unique([''.join(p) for p in permutations(tok)])
Example No. 10
# coding: utf-8

import os
import common
import nltk

path = "./train/"

# Collect all tokens from the corpus files
tokens = []

for i, trainFile in enumerate(os.listdir(path)):
    tokens.append(common.openFile(path + trainFile))

# Count the occurrences of each token in every file
for index, array in enumerate(tokens):
    tokens[index] = nltk.FreqDist(array)

# Merge the per-file token counts into a single frequency distribution.
voc = nltk.FreqDist([])
for index, array in enumerate(tokens):
    print index
    voc += array
    tokens[index] = []

# Save the vocabulary and the count of each token to a new file.
with open("vocabulary.lex", "w") as text_file:
    for key in sorted(voc.keys()):
        text_file.write(key + '\t' + str(voc[key]) + '\n')
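
A hedged read-back sketch for the tab-separated vocabulary.lex written above (the variable names are illustrative):

# Read vocabulary.lex back into a dict mapping each token to its count;
# every line is 'token<TAB>count', matching the write format above.
counts = {}
with open("vocabulary.lex") as text_file:
    for line in text_file:
        token, count = line.rstrip('\n').split('\t')
        counts[token] = int(count)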