def fileToHits(filename):
    '''
    Opens an output file from run() and returns it as a list of dicts.
    Move to common.py
    '''
    lines = common.openFile(filename)
    wikiHits = []
    for l in lines:
        parts = l.split('|')
        unit = {}
        for p in parts:
            if p:
                key = p.split(':')[0]
                # cannot use a basic split since the parameter may contain additional colons
                value = p[len(key) + 1:]
                if key.endswith(u'_link') and value:
                    if ';' in value:
                        value = value.split(';')
                    else:
                        value = [value, ]
                unit[key] = value
        wikiHits.append(unit.copy())
    return wikiHits
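# A minimal usage sketch for fileToHits. The file name and keys below are made
# up for illustration; only the pipe/colon layout and the *_link splitting come
# from the function above.
if __name__ == '__main__':
    hits = fileToHits(u'some-run-output.csv')  # hypothetical file name
    for hit in hits:
        # a line like u'title:Some monument|artist_link:Q123;Q456' becomes
        # {u'title': u'Some monument', u'artist_link': [u'Q123', u'Q456']}
        print hit.get(u'title'), hit.get(u'artist_link')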
def openCSV(filename, prefix=''):
    '''
    prefix is needed in case the idnos don't already include the source.
    This is not live (outputs to a local file).
    !!!BROKEN!!! needs to deal with added info, i.e. will likely receive a dict
    '''
    outName = u'%s-artists.sql' % filename.replace(u'-tmp', '')[:-4]
    f = codecs.open(outName, 'w', 'utf8')
    lines = common.openFile(filename)
    for l in lines:
        cols = l.split('|')
        ids = cols[1].split(';')
        for i in range(0, len(ids)):
            ids[i] = u'%s%s' % (prefix, ids[i])
        if len(cols[2].split(';')) == 2:
            fName, lName = cols[2].split(';')
        elif len(cols[2].split(';')) == 1:
            lName = cols[2]
            fName = ''
        else:
            print u'Aborting: name column had too many parts for: %s' % l
            break
        wikidata = cols[3]
        comments = cols[4]
        inOdok = cols[5]
        if len(wikidata) > 0 and not inOdok:
            f.write(addArtist(fName, lName, wikidata, ids))
            f.write('\n')
        elif inOdok:
            f.write(addLink(inOdok, ids))  # add to artist_links
            f.write('\n')
    f.close()
    print 'created %s' % outName
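# A sketch of the pipe-separated line layout that openCSV expects, inferred from
# the column indices above; the sample values and the call below are made up
# for illustration.
#
#   <col0>|id1;id2|FirstName;LastName|Q123456|some comment|<odok id or empty>
#
# openCSV(u'artists-tmp.csv', prefix=u'source/')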
def file_to_dict(filename, idcol=0, namecol=1, verbose=False):
    '''
    Reads in a file and parses it into a dict where each row is in turn a dict.
    Lines starting with # are treated as comments.
    Semicolons in the namecol are treated as separate names.
    Square brackets are treated as the real value (i.e. the rest is ignored).
    '''
    listcols = isinstance(namecol, list)
    if listcols and len(namecol) != 2:
        print u'namecol must be a single integer or two integers'
    lines = common.openFile(filename)
    dDict = {}
    for l in lines:
        if len(l) == 0 or l.startswith(u'#'):
            continue
        col = l.split('|')
        idno = col[idcol]
        nameparts = {}
        # names can be constructed from two columns (first name, last name)
        if listcols:
            namesF = common.extractName(col[namecol[0]])
            namesL = common.extractName(col[namecol[1]])
            names = []
            for i in range(0, len(namesF)):
                name = u'%s %s' % (namesF[i], namesL[i])
                names.append(name.strip())
                nameparts[name.strip()] = u'%s;%s' % (namesF[i], namesL[i])
        else:
            names = common.extractName(col[namecol])
            # trying to identify the name parts
            for name in names:
                nameparts[name] = common.extractNameParts(name)
        # trying to identify the name parts; define the last name as the last word
        for name in names:
            if name in dDict.keys():
                dDict[name][0].append(idno)
            else:
                npart = ''
                if name in nameparts.keys():
                    npart = nameparts[name]
                dDict[name] = ([idno, ], npart)
    if verbose:
        print 'read %s: from %r lines identified %r items.' % (
            filename, len(lines), len(dDict))
    return dDict
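# A minimal usage sketch for file_to_dict. The file name and column numbers are
# made up for illustration; the pipe-separated layout and the
# ([idno, ...], nameparts) value shape come from the function above.
if __name__ == '__main__':
    # one name column, semicolon-separated names handled by common.extractName:
    byName = file_to_dict(u'some-artists.csv', idcol=0, namecol=1, verbose=True)
    # first name and last name in separate columns:
    byName2 = file_to_dict(u'some-artists.csv', idcol=0, namecol=[1, 2])
    # each value is a tuple: (list of idnos, u'First;Last' or extracted name parts)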
def checkWord(word):  # the original snippet starts inside this function; the signature is inferred from the call below
    # A word is usable if every character passes filterAlfaBeta,
    # or if the whole token is accepted by filterOtherTok.
    flag = False
    for tav in word:
        if not filterAlfaBeta(tav):
            flag = True
    if flag and not filterOtherTok(word):
        return False
    return True


# For all text train files:
# check every sentence; if a sentence cannot be used, skip to the next one.
dirs = os.listdir(".")
for dirname in dirs:
    if '_train' in dirname and not os.path.isfile(pathForSaveFileCorpus + dirname[:11] + ".train"):
        tokens = common.openFile(dirname)
        tokens_sens = common.partDataToSentences(tokens)
        good_sens = []
        for index, sen in enumerate(tokens_sens):
            flag = False
            for word in sen:
                if not checkWord(word):
                    flag = True
                    break
            if not flag:
                good_sens.append(sen)
        # Save to new train corpus file
        with open(pathForSaveFileCorpus + dirname[:11] + ".train", "w") as text_file:
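            # (continuation sketch -- the original snippet is cut off at the
            # `with` line above) One plausible way to write out the kept
            # sentences, assuming each sentence is a list of token strings and
            # that the .train format is one token per line with a blank line
            # between sentences (inferred from how the bigram script below
            # splits the file on '\n'); this is an assumption, not the original
            # code.
            for sen in good_sens:
                for word in sen:
                    text_file.write(word + "\n")
                text_file.write("\n")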
import os
import common
import nltk
import time
import numpy as np  # used below (np.array/np.split) but missing from the original imports

pathForTrainFiles = "./train/"
lib = os.listdir(pathForTrainFiles)
n = 2
bigram = []
for i, trainFile in enumerate(lib):
    start = time.time()
    print i
    # read the tokens of one train file, split them into sentences on the
    # '\n' markers, then drop the markers themselves
    array = common.openFile(pathForTrainFiles + trainFile)
    array = np.array(array)
    array = np.split(array, np.where(array == '\n')[0])
    array = [np.delete(arr, np.where(arr == '\n')[0]) for arr in array]
    print "part1", time.time() - start
    start = time.time()
    bigram.append([])
    for sen in array:
        ngram = nltk.ngrams(sen, n, pad_left=True, pad_right=True,
                            left_pad_symbol='<s>', right_pad_symbol='</s>')
        for gram in ngram:
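            # (continuation sketch -- the original snippet is cut off above)
            # One plausible way to finish the loop: keep each padded bigram for
            # this train file; any later counting step (e.g. with nltk.FreqDist)
            # is an assumption, not part of the original code.
            bigram[i].append(gram)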
# coding: utf-8
import numpy as np
from itertools import permutations
import common
from random import randint

# const files
pathTrain = "./train/50000-59999.train"
pathVocabulary = "./vocabulary.lex"

# Sentence train files
dataTrain = common.openFile(pathTrain)
sensTrain = common.partDataToSentences(dataTrain)

# Vocabulary used to check whether a reordered word is a valid word
vocabulary = set(common.openFile(pathVocabulary))

# For each sentence:
# start from a random position in the sentence and look for a word whose
# letters can be reordered into a different word.
# If such a word exists, swap it in and move on to the next sentence;
# otherwise keep looking.
indexsForModifiedSentences = []
newSentences = []
for indexSourceSentence, sen in enumerate(sensTrain):
    newSen = sen[:]
    randomIndex = randint(0, len(sen))
    for i in range(randomIndex, len(sen)):
        tok = sen[i]
        if len(tok) > 3 and len(tok) < 12 and not common.filterOtherTok(tok):
            perms = np.unique([''.join(p) for p in permutations(tok)])
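            # (continuation sketch -- the original snippet is cut off above)
            # A plausible way to finish the step described in the comments: pick
            # a permutation that differs from the token but appears in the
            # vocabulary, swap it into the copied sentence and record which
            # sentence was modified; this is an assumption, not the original
            # code.
            candidates = [p for p in perms if p != tok and p in vocabulary]
            if candidates:
                newSen[i] = candidates[0]
                newSentences.append(newSen)
                indexsForModifiedSentences.append(indexSourceSentence)
                break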
# coding: utf-8
import os
import common
import nltk

path = "./train/"

# Read all tokens from the corpus files
tokens = []
for i, trainFile in enumerate(os.listdir(path)):
    tokens.append(common.openFile(path + trainFile))

# Count the tokens of each file
for index, array in enumerate(tokens):
    tokens[index] = nltk.FreqDist(array)

# Merge the per-file token counts into a single distribution
voc = nltk.FreqDist([])
for index, array in enumerate(tokens):
    print index
    voc += array
    tokens[index] = []

# Save the vocabulary and the count of each token to a new file
with open("vocabulary.lex", "w") as text_file:
    for key in sorted(voc.keys()):
        text_file.write(key + '\t' + str(voc[key]) + '\n')