def convertNDDSToCDS(options):
    """Estimate the average nearest-neighbour distance of the data set
    under a frequency-weighted ("geh") distance.

    Despite its name, this variant does not write any CDS files (those
    lines are commented out below); it only prints the average, over the
    first 1000 data vectors, of each vector's minimum geh distance to any
    other vector in that sample.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP)
    # cdsDataFileName = utils.getCDSDataFileName(options)
    # cdsQueryFileName= utils.getCDSQueryFileName(options)
    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print len(datas), len(querys), len(vps)
    # d[j][v] starts out as the count of value v in dimension j ...
    d = [{} for i in xrange(dim)]
    for i in xrange(len(datas)):
        for j in xrange(dim):
            if datas[i][j] in d[j]:
                d[j][datas[i][j]] += 1
            else:
                d[j][datas[i][j]] = 1
    # ... and is then converted to 1 - relative frequency, so rare values
    # score close to 1 and common values close to 0.
    for i in xrange(dim):
        for key in d[i]:
            d[i][key] = 1.0 - float(d[i][key]) / float(len(datas))

    def geh(a, b):
        # Frequency-weighted distance: a mismatching position adds 1.0, a
        # matching position adds (1 - frequency) / dim.
        # NOTE(review): matching positions are divided by dim here AND by
        # the final "/ float(dim)" below, i.e. they effectively carry
        # weight 1/dim**2 -- confirm the double division is intentional.
        ret = 0.0
        for i in xrange(len(a)):
            if a[i] != b[i]:
                ret += 1.0
            else:
                ret += d[i][a[i]] / float(dim)
        return ret / float(dim)

    # Average the nearest-neighbour geh distance over a 1000-vector sample.
    datas = datas[:1000]
    average = 0.0
    for i in xrange(len(datas)):
        cur = 98765432  # sentinel, larger than any possible geh value
        for j in xrange(len(datas)):
            if i != j:
                cur = min(cur, geh(datas[i], datas[j]))
        average += cur
    average /= len(datas)
    print average
def fetch_words_by_subjects():
    """Download the word list of every subject and cache each response
    to data/subject-<id>.json."""
    catalogue = readDataFromFile('data/subjects.json')
    for subject in catalogue['Subjects']:
        endpoint = "https://minder.vn/api/words/words?id_subject={0}".format(subject['id'])
        with urllib.request.urlopen(endpoint) as response:
            payload = json.loads(response.read().decode())
            writeDataToFile('data/subject-{0}.json'.format(subject['id']), payload)
def calculateAllPairDistance(options): numberOfData = options['numberOfData'] dim = options['numberOfDimension'] numberOfVP = options['numberOfVP'] cardinality = options['numberOfAlphabet'] typeOfVP = options['typeOfVP'] vps = utils.readDataFromFile(utils.getVPFileName(options)) for i in xrange(len(vps)): for j in xrange(i+1,len(vps)): print i,j,utils.hammingDistance(vps[i],vps[j])
def convertNDDSToCDS(options): size = options['numberOfData'] dim = options['numberOfDimension'] distribution = options['distribution'] cardinality = options['numberOfAlphabet'] numberOfVP = options['numberOfVP'] typeOfVP = options['typeOfVP'] dataFileName = 'data/data_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality) queryFileName = 'query/query_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality) vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP) cdsDataFileName = utils.getCDSDataFileName(options) cdsQueryFileName = utils.getCDSQueryFileName(options) # cdsDataFileName = 'cds_data/data_%d_%d_%s_%d_%s.txt'%(size,numberOfVP,distribution,cardinality,typeOfVP) # cdsQueryFileName = 'cds_query/query_%d_%d_%s_%d_%s.txt'%(size,numberOfVP,distribution,cardinality,typeOfVP) datas = utils.getDataInFile(dataFileName) querys = utils.readDataFromFile(queryFileName) vps = utils.readDataFromFile(vpFileName) print len(datas), len(querys), len(vps) cdsDatas = [] for i in xrange(len(datas)): t = [] for j in xrange(len(vps)): t.append(utils.hammingDistance(datas[i], vps[j])) cdsDatas.append(t) utils.writeDataToFile(cdsDataFileName, cdsDatas) cdsQuerys = [] for i in xrange(len(querys)): t = [] for j in xrange(len(vps)): t.append(utils.hammingDistance(querys[i], vps[j])) cdsQuerys.append(t) utils.writeDataToFile(cdsQueryFileName, cdsQuerys) print cdsDataFileName, cdsQueryFileName
def convertNDDSToCDS(options):
    """Embed NDDS data and query vectors into CDS space: each vector is
    mapped to its Hamming distances to all vantage points, and the results
    are written to the CDS data/query files.

    NOTE(review): this is an exact duplicate (modulo whitespace) of another
    convertNDDSToCDS definition in this file -- consider removing one.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']
    dataFileName = 'data/data_%d_%d_%s_%d.txt'%(size,dim,distribution,cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt'%(size,dim,distribution,cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt'%(dim,numberOfVP,cardinality,typeOfVP)
    cdsDataFileName = utils.getCDSDataFileName(options)
    cdsQueryFileName= utils.getCDSQueryFileName(options)
    # cdsDataFileName = 'cds_data/data_%d_%d_%s_%d_%s.txt'%(size,numberOfVP,distribution,cardinality,typeOfVP)
    # cdsQueryFileName = 'cds_query/query_%d_%d_%s_%d_%s.txt'%(size,numberOfVP,distribution,cardinality,typeOfVP)
    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print len(datas),len(querys),len(vps)
    # Distance of every data vector to every vantage point.
    cdsDatas = []
    for i in xrange(len(datas)):
        t = []
        for j in xrange(len(vps)):
            t.append(utils.hammingDistance(datas[i],vps[j]))
        cdsDatas.append(t)
    utils.writeDataToFile(cdsDataFileName,cdsDatas)
    # Same embedding for the query vectors.
    cdsQuerys = []
    for i in xrange(len(querys)):
        t = []
        for j in xrange(len(vps)):
            t.append(utils.hammingDistance(querys[i],vps[j]))
        cdsQuerys.append(t)
    utils.writeDataToFile(cdsQueryFileName,cdsQuerys)
    print cdsDataFileName, cdsQueryFileName
def parse_words():
    """For every subject, load its cached word file, keep only the
    id/word/phonetic/mean fields of each word, and push the result to a
    spreadsheet worksheet named after the subject."""
    wanted = {'id', 'word', 'phonetic', 'mean'}
    catalogue = readDataFromFile('data/subjects.json')
    for sub in catalogue['Subjects']:
        entry = {
            'id': sub['id'],
            'name': sub['name'],
            'total': sub['total'],
            'words': []
        }
        cached = readDataFromFile('data/subject-{}.json'.format(sub['id']))
        for word in cached['Words']:
            # Keep only the intersection of the word's keys and `wanted`.
            entry['words'].append({key: word[key] for key in word.keys() & wanted})
        worksheet = spreadsheet.create_worksheet(entry['name'])
        spreadsheet.update_cells(worksheet, entry)
def getSplitData(fromDate, toDate, type):
    """Split raw draw records into five per-position streams.

    Each record is "date time number"; stream i collects
    "number[i] date time" for every record.
    Returns a list of five such lists.
    """
    records = utils.readDataFromFile(type, fromDate, toDate)
    buckets = [[] for _ in range(5)]
    for record in records:
        fields = record.split(" ")
        number = fields[2]
        date = fields[0]
        time = fields[1]
        for pos, bucket in enumerate(buckets):
            bucket.append(" ".join([number[pos], date, time]))
    return buckets
def getSplitData(fromDate, toDate, type):
    """Partition draw records ("date time number") into five lists, one
    per digit position: list i holds "number[i] date time" per record.

    NOTE(review): duplicates another getSplitData definition in this file.
    """
    raw = utils.readDataFromFile(type, fromDate, toDate)
    columns = []
    for _ in range(5):
        columns.append([])
    for entry in raw:
        parts = entry.split(" ")
        digits = parts[2]
        stamp_date = parts[0]
        stamp_time = parts[1]
        for idx in range(5):
            columns[idx].append(" ".join([digits[idx], stamp_date, stamp_time]))
    return columns
def calculateAllPairDistance(options): numberOfData = options['numberOfData'] dim = options['numberOfDimension'] numberOfVP = options['numberOfVP'] cardinality = options['numberOfAlphabet'] typeOfVP = options['typeOfVP'] vps = utils.readDataFromFile(utils.getVPFileName(options)) d = [0 for i in xrange(dim + 1)] s = set() for i in xrange(len(vps)): for j in xrange(i + 1, len(vps)): dist = utils.hammingDistance(vps[i], vps[j]) d[dist] = d[dist] + 1 s.add(dist) print i, j, dist for i in xrange(0, dim + 1): print i, d[i] print len(s)
utils.createDirectory('ndt_data') utils.createDirectory('ndt_query') dictionary = makeDictionaryKeyIsAlphabet() dataFileNames = glob.glob('data/*.txt') for dataFileName in dataFileNames: print dataFileName onlyFileName = dataFileName.split('.')[0].split('/')[1] size = onlyFileName.split('_')[1] dim = onlyFileName.split('_')[2] vptype = onlyFileName.split('_')[3] cardinality = onlyFileName.split('_')[4] queryFileName = 'query/query_%s_%s_%s_%s.txt'%(size,dim,vptype,cardinality) datas = utils.getDataInFile(dataFileName) querys = utils.readDataFromFile(queryFileName) ndtDataFileName = 'ndt_data/data_%s_%s_%s_%s.txt'%(size,dim,vptype,cardinality) ndtQueryFileName = 'ndt_query/query_%s_%s_%s_%s.txt'%(size,dim,vptype,cardinality) if os.path.exists(ndtDataFileName): print '%s is exists'%ndtDataFileName else : with open(ndtDataFileName,'w') as fp: for i in xrange(len(datas)): for j in xrange(len(datas[i])): fp.write(str(dictionary[datas[i][j]])+' ') fp.write('\n') if os.path.exists(ndtQueryFileName): print '%s is exsts'%ndtQueryFileName else :
def convertNDDSToCDS(options):
    """Embed NDDS data and query vectors into CDS space using a
    frequency-weighted ("geh") distance to the vantage points, and write
    the embedded sets to the *geh-suffixed CDS data/query files.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % (size, dim, distribution, cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP)
    # cdsDataFileName = utils.getCDSDataFileName(options)
    # cdsQueryFileName= utils.getCDSQueryFileName(options)
    cdsDataFileName = 'cds_data/data_%d_%d_%d_%s_%d_%sgeh.txt' % (
        size, dim, numberOfVP, distribution, cardinality, typeOfVP)
    cdsQueryFileName = 'cds_query/query_%d_%d_%d_%s_%d_%sgeh.txt' % (
        size, dim, numberOfVP, distribution, cardinality, typeOfVP)
    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print len(datas), len(querys), len(vps)
    # d[j][v] starts out as the count of value v in dimension j ...
    d = [{} for i in xrange(dim)]
    for i in xrange(len(datas)):
        for j in xrange(dim):
            if datas[i][j] in d[j]:
                d[j][datas[i][j]] += 1
            else:
                d[j][datas[i][j]] = 1
    # ... and is then converted to 1 - relative frequency, so rare values
    # score close to 1 and common values close to 0.
    for i in xrange(dim):
        for key in d[i]:
            d[i][key] = 1.0 - float(d[i][key]) / float(len(datas))

    def geh(a, b):
        # Frequency-weighted distance: a mismatching position adds 1.0, a
        # matching position adds (1 - frequency) / dim.
        # NOTE(review): matching positions are divided by dim here AND by
        # the final "/ float(dim)" below (weight 1/dim**2) -- confirm the
        # double division is intentional.
        ret = 0.0
        for i in xrange(len(a)):
            if a[i] != b[i]:
                ret += 1.0
            else:
                ret += d[i][a[i]] / float(dim)
        return ret / float(dim)

    # geh distance of every data vector to every vantage point.
    cdsDatas = []
    for i in xrange(len(datas)):
        t = []
        for j in xrange(len(vps)):
            t.append(geh(datas[i], vps[j]))
        cdsDatas.append(t)
    utils.writeDataToFile(cdsDataFileName, cdsDatas)
    # Same embedding for the query vectors.
    cdsQuerys = []
    for i in xrange(len(querys)):
        t = []
        for j in xrange(len(vps)):
            t.append(geh(querys[i], vps[j]))
        cdsQuerys.append(t)
    utils.writeDataToFile(cdsQueryFileName, cdsQuerys)
    print cdsDataFileName, cdsQueryFileName
#!/usr/bin/python #-*- coding:utf-8 -*- import utils import os import string if __name__ == '__main__': utils.createDirectory('figure') options = utils.getOptions() dataFileName = utils.getDataFileName(options) vpFileName = utils.getVPFileName(options) datas = utils.getDataInFile(dataFileName) vps = utils.readDataFromFile(vpFileName) curDatas = datas for i in xrange(len(vps)): print i n = len(curDatas) x = [[] for j in xrange(len(vps[i]) + 1)] for j in xrange(n): nextPosition = utils.hammingDistance(vps[i], curDatas[j]) x[nextPosition].append(j) mx, position = (0, 0) xp = [] yp = [] for j in xrange(len(vps[i]) + 1): xp.append(j) yp.append(len(x[j])) if mx < len(x[j]): mx, position = (len(x[j]), j)
#!/usr/bin/python #-*- coding:utf-8 -*- import utils import os import string import numpy as np if __name__ == '__main__': utils.createDirectory('figure_pair') options = utils.getOptions() dim = options['numberOfDimension'] dataFileName = utils.getDataFileName(options) vpFileName = utils.getVPFileName(options) datas = utils.getDataInFile(dataFileName) vps = utils.readDataFromFile(vpFileName) for i in xrange(len(vps)): for j in xrange(i+1,len(vps)): cc = utils.calculateCorrelationCoefficient(vps[i],vps[j],datas) cc = abs(cc) imageFileName = utils.getFigurePairName(options,i,j,cc) print imageFileName if os.path.exists(imageFileName): print '%s is exists'%imageFileName continue xp = [] yp = [] zp = [] zcnt = [ [ 0 for ii in xrange(dim+1) ] for jj in xrange(dim+1) ] for k in xrange(len(datas)):
from GA import GA
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import warnings
import math
from utils import readDataFromFile, modularity, printData

# Community encoding: nodes belonging to the same community carry the same
# value at their position in the representation vector.

# The network under analysis, read from a GML file.
net = readDataFromFile('polbooks.gml')

# Genetic-algorithm parameters: a population of 300 chromosomes evolved
# for 15 generations over the network above.
gaParam = {"popSize": 300, "noGen": 15, "network": net}
problParam = {'function': modularity, 'retea': net}


def afisare_graf(network):
    """Draw the network with a spring layout and show it.

    network["matrix"] is used as a square adjacency matrix (list of
    lists) -- TODO confirm against readDataFromFile's output.
    """
    warnings.simplefilter('ignore')
    # Fix: np.matrix is discouraged and nx.from_numpy_matrix was removed
    # in NetworkX 3.0; use a plain ndarray with from_numpy_array instead
    # (identical graph for a 2-D adjacency matrix).
    A = np.asarray(network["matrix"])
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G)  # compute graph layout
    plt.figure(figsize=(8, 8))  # image is 8 x 8 inches
    nx.draw_networkx_nodes(G, pos, node_size=600, cmap=plt.cm.RdYlBu)
    nx.draw_networkx_edges(G, pos, alpha=0.3)
    plt.show()