Example #1
def norm_dist(d_name):
    global cur
    cur = sh.worksheet(d_name)
    # how many VDCs? column 3 holds one value per VDC plus two header rows
    no_vdc = len(cur.col_values(3)) - 2
    print 'len is: ' + str(no_vdc)
    print 'dist: ' + d_name

    # get all cell values of the worksheet (row 2 holds the column names)
    vals = cur.get_all_values()

    # pull the raw values for each tracked column, skipping the two header rows
    for col in cols_reg:
        col[1] = cur.col_values(vals[1].index(col[0]) + 1)[2:]
        print col[1]

    # normalize the values of each column
    for col in cols_reg:
        col[2] = normalize.norm(col[1])

    # locate each "<name>_normalized" output column, fill its cells and write them back
    for col in cols_reg:
        index = vals[1].index(col[0] + "_normalized")
        out_vals = cur.range(cols[index] + "3:" + cols[index] + str(no_vdc + 1))

        it = 0
        for cell in out_vals:
            if it < len(col[2]):
                cell.value = col[2][it]
                it += 1

        print out_vals
        # write this column back inside the loop; updating only after the loop
        # would push just the last column's normalized values
        cur.update_cells(out_vals)
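These examples come from different projects, so their `normalize` modules are not identical and none of them is shown on this page. As a point of reference only, a numeric min-max stand-in consistent with how `norm` is used in examples #1, #2, #3, #8 and #10 (lists or 2-D arrays of numbers in, values scaled to [0, 1] out) might look like the sketch below; it is an assumption, not any project's actual implementation.

import numpy as np

# Assumed stand-in for normalize.norm: min-max scale values into [0, 1],
# column-wise when given a 2-D array, element-wise for a flat list.
def norm(values):
    a = np.asarray(values, dtype=float)
    lo, hi = a.min(axis=0), a.max(axis=0)
    span = np.where(hi > lo, hi - lo, 1.0)  # guard against constant columns
    return (a - lo) / span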
Example #2
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):

    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

    # parse each line of the text file into a list of floats (still an RDD)
    ndata = data.map(lambda line: line.split(character)).map(lambda part: [float(x) for x in part])

    # if the label sits in the first column, reverse each row so it ends up last
    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        # min-max normalize first, then rebuild an RDD of LabeledPoints
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))


    else:
        # no normalization: the label is the last element of each row, the rest are features
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    
    
    if ispca == 1:
        # optionally reduce the feature vectors to pca_n principal components
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
            

    model_lr = lr.train(train_data)

    # mean absolute error over the collected data set
    err_lr = 0.0
    size = len(train_data.collect())
    for i in range(size):
        err_lr += abs(model_lr.predict(test_data[i][1]) - test_data[i][0])

    print "result:", err_lr / size

    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size)
    
    sc.stop()

    return String
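For context, a call to this function might look like the sketch below. The dataset path and the aliases implied by the code (`lr` for a Spark MLlib linear-regression trainer, `lbp` for `LabeledPoint`, `norm` for the local normalize helper) are assumptions, not part of the source.

# Hypothetical invocation: comma-separated file, label already in the last column (label=1),
# min-max normalization enabled, local master, no PCA.
report = LinearRegressionModel(
    dataPath='data/points.csv',  # assumed path
    label=1,
    normalize=1,
    character=',',
    master='local[2]',
    ispca=0)
print report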
Example #3
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from normalize import norm

data = np.genfromtxt('./sylvester/out.csv', delimiter=',')
data = norm(data)

pm25 = list(data[:, 0])
pm10 = list(data[:, 1])

plt.scatter(pm25, pm10, c=np.arange(len(pm10)), cmap='gnuplot', s=0.1)
cbar = plt.colorbar()
ticks = ['16:00', '18:00', '20:00', '22:00', '24:00', '02:00']

# one sample per second, so place a colorbar tick every two hours
cbar.set_ticks(range(len(pm10))[::60 * 60 * 2])
#cbar.set_label('time in seconds')

cbar.ax.set_yticklabels(ticks)

plt.xlabel('pm2.5 in µg/m³')
plt.ylabel('pm10 in µg/m³')

plt.show()
Example #4
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
            #image_feature = [0.] * IMAGE_FEATURE_LEN
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word_ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is not really used anymore
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # for a fixed validation set, keep only one text per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
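A small sanity check, not part of the original source, can confirm what deal_file wrote. The sketch below uses plain TensorFlow 1.x APIs and assumes non-sequence examples; the file name is a placeholder for whatever out_file was produced.

import tensorflow as tf

# Assumed check: read back the first record of a written TFRecord file.
for record in tf.python_io.tf_record_iterator('train-00000'):  # placeholder path
    example = tf.train.Example.FromString(record)
    feature = example.features.feature
    print(feature['image_name'].bytes_list.value[0])
    print('num word ids: {}'.format(len(feature['text'].int64_list.value)))
    break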
Example #5
def deal_imgtextfile(file):
    """
  since img text or encoded img both big.. say for 2w pic will be 18G, while for image feature (23820, 2048) will only be 373M
  this is not used much, only if you do not want to do metric evaluate(recall@1,... for images), and you do not want to 
  convert and store image binaries from imatext(preprocess)
  """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            if img not in pic_info_map:
                continue

            img_text = l[-1]
            encoded_image = urllib.unquote_plus(img_text)

            text_info = pic_info_map[img]
            texts = text_info.split('\x01')

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word_ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is not really used anymore
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        ## encoded_image is too big to keep, so skip image evaluation here?  TODO
                        #image_features.append(encoded_image)
                        if FLAGS.image_dir:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # for a fixed validation set, keep only one text per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example #6
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  
  try:
    texts = l[1].split('\x01')
  except Exception:
    print(line, file=sys.stderr)
    continue  # skip malformed lines instead of reusing the previous texts
  #texts = l[2].split('\x01')
  
  for text in texts:
    text = normalize.norm(text)
    words = segmentor.Segment(text, FLAGS.seg_method)
    if num % 10000 == 0:
      print(text, '|'.join(words), len(words), file=sys.stderr)
    counter.add(START_WORD)
    for word in words:
      counter.add(word)
      if word.isdigit():
        counter.add('<NUM>')
    counter.add(END_WORD)
  num += 1

counter.add(START_WORD)

print(FLAGS.out_dir, file=sys.stderr)
if not FLAGS.vocab_name:
Example #7
from normalize import Normalize as norm
from arOp_Grey import ArOpGrey as ar
from geometricsOperations import GeometricsOperations as geo
from arOp_Color import ArOpColor as arC

e = geo()
n = norm()
ar = ar()
arC = arC()
# Color photos need to be inserted here
#n.geometricColorNormalize("temp_img/raster_grey.png", "temp_img/raster_grey_2.png")
#n.geometricGreyNormalize("temp_img/raster_grey.png", "temp_img/raster_grey_2.png")

#ar.sumImageWithNumber("temp_img/zdj1.jpg", 40)
#ar.sumImageWithImage("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.multiplyImgWithNumber("temp_img/zdj1.jpg", 100)
#ar.multiplyImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.mixImagesWithRate("temp_img/2/1.jpg", "temp_img/2/2.jpg", 0.4)
#ar.escalateImg("temp_img/2/1.jpg", 2)
#ar.divideImgByNumber("temp_img/2/2.jpg", 50)
#ar.divideImgByImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.extractImg("temp_img/2/1.jpg", 3)
#ar.logImg("temp_img/2/1.jpg")

#arC.sumImgWithNumber("temp_img/cukierki.tiff", 20)
#arC.sumImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#arC.multiplyImgWithNumber("temp_img/2/1.jpg", 150)
#arC.multiplyImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#arC.mixImagesWithRate("temp_img/2/1.jpg", "temp_img/2/2.jpg", 0.4)
#arC.escalateImg("temp_img/2/1.jpg", 2)
#arC.divideImgByNumber("temp_img/2/1.jpg", 300)
Example #8
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from normalize import norm


def func(x, a, b):
    # exponential decay toward a fixed offset of 2.5
    return a * np.exp(-b * x) + 2.5


data = np.genfromtxt('./messung1/out.csv', delimiter=',')

ydata25 = np.array(norm(data[2500:4000])[:, 0], np.float32)
ydata10 = np.array(norm(data[2500:4000])[:, 1], np.float32)
xdata = np.array(np.arange(ydata25.shape[0]), np.float32)

plt.figure(1)
plt.subplot(211)

popt25, pcov25 = curve_fit(func, xdata, ydata25)
popt10, pcov10 = curve_fit(func, xdata, ydata10)

plt.plot(ydata25, label='pm2.5')
plt.plot(func(xdata, *popt25), label='pm2.5-regression')
plt.plot(ydata10, label='pm10')
plt.plot(func(xdata, *popt10), label='pm10-regression')

plt.ylabel('linear-scale')
plt.xticks([])

plt.legend()
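The subplot(211) call suggests the original script continued with a second panel that is not shown here. An assumed, minimal way to finish the figure and report the fitted parameters (not from the source) would be:

# Assumed follow-up: print the fitted decay parameters and display the figure.
print('pm2.5 fit (a, b): {}'.format(popt25))
print('pm10 fit (a, b): {}'.format(popt10))
plt.show()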
Example #9
        open('raw_data/people_mapping.txt','r') as fspk, \
        open('train_'+data_name+'.txt','w') as ftrain, \
        open('train_'+data_name+'.info','wb') as ftrain_info, \
        open('train_'+data_name+'.sp5','wb') as ftrain_kb, \
        open('dev_'+data_name+'.txt','w') as fvalid, \
        open('dev_'+data_name+'.info','wb') as fvalid_info, \
        open('dev_'+data_name+'.sp5','wb') as fvalid_kb, \
        open('test_'+data_name+'.txt','w') as ftest, \
        open('test_'+data_name+'.info','wb') as ftest_info, \
        open('test_'+data_name+'.sp5','wb') as ftest_kb:

        nodes, edges = read_kb.read_in_graph('.')
        if not os.path.exists('temp_norm.dict'):
            spk_map = normalize.get_spk_map(fspk)
            print(spk_map)
            normalized_contexts, alligned_entities, alligned_dykb = normalize.norm(fchat, nodes, edges, spk_map)
            pickle.dump((normalized_contexts, alligned_entities, alligned_dykb), open('temp_norm.dict','wb'))
        else:
            (normalized_contexts, alligned_entities, alligned_dykb) = pickle.load(open('temp_norm.dict','rb'))
        entities_occurs = split.shuffle( nodes, list(sorted(edges.keys())), \
            normalized_contexts, alligned_entities, alligned_dykb, \
            [ftrain, fvalid, ftest], \
            [ftrain_info, fvalid_info, ftest_info], \
            [ftrain_kb, fvalid_kb, ftest_kb], \
            [0.85,0.05,0.1])

    with open('for_kb_cloud.txt','w') as fkb:
        num_kb_appears = 0
        kb_counts = {}
        for n in nodes:
            box = []
Example #10
File: mlsvm.py  Project: Tomlong/MLlib-UI
def SVMModel(dataPath, label, max_label, min_label, character, master, normalize, ispca):
    
    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)
    
    mid_label = (float(max_label) + float(min_label)) / 2.0

    # debug: print the raw, split rows
    print data.map(lambda line: line.split(character)).collect()

    # parse each line of the text file into a list of floats (still an RDD)
    ndata = data.map(lambda line: line.split(character)).map(lambda part: [float(x) for x in part])

    # if the label sits in the first column, reverse each row so it ends up last
    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        # normalize first, then binarize the label against the midpoint of the label range
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(1.0 if float(part[0]) > mid_label else 0.0, part[1]))
        test_data = norm_data.map(lambda part: (1.0 if float(part[0]) > mid_label else 0.0, part[1])).collect()

    else:
        # the label is the last element of each row; binarize it against mid_label
        train_data = ndata.map(lambda part: lbp(1.0 if float(part[len(part) - 1]) > mid_label else 0.0, part[0: len(part) - 1]))
        test_data = ndata.map(lambda part: (1.0 if float(part[len(part) - 1]) > mid_label else 0.0, part[0:len(part) - 1])).collect()

    if ispca == 1:
        # optionally reduce the feature vectors to pca_n principal components
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
    


    model_svm = svm.train(train_data)
    # count correct predictions over the collected data set
    acc_svm = 0
    size = len(train_data.collect())
   
    for i in range(size):
        if model_svm.predict(test_data[i][1]) == test_data[i][0]:
            acc_svm += 1
   
    String = "SVM Result:\n"
    String = String + str(model_svm.weights) + "\n"
    String = String + str((float(acc_svm)/ float(size)) * 100) + "%"
    

    x = []
    y = []
    showpic = 0

    # if the feature vectors are already two-dimensional, plot them even without PCA
    if len(test_data[0][1]) == 2:
        ispca = 1

    if ispca == 1:
        for i in range(size):  
            if test_data[i][0] == 0.0:     
                plt.plot(test_data[i][1][0], test_data[i][1][1], 'ro', color = 'r', markersize = 8)
            elif test_data[i][0] == 1.0:
                plt.plot(test_data[i][1][0], test_data[i][1][1], 'ro', color = 'b', markersize = 8)

        test = sc.parallelize(test_data)
        max_axis = test.map(lambda part: part[1][0]).max()
        min_axis = test.map(lambda part: part[1][0]).min()
        plt.plot([min_axis, max_axis], [max_axis * model_svm.weights[0] + model_svm.weights[1], min_axis * model_svm.weights[0] + model_svm.weights[1]], 'g-', linewidth= 2)
        plt.savefig('result.jpg')
        plt.close('all')
        showpic = 1

    sc.stop()
    return (showpic, String)
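As with the linear-regression example above, a call might look like the sketch below; the dataset path, label range, and the aliases implied by the code (`svm` for a Spark MLlib SVM trainer, `lbp` for `LabeledPoint`) are assumptions rather than part of the source.

# Hypothetical invocation: comma-separated file, label already in the last column (label=1),
# labels ranging from 0 to 10, min-max normalization enabled, local master, no PCA.
showpic, report = SVMModel(
    dataPath='data/points.csv',  # assumed path
    label=1,
    max_label=10,
    min_label=0,
    character=',',
    master='local[2]',
    normalize=1,
    ispca=0)
print report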