def create_data_record(out_filename, images_addrs, labels_addrs):
    # open the TFRecords file
    writer = tf.python_io.TFRecordWriter(out_filename)
    for i in range(len(images_addrs)):
        # print how many images are saved every 100 images
        if i % 100 == 0:
            print('Train data: {}/{}'.format(i, len(images_addrs)))
            sys.stdout.flush()
        # Load the image
        img = read_img(images_addrs[i])
        label = read_label(labels_addrs[i])
        if img is None or label is None:
            print("error: null image")
            continue
        # Create a feature
        feature = {
            'image_raw': _bytes_feature(img.tostring()),
            'label': _bytes_feature(label.tostring())
        }
        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to string and write on the file
        writer.write(example.SerializeToString())
    writer.close()
    sys.stdout.flush()
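# Hedged usage sketch (not part of the original file): the directory layout and output
# name below are hypothetical, and it assumes TensorFlow 1.x plus the read_img /
# read_label / _bytes_feature helpers referenced by create_data_record.
import glob

if __name__ == '__main__':
    image_paths = sorted(glob.glob('./data/train/images/*.png'))
    label_paths = sorted(glob.glob('./data/train/labels/*.png'))
    create_data_record('train.tfrecords', image_paths, label_paths)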
def main():
    filename = './origin_data/bugreports.xml'
    path = './bug_reports'
    bugslist = utils.read_xml(filename)
    # print(bugslist)
    label = utils.read_label('./origin_data/goldset.txt')
    # print(label)
    samples, ids = utils.get_content(bugslist)
    # print(samples)
    num_word_list, numword = utils.count_word(samples)
    # print(len(num_word_list))
    # for i in num_word_list:
    #     num_sentence.append(len(i))
    utils.savefile(samples)
    # print(num_sentence)
    results = textrank.bugsum(path, numword, num_word_list)
    print([len(i) for i in results])
    # extra_ids = index2id(results, ids)
    # print(len(extra_ids))
    pred = eval.index2pred(results, ids)
    y = eval.label2y(label, ids)
    mean_acc, mean_pr, mean_re, mean_f1 = eval.evaluate(y, pred)
    print('mean_acc, mean_pr, mean_re, mean_f1', mean_acc, mean_pr, mean_re, mean_f1)
def __init__(self, sess, checkpoint_dir, log_dir, training_paths, testing_paths,
             training_survival_data, testing_survival_data,
             batch_size=6, n_hidden_1=500, n_hidden_2=500, n_z=20):
    self.sess = sess
    self.checkpoint_dir = checkpoint_dir
    self.log_dir = log_dir
    self.training_paths = training_paths
    self.testing_paths = testing_paths
    self.training_survival_data = training_survival_data
    self.testing_survival_data = testing_survival_data

    label = read_label(self.training_paths[0])
    self.label_size = label.shape[:-1]
    self.channel = label.shape[-1]
    self.n_input = label.size

    self.batch_size = batch_size
    self.n_hidden_1 = n_hidden_1
    self.n_hidden_2 = n_hidden_2
    self.n_z = n_z

    self.build_model()
    self.saver = tf.train.Saver()
def __init__(self, sess, checkpoint_dir, log_dir, training_paths, testing_paths,
             training_survival_data, testing_survival_data,
             batch_size=1, features=16, dropout=0.5):
    self.sess = sess
    self.checkpoint_dir = checkpoint_dir
    self.log_dir = log_dir
    self.training_paths = training_paths
    self.testing_paths = testing_paths
    self.training_survival_data = training_survival_data
    self.testing_survival_data = testing_survival_data

    label = read_label(self.training_paths[0])
    self.label_size = label.shape[:-1]
    self.batch_size = batch_size
    self.channel = label.shape[-1]
    self.features = features
    self.dropout = dropout

    self.build_model()
    self.saver = tf.train.Saver()
def mAP(path_test_repo_images, path_test_repo_labels, detection_method="yolo", IoU_min=0.50):
    """
    Given two directories (images and associated labels) and a detection method,
    return the Mean Average Precision of this method on the given data.
    Warning: the data must be sorted in alphabetical order!
    """
    image_filenames = []
    label_filenames = []
    for image_filename in os.listdir(path_test_repo_images):
        image_filenames.append(path_test_repo_images + '/' + image_filename)
    for label_filename in os.listdir(path_test_repo_labels):
        label_filenames.append(path_test_repo_labels + '/' + label_filename)
    image_filenames = sorted(image_filenames)
    label_filenames = sorted(label_filenames)
    APs = []
    print('')
    print('mAP evaluation on the images of the test repository "' + path_test_repo_images + '"')
    print('Configuration :')
    print('.............. IoU minimum : ', IoU_min)
    print('.............. Detection method : ', detection_method)
    debut = time.clock()
    for image_filename, label_filename in zip(image_filenames, label_filenames):
        print('')
        print('Image name : ' + image_filename)
        print('Processing..........')
        bndbx_truth = utils.read_label(label_filename)
        bndbx_detected = predictions(image_filename, detection_method)
        print('Number of labelled objects : ', len(bndbx_truth))
        print('Number of detected objects : ', len(bndbx_detected))
        ap = AP(bndbx_truth, bndbx_detected, IoU_min)
        APs.append(ap)
        print('Average Precision (AP) : ', "%.2f" % ap)
        print('..........Finished')
    mAP = mean(APs)
    fin = time.clock()
    print('')
    print('Mean Average Precision (mAP) : ', "%.2f" % mAP)
    print('Execution time : ', "%.2f" % (fin - debut), ' seconds')
    print('')
    return mAP
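# Hedged usage sketch (not part of the original file): the directory names are
# hypothetical, and it assumes the mAP function above plus its predictions / AP /
# utils.read_label dependencies are available in scope.
if __name__ == '__main__':
    score = mAP('test_images', 'test_labels', detection_method="yolo", IoU_min=0.50)
    print('Final mAP : ', "%.2f" % score)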
def generate(output_file_prefix):
    labelfile, rev_label = utils.read_label(output_file_prefix + 'label.txt')
    sequence_file = utils.read_sequences(output_file_prefix + 'sequences.txt')
    cluster_file = open(output_file_prefix + 'clusters.txt', 'w')  # imp
    N = len(sequence_file) + 1
    threshold = 10
    print('N = ' + str(N) + '\n')
    height_dic = {}
    X = np.zeros((N, N))
    print("files loaded\n")
    members = {i: [i] for i in range(0, N)}
    print("members initialized\n")
    clId = 0
    clustID = {labelfile[i]: i for i in range(0, N)}
    for line in sequence_file:
        line = line.strip()
        seq = line.split('\t')
        seqA = int(seq[0])
        seqB = int(seq[1])
        seqC = int(seq[2])
        if seqC not in members:
            members[seqC] = []
        if seqA < N:
            height_dic[seqA] = 0
        if seqB < N:
            height_dic[seqB] = 0
        members[seqC].extend(members[seqA])
        members[seqC].extend(members[seqB])
        height_dic[seqC] = max(height_dic[seqA], height_dic[seqB]) + 1
        for i in members[seqA]:
            for j in members[seqB]:
                X[i][j] = height_dic[seqC]
                X[j][i] = height_dic[seqC]
                if X[i][j] <= threshold:
                    if labelfile[i] not in clustID:
                        clId += 1
                        clustID[labelfile[i]] = clId
                    clustID[labelfile[j]] = clustID[labelfile[i]]
    print("matrix created\n")
    clusters = {}
    for key, value in clustID.items():
        if value not in clusters:
            clusters[value] = list()
        clusters[value].append(key)
    for key, value in clusters.items():
        cluster_file.write('clusterId:' + str(key) + '\t')
        cluster_file.write(str(constants.value_separator.join(value)))
        cluster_file.write('\n')
    cluster_file.close()
    print("clusters written\n")
    return clusters, rev_label, X
def data_reader(self, left_img_paths, right_img_paths):
    if self.mode == 0 or self.mode == 1:
        batch_size = len(left_img_paths)
    else:
        batch_size = len(right_img_paths)

    batch_imgs = np.zeros((batch_size, *self.input_shape), dtype=np.float32)
    batch_labels = np.zeros((batch_size, self.num_attribute), dtype=np.float32)

    left_imgs = list()
    if self.mode == 0 or self.mode == 1:
        for i, left_img_path in enumerate(left_img_paths):
            left_imgs.append(self.data_preprocessing(left_img_path))
            # Process labels
            batch_labels[i, :] = utils.read_label(left_img_path, img_format=self.img_format)

    right_imgs = list()
    if self.mode == 0 or self.mode == 2:
        for i, right_img_path in enumerate(right_img_paths):
            right_imgs.append(self.data_preprocessing(right_img_path))
            # Process labels
            batch_labels[i, :] = utils.read_label(right_img_path, img_format=self.img_format)

    # Normalize labels
    batch_labels = self.normalize(batch_labels)

    if self.mode == 0:
        for i in range(len(left_imgs)):
            left_img = left_imgs[i]
            right_img = right_imgs[i]
            batch_imgs[i] = np.dstack([left_img, right_img])
    elif self.mode == 1:
        for i in range(len(left_imgs)):
            batch_imgs[i] = np.expand_dims(left_imgs[i], axis=-1)
    else:
        for i in range(len(right_imgs)):
            batch_imgs[i] = np.expand_dims(right_imgs[i], axis=-1)

    return batch_imgs, batch_labels
def benchmark(self):
    model_path = os.path.join(BROOT, f'models/{self.dataset}-{self.model}-2.0')
    valid_data = os.path.join(BROOT, get_valid_data(self.dataset, self.model))
    inference_bin = os.path.join(BROOT, 'build/inference')
    ret = subprocess.run([
        inference_bin,
        '--logtostderr',
        '--model', model_path,
        '--data', valid_data,
        '--mode', self.mode,
        '--batch_size', str(self.batch_size),
        '--num_labels', get_num_labels(self.dataset),
        '--seq_lens', str(self.seq_len),
        '--min_graph', str(self.args.min_graph),
        '--ignore_copy', str(self.args.ignore_copy),
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if ret.returncode != 0:
        print(ret.stderr.decode('ascii'))
        assert False, 'Prediction failed.'
    prediction = list()
    for line in ret.stdout.decode('ascii').splitlines():
        if line.startswith('Sents/s'):
            _, qps = line.split()
        else:
            prediction.append(int(line))
    prediction = np.asarray(prediction)
    testcase = os.path.join(BROOT, get_valid_labels(self.dataset))
    labels = read_label(testcase)
    metric = get_metric(self.dataset)
    ret = metric(prediction, labels)
    stat = {'Sents/s': float(qps)}
    stat['metric_value'] = ret
    stat['metric'] = metric.__name__
    stat['batch_size'] = self.batch_size
    stat['dataset'] = self.dataset
    stat['model'] = self.model + '-2.0'
    stat['mode'] = self.mode
    if self.seq_len == 0:
        stat['seq_len'] = 'dynamic'
    else:
        stat['seq_len'] = self.seq_len
    return stat
def training(label_file, image_file):
    # this function is used to train on the training dataset and will return the
    # list of P(vj), the matrix of P(ai,j = 1|vj), and the matrix of P(ai,j = 0|vj)
    # smooth_type determines which type of smoothing is used. There are 2 that will be used in this hw:
    # +
    # +
    # first compute the list of P(vj) from the training labels dataset,
    # which is the list of size 10 storing P(0), P(1), ..., P(9) in that order
    label_list = utils.read_label(label_file)
    p_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    # this list will count the number of images corresponding to each label
    count_p_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for e in label_list:
        p_list[e] = p_list[e] + 1
        count_p_list[e] = count_p_list[e] + 1
    for i in xrange(len(p_list)):
        p_list[i] = p_list[i] / 5000.0

    # compute P(ai,j = 1|vj) and P(ai,j = 0|vj)
    probs = [[], [], [], [], [], [], [], [], [], []]
    for i in xrange(5000):
        # read each image
        curr_im = utils.read_image(i, image_file)
        label = label_list[i]
        if len(probs[label]) == 0:
            # create and initialize 28x28 matrix
            for row in curr_im:
                prob_row = []
                for j in row:
                    prob_tuple = [0, 0]
                    if j == 0:
                        prob_tuple[0] = prob_tuple[0] + 1
                    else:
                        prob_tuple[1] = prob_tuple[1] + 1
                    prob_row.append(prob_tuple)
                probs[label].append(prob_row)
        else:
            # add to the current matrix
            for row in xrange(28):
                for col in xrange(28):
                    if curr_im[row][col] == 0:
                        probs[label][row][col][0] = probs[label][row][col][0] + 1
                    else:
                        probs[label][row][col][1] = probs[label][row][col][1] + 1

    # divide each element in probs by the count of the corresponding vj to get the prob.
    # Using Laplace Smoothing with k = 1
    for i in xrange(10):
        for j in xrange(28):
            for k in xrange(28):
                probs[i][j][k][0] = (probs[i][j][k][0] + 1) / (count_p_list[i] * 1.0 + 2.0)
                probs[i][j][k][1] = (probs[i][j][k][1] + 1) / (count_p_list[i] * 1.0 + 2.0)
    return p_list, probs
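# Hedged sketch of how the returned tables could be used at prediction time (this
# helper is not part of the original file): it picks the digit class with the highest
# log-posterior log P(vj) + sum log P(ai,j | vj), mirroring the 28x28 binary-pixel
# layout built by training() above.
import math

def predict_digit(image, p_list, probs):
    best_label, best_logp = None, float('-inf')
    for v in range(10):
        logp = math.log(p_list[v])
        for row in range(28):
            for col in range(28):
                pixel = 0 if image[row][col] == 0 else 1
                logp += math.log(probs[v][row][col][pixel])
        if logp > best_logp:
            best_label, best_logp = v, logp
    return best_label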
def load_graph(dataset, labels_is_onehot=True):
    features = read_feature("./data/" + dataset + ".feature", is_normalize=False)
    if os.path.exists("./data/" + dataset + ".label"):
        labels = read_label("./data/" + dataset + ".label", is_onehot=labels_is_onehot)
    else:
        labels = None
    G = read_graph("./data/" + dataset + '.edgelist')
    graph = Graph(features, G, labels)
    return graph
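# Hedged usage sketch (not from the original module): "cora" is a hypothetical dataset
# name; it assumes ./data/cora.feature, ./data/cora.label and ./data/cora.edgelist
# exist in the formats expected by read_feature / read_label / read_graph.
if __name__ == "__main__":
    graph = load_graph("cora", labels_is_onehot=True)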
def image_generator(self, data_type):
    if data_type == 'train':
        i_dir = self.train_dir
        l_dir = self.train_label
        num_img = self.count_train
        i_list = self.train_files
    elif data_type == 'validation':
        i_dir = self.val_dir
        l_dir = self.val_label
        num_img = self.count_val
        i_list = self.val_files
    else:
        return
    while True:
        f = i_list[np.random.randint(0, num_img)]
        c_img = utils.read_single_image(i_dir + f, 3)
        c_label = utils.read_label(l_dir + self.get_label_name(f))
        yield c_img, c_label
def _make_data(self, label):
    paths = read_label(label)
    data = []
    n_max = -np.inf
    n_min = np.inf
    for fp in paths:
        basename = os.path.basename(fp)
        wav, fs = sf.read(fp)
        assert (self._fs == fs)
        wav = self._pre_emphasis(wav)
        wav_length = wav.shape[0]
        n_max = np.max([n_max, np.max(wav)])
        n_min = np.min([n_min, np.min(wav)])
        dd = {'name': basename, 'len': wav_length, 'wav': wav}
        data.append(dd)
    return data, n_max, n_min
def generate_dendrogram():
    data_dic = read_labels_detected()
    labels_op = open('labels_op.txt', 'w')
    for key, val in data_dic.items():
        labels_op.write(','.join(val['labels']))
        labels_op.write('\n')
    labels_op.close()
    output_file_prefix = 'levels'
    generateData.generate_data('labels_op.txt', constants.label_separator, 'levels')
    # Done separately
    # os.system(constants.hier_code_root + "/hier " + output_file_prefix + 'wvec.dat dummy.txt 1 > ' + output_file_prefix + 'logs.txt')
    clusters, rev_label, X = generatePlotReducedDendro.generate(output_file_prefix)
    labels, rev_label = utils.read_label(output_file_prefix + 'label.txt')
    new_labels = [str(','.join(label.strip().split(',')[:3])) for label in labels]
    utils.generate_dendrogram(X, new_labels, 'dendrogram_with_labels')
    print("plotting done\n")
def test(start, end):
    global test_labels
    result = test_im(start, end)
    print "TEST LABEL RESULTS: "
    print result
    test_labels = utils.read_label(test_labels)
    # keep track of how many of each value appears in the label file
    count_labels = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for value in test_labels:
        count_labels[value] = count_labels[value] + 1
    acc = 0
    acc_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for i in xrange(len(result)):
        if result[i] == test_labels[i]:
            acc = acc + 1
            acc_list[test_labels[i]] = acc_list[test_labels[i]] + 1
    acc_percent = (acc / (len(result) * 1.0)) * 100
    for i in xrange(10):
        acc_list[i] = (acc_list[i] / (count_labels[i] * 1.0)) * 100
    print "Digits Accuracy"
    for i in xrange(len(acc_list)):
        print 'Digit ' + str(i) + ': ' + str(acc_list[i])
    print "Average Accuracy: " + str(acc_percent)
import utils
import os
import pandas as pd

data_file = 'bugreports.xml'
label_file = 'goldset.txt'
data_path = './origin_data'

if __name__ == "__main__":
    test = []
    buglist = utils.read_xml(os.path.join(data_path, data_file))
    labels = utils.read_label(os.path.join(data_path, label_file))
    data = pd.DataFrame(buglist)
    # print(data['Title'][0])
    samples, ids = utils.get_content(buglist)
    count, flag = 0, 0
    for sentences, id_list in zip(samples, ids):
        count += 1
        for sentence, id in zip(sentences, id_list):
            if id in labels[count - 1]:
                test.append({
                    'index': count,
                    'title': data['Title'][count - 1],
                    'sentence': sentence,
                    'id': id,
                    'label': 1
                })
            else:
                test.append({
                    'index': count,
                    'title': data['Title'][count - 1],
acc_list_all = []
pre_list_all = []
re_list_all = []
f1_list_all = []
filepath = './bugreports_sds/'
step = 6
bc = BertClient()
sentences = []
vectors = []
for i in range(36):
    report_sent = []
    with open(filepath + str(i + 1) + '.txt', "r", encoding='utf-8') as f:
        for line in f.readlines():
            report_sent.append(line.strip('\n'))
    sentences.append(report_sent)
labels_ids = read_label('./data/goldset_sds.txt')
ids = get_ids()
labels = []
for index, id_list in enumerate(ids):
    label = []
    for id in id_list:
        if id in labels_ids[index]:
            label.append(1)
        else:
            label.append(0)
    labels.append(label)
sentences, labels = clear_data(sentences, labels)
for i in range(0, 36, step):
    # model = create_classify_dense(EMBEDDING_DIM)
    model = create_classify_lstm_att(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
import glob
import os
import utils
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

TRAIN_DIR = '/home/diendl/Desktop/new_dataset/data/train'
TEST_DIR = '/home/diendl/Desktop/new_dataset/data/test'
NUM_ANCHORS = 9
WIDTH = 416
HEIGHT = 416

label_files = glob.glob(os.path.join(TRAIN_DIR, '*.txt'))
label_files += glob.glob(os.path.join(TEST_DIR, '*.txt'))

X = None
for label_path in label_files:
    true_boxes = utils.read_label(label_path)
    for true_box in true_boxes:
        bw, bh = true_box[3] * WIDTH, true_box[4] * HEIGHT
        this_box = np.expand_dims([bw, bh], axis=0)
        if X is None:
            X = this_box
        else:
            X = np.concatenate((X, this_box), axis=0)

kmeans = KMeans(n_clusters=NUM_ANCHORS, random_state=0).fit(X)
centers = np.array(kmeans.cluster_centers_)
print(centers)

colors = [
    'red', 'orange', 'yellow', 'green', 'blue', 'violet', 'brown', 'olive', 'pink'
#train_file = '../data_yahoo/ctr_20170517_0530_0.015.txt.thres10.yx.100000'
#test_file = '../data_yahoo/ctr_20170531.txt.downsample_all.0.1.thres10.yx.100000'
# fm_model_file = '../data/fm.model.txt'

print "train_file: ", train_file
print "test_file: ", test_file
sys.stdout.flush()

input_dim = utils.INPUT_DIM
"""
train_data = utils.read_data(train_file)
test_data = utils.read_data(test_file)
"""
train_label = utils.read_label(train_file)
test_label = utils.read_label(test_file)
train_size = train_label.shape[0]
test_size = test_label.shape[0]
num_feas = len(utils.FIELD_SIZES)

min_round = 1
num_round = 1000
early_stop_round = 15
batch_size = 2000
bb = 1

field_sizes = utils.FIELD_SIZES
field_offsets = utils.FIELD_OFFSETS
clf = lib.NaiveBayesClassifier("hi")
# X = [[1,2],[3,4],[5,6]]
# y = [1, 2, 3]
# clf.train(X, y)
# category = clf.predict([1,2,3,4,5,6])
# print category
# import impl as nb
import utils as utils
import numpy as np

path = '20news-bydate-matlab/matlab'
features = utils.read_features("expanded.txt")
label_array = utils.read_label(path, 'train.label')
print len(label_array)
answer_label_array = utils.read_label(path, 'test.label')
test_features = utils.read_features("test_expanded.txt")
vocab = utils.read_vocab("vocabulary.txt")

# remove stop words
from stop_words import get_stop_words
stop_words = get_stop_words('en')
# vocab = set(vocab)
# velo_path = "data/000015_velo.bin"
#
# output_viz_path = "data/match.jpg"
# crop_imgs_output_dir = "Images"
# offset_output_dir = "Annotations"
#
# gt_objects = read_label(gt_file)
# pred_objects = read_label(pred_file)

pred_paths = glob.glob(os.path.join(src_pred_dir, "*.txt"))
for idx, pred_path in tqdm.tqdm(enumerate(pred_paths)):
    pred_idx = get_filename(pred_path, False)
    gt_path = get_file_path_by_idx(pred_idx, src_label_dir)

    # Get matched objects
    gt_objects = read_label(gt_path)
    pred_objects = read_label(pred_path)
    match = get_match_objs(pred_objects, gt_objects)

    # Generate target offset
    is_valid = gen_offset(pred_idx, match, output_anno_dir)
    if not is_valid:
        print("without detected objs, skip.")
        continue

    calib_path = get_file_path_by_idx(pred_idx, src_calib_dir)
    calibs = read_calib_file(calib_path)
    P = calibs["P2"]
    P = np.reshape(P, [3, 4])
    match_8_pts = obj_to_8_points(match, P)
# fm_model_file = '../data/fm.model.txt'

print "path_train: ", path_train
print "path_validation: ", path_validation
print "path_test: ", path_test
sys.stdout.flush()

#path_feature_index = '../data_yahoo/dataset2/featindex_25m_thres10.txt'
#path_feature_index = '../data_cvr/featureindex_thres5.txt'
#path_saved_model = 'model'
#if os.path.exists(path_saved_model) and os.path.isdir(path_saved_model):
#    shutil.rmtree(path_saved_model)

INPUT_DIM, FIELD_OFFSETS, FIELD_SIZES = utils.initiate(path_feature_index)
print 'FIELD_SIZES', FIELD_SIZES

train_label = utils.read_label(path_train)
validation_label = utils.read_label(path_validation)
test_label = utils.read_label(path_test)
train_size = train_label.shape[0]
validation_size = validation_label.shape[0]
test_size = test_label.shape[0]
num_feas = len(utils.FIELD_SIZES)

min_round = 1
num_round = 1000
early_stop_round = 2
batch_size = 1000
#bb = 10
round_no_improve = 5
def train(self, config):
    optimizer = tf.train.AdamOptimizer().minimize(self.loss)
    self.sess.run(tf.global_variables_initializer())

    train_writer = tf.summary.FileWriter(
        os.path.join(self.log_dir, self.model_dir, 'train'), self.sess.graph)
    if self.testing_paths is not None:
        test_writer = tf.summary.FileWriter(
            os.path.join(self.log_dir, self.model_dir, 'test'))

    counter = 0
    for epoch in range(config['epoch']):
        # Shuffle the orders
        training_paths = np.random.permutation(self.training_paths)

        # Go through all selected patches
        for f in range(len(training_paths) // self.batch_size):
            labels = np.empty(
                (self.batch_size, self.label_size[0], self.label_size[1],
                 self.label_size[2], self.channel), dtype=np.float32)
            survivals = np.empty((self.batch_size, 1), dtype=np.float32)
            ages = np.empty((self.batch_size, 1), dtype=np.float32)
            for b in range(self.batch_size):
                labels[b] = read_label(training_paths[b])
                survivals[b] = self.training_survival_data[os.path.basename(training_paths[b])][1]
                ages[b] = self.training_survival_data[os.path.basename(training_paths[b])][0]

            _, train_loss, summary = self.sess.run(
                [optimizer, self.loss, self.loss_summary],
                feed_dict={
                    self.labels: labels,
                    self.survival: survivals,
                    self.age: ages
                })
            train_writer.add_summary(summary, counter)
            counter += 1
            if np.mod(counter, 1000) == 0:
                self.save(counter)

            # Run test
            if self.testing_paths is not None and np.mod(counter, 100) == 0:
                testing_paths = np.random.permutation(self.testing_paths)
                for b in range(self.batch_size):
                    labels[b] = read_label(testing_paths[b])
                    survivals[b] = self.testing_survival_data[os.path.basename(testing_paths[b])][1]
                    ages[b] = self.testing_survival_data[os.path.basename(testing_paths[b])][0]
                test_loss, summary = self.sess.run(
                    [self.loss, self.loss_summary],
                    feed_dict={
                        self.labels: labels,
                        self.survival: survivals,
                        self.age: ages
                    })
                print(str(counter) + ":" + "train_loss: " + str(train_loss) +
                      " test_loss: " + str(test_loss))
                test_writer.add_summary(summary, counter)

    # Save in the end
    self.save(counter)
import json
import glob
import ntpath
import pprint
import tqdm
from collections import defaultdict

from utils import obj_to_8_points, read_label, Object3d, read_calib_file, get_xyzwhlrs_offset, \
    crop_image, get_box_range, draw_projected_box3d, write_json, \
    load_velo_scan, render_lidar_on_image

if __name__ == "__main__":
    pred_dir = "/mnt/nfs/chengyong/dev/frustum-pointnets/train/detection_results_16_v3/data"
    valid_dict = defaultdict(int)
    max_ry = -1000
    min_ry = 1000
    for pred_file in tqdm.tqdm(sorted(glob.glob(pred_dir + "/*.txt"))):
        pred_objects = read_label(pred_file)
        for obj in pred_objects:
            ry = obj.ry
            max_ry = max(max_ry, ry)
            min_ry = min(min_ry, ry)
            if ry > 3.14 or ry < -3.14:
                valid_dict[ntpath.basename(pred_file)] += 1
    pprint.pprint(valid_dict)
    print("max ry: ", max_ry)
    print("min ry: ", min_ry)

    info_dict = defaultdict(int)
    pred_dir = "/mnt/nfs/chengyong/dev/thesis/TDPosRefineNet_v2_2/line-16-v3_val_iter15000_2"
    for pred_file in tqdm.tqdm(sorted(glob.glob(pred_dir + "/*.txt"))):