def main():
    import codecs
    import sys
    import itertools
    import math

    args = parse_args()
    ilines = [util.preprocess(x, args.lang) for x in codecs.open(args.input, 'r', 'utf-8').readlines()]
    rlines = [util.preprocess(x, args.lang) for x in codecs.open(args.ref, 'r', 'utf-8').readlines()]
    if len(ilines) != len(rlines):
        print("Error: input file has {0} lines, but reference has {1} lines.".format(len(ilines), len(rlines)))
        sys.exit(1)

    scores = []
    falign = open(args.align, 'w') if args.align is not None else None
    for lineno, (iline, rline) in enumerate(itertools.izip(ilines, rlines), start=1):
        if args.force_token_mode:
            rline, iline = rline.split(), iline.split()
        else:
            rline, iline = util.split(rline, args.lang), util.split(iline, args.lang)
        # iline, rline are list objects
        score, alignment = ter(iline, rline, align=True)
        if args.align is not None:
            falign.write('%s\n' % ' '.join(alignment))
        scores.append(score)
        if args.verbose:
            print("Sentence {0}: {1:.4f}".format(lineno, score))
    if args.align is not None:
        falign.close()

    average = sum(scores) / len(scores)
    variance = sum((x - average) ** 2 for x in scores) / len(scores)
    stddev = math.sqrt(variance)
    print("Average={0:.4f}, Variance={1:.4f}, Standard Deviation={2:.4f}".format(average, variance, stddev))
def __init__(self, data_dir, data_name, train_ratio, device):
    self.train_ratio = train_ratio
    self.num_negatives = 3
    self.device = device

    if data_name == 'ml-100k':
        sep = '\t'
        filename = 'u.data'
        self.num_users, self.num_items = 943, 1682
    elif data_name == 'ml-1m':
        sep = '::'
        filename = 'ratings.dat'
        self.num_users, self.num_items = 6040, 3952
    else:
        raise NotImplementedError('Dataset not loaded. Available are: {ml-100k, ml-1m}')

    data_path = os.path.join(data_dir, data_name, data_name + '.data')
    stat_path = os.path.join(data_dir, data_name, data_name + '.stat')
    if os.path.exists(data_path) and os.path.exists(stat_path):
        print('Already preprocessed. Load from file.')
    else:
        preprocess(os.path.join(data_dir, data_name, filename), data_path, stat_path, sep)

    print('Read movielens data from %s' % data_path)
    self.train_matrix, self.test_matrix, self.user_id_map, self.user_popularity, \
        self.item_id_map, self.item_popularity, self.num_users, self.num_items = load_data(data_path)
def main():
    import sys
    import codecs

    args = parse_args()
    hlines = [util.preprocess(x, "en") for x in codecs.open(args.hypothesis, 'r', 'utf-8').readlines()]
    rlines = [util.preprocess(x, "en") for x in codecs.open(args.reference, 'r', 'utf-8').readlines()]
    if len(hlines) != len(rlines):
        print("Error: input file has {0} lines, but reference has {1} lines.".format(len(hlines), len(rlines)))
        sys.exit(1)

    scores = []
    for lineno, (hline, rline) in enumerate(zip(hlines, rlines), start=1):
        rline, hline = list(rline), list(hline)
        score = eed(hline, rline)
        scores.append(score)
        if args.verbose:
            print("Sentence {0}: {1:.4f}".format(lineno, score))

    average = sum(scores) / len(scores)
    print("System Score={0:.4f}".format(average))
    sys.exit(0)
def main():
    ifHash = False
    trainfile = 'yelp_reviews_train.json'
    X, y, top = util.preprocess(trainfile, ifTrain=True, ifHash=ifHash, trainTop=[])
    W = multiLR.BSGD(X, y)
    t, s = multiLR.predict(W, X)
    print eval.eval(t, s, y)

    predfile = 'yelp_reviews_dev.json'
    x, _, _ = util.preprocess(predfile, ifTrain=False, ifHash=ifHash, trainTop=top)
    t, s = multiLR.predict(W, x)
    util.writePred(t, s, 'v7.txt')
    return
def get_id(file_path, input_len=400, target_len=100, max_oov=400):
    with open(file_path, encoding='utf8') as f:
        text = f.read()
    text = text.split('\n\n')

    word_list = []
    art, summ = [], []
    for t in text:
        temp = t.split(':==:')
        art.append(util.preprocess(temp[0]))
        summ.append(util.preprocess(temp[1]))
        word_list += util.preprocess(temp[0]).split()
        word_list += util.preprocess(temp[1]).split()
    del text, t, temp

    word_list = list(set(word_list))
    oov2idx, idx2oov = vocab.create_oov_list(word_list, max_oov)

    art_max, sum_max = 0, 0
    for ind, k in enumerate(art):
        if len(k.split()) > art_max:
            art_max = len(k.split())
        if len(summ[ind].split()) > sum_max:
            sum_max = len(summ[ind].split())
    if art_max > input_len:
        art_max = input_len
    if sum_max > target_len:
        sum_max = target_len

    temp = []
    for index in range(8):
        lst = art[index].split()[:art_max - 2]
        lst = vocab.word_list_to_idx_list(lst, oov2idx)
        lst.insert(0, vocab.w2i['<SOS>'])
        lst.insert(len(lst), vocab.w2i['<EOS>'])
        diff = 0
        if len(lst) < art_max:
            diff = art_max - len(lst)
        pad = [vocab.w2i['<PAD>']] * diff
        lst = lst + pad
        temp.append(lst)
    inp = np.array(temp).astype(int)

    temp = []
    for index in range(8):
        lst = summ[index].split()[:sum_max - 1]
        lst = vocab.word_list_to_idx_list(lst, oov2idx)
        lst.insert(len(lst), vocab.w2i['<EOS>'])
        diff = 0
        if len(lst) < sum_max:
            diff = sum_max - len(lst)
        pad = [vocab.w2i['<PAD>']] * diff
        lst = lst + pad
        temp.append(lst)
    tar = np.array(temp).astype(int)

    return (inp, tar, idx2oov)
def score(hypIn, refIn):
    import codecs
    hyp = [util.preprocess(x) for x in open(hypIn, mode='rt', encoding='utf-8').readlines()]
    ref = [util.preprocess(x) for x in open(refIn, mode='rt', encoding='utf-8').readlines()]
    assert len(hyp) == len(ref)

    scores = []
    for (h, r) in zip(hyp, ref):
        # h, r = list(h), list(r)
        score = eed(h, r)
        scores.append(score)
    return sum(scores) / len(scores)
def generate():
    ifHash = True
    trainfile = 'yelp_reviews_train.json'
    X, y, top = util.preprocess(trainfile, ifTrain=True, ifHash=ifHash, trainTop=[])

    predfile = 'yelp_reviews_dev.json'
    x, _, _ = util.preprocess(predfile, ifTrain=False, ifHash=ifHash, trainTop=top)

    process(X, y, 'libtrainHash.txt')
    process(x, np.zeros(x.shape[0]), 'libdevHash.txt')
    return
def __getitem__(self, i):
    i = random.randint(0, len(self.raws) - 1)
    raw, blur = self.raws[i], self.blurs[i]
    if self.noise:
        raw, blur, blur_noise = preprocess([raw, blur], self.patchsize, self.noise)
        return {'A': raw, 'B': blur, 'B_n': blur_noise}
    else:
        raw, blur = preprocess([raw, blur], self.patchsize, self.noise)
        return {'A': raw, 'B': blur}
def __getitem__(self, i):
    idx = random.randint(0, len(self.raw_Ss) - 1)
    raw_S = self.raw_Ss[idx]
    blur_S = self.blur_Ss[idx]
    if self.noise:
        raw, blur, blur_noise = preprocess([raw_S, blur_S], self.patchsize, self.noise)
        return {'A': raw, 'B': blur, 'B_n': blur_noise}
    else:
        raw, blur = preprocess([raw_S, blur_S], self.patchsize, self.noise)
        return {'A': raw, 'B': blur}
def set_data(self, X_train, labels_train):
    """Store shuffled data in the instance variable self.data, and make a
    distinction between training and validation sets."""
    self.data = self._get_shuffled_data_dict(X_train, labels_train)
    n_train = int(50e3)
    self.data['X_val'] = util.preprocess(self.data['X_train'][n_train:])
    self.data['X_train'] = util.preprocess(self.data['X_train'][:n_train])
    self.data['labels_val'] = self.data['labels_train'][n_train:]
    self.data['labels_train'] = self.data['labels_train'][:n_train]
    # The 'active' data will later correspond to the minibatch being used.
    self.X_active = self.data['X_train']
    self.n_data_active = self.X_active.shape[0]
    self.labels_active = self.data['labels_train']
def fit(self, args, train_data, dev_data):
    x_dev_batch, y_dev_batch = util.preprocess(dev_data)
    with tf.Session(graph=self.graph) as sess:
        sw = tf.train.SummaryWriter(self.result_folder, sess.graph)
        print("Init models")
        sess.run(tf.initialize_all_variables())

        for i in range(args.num_epochs):
            train_iterator = util.ptb_iterator(train_data, args.batch_size)
            for x_batch, y_batch in train_iterator:
                _, train_summaries, total_loss, current_step = self.train_step(sess, x_batch, y_batch)
                sw.add_summary(train_summaries, current_step)
                if current_step % args.eval_freq == 0:
                    acc, dev_summaries = self.dev_step(sess, x_dev_batch, y_dev_batch)
                    sw.add_summary(dev_summaries, current_step)
                if current_step % args.save_freq == 0:
                    self.saver.save(sess, self.result_folder + '/bee.chkp', global_step=current_step)
            epoch_acc, dev_summaries = self.dev_step(sess, x_dev_batch, y_dev_batch)
            print('Epoch: %d, Accuracy: %f' % (i + 1, epoch_acc))

        self.saver.save(sess, self.result_folder + '/bee.chkp')
        with open(self.result_folder + '/bee_saver_def.pb', 'w') as f:
            f.write(self.saver.as_saver_def())
def main():
    window_size = 1
    hidden_size = 5
    batch_size = 3
    max_epoch = 1000

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)

    vocab_size = len(word_to_id)
    contexts, target = create_context_target(corpus, window_size)
    one_hot_target = convert_one_hot(target, vocab_size)
    one_hot_contexts = convert_one_hot(contexts, vocab_size)

    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)
    trainer.fit(one_hot_contexts, one_hot_target, max_epoch, batch_size)
    # trainer.plot()

    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])
    print('DONE')
def get_style_feature(input_style_image, device, style_name):
    # writer = tf.summary.FileWriter(P.st_logs + style_name + "_style_feature")
    with tf.Graph().as_default(), tf.device(device), tf.Session() as sess:
        style_image = tf.placeholder(dtype=tf.float32, shape=input_style_image.shape(), name='style_image')
        style_image_pre = util.preprocess(style_image)
        net = VGG.net(P.st_vgg_path, style_image_pre, layer_name='style_feature')

        style_features = dict()
        style_layer = dict(filter(lambda x: x[0] in STYLE_LAYERS, net.items()))
        merge = tf.summary.merge_all()
        # style_layer['summary'] = merge
        result = sess.run(style_layer, {style_image: input_style_image.image})

        for layer in STYLE_LAYERS:
            features = result[layer]
            features = np.reshape(features, (-1, features.shape[3]))
            gram = np.matmul(features.T, features) / features.size
            style_features[layer] = gram

        # writer.add_graph(sess.graph)
        # writer.add_summary(result['summary'])
        # writer.flush()
        # writer.close()
        return style_features
def score(hypIn, refIn):
    import codecs
    hyp = [util.preprocess(x, "en") for x in codecs.open(hypIn, 'r', 'utf-8').readlines()]
    ref = [util.preprocess(x, "en") for x in codecs.open(refIn, 'r', 'utf-8').readlines()]

    scores = []
    for (h, r) in zip(hyp, ref):
        h, r = list(h), list(r)
        score = eed(h, r)
        scores.append(score)
    return sum(scores) / len(scores)
def plot_CDF(filename, start_selector, end_selector, start_index=None, end_index=None, **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages, start_selector, end_selector,
                              start_index=start_index, end_index=end_index, **kwargs)
    sortedtime = np.sort(intervals.values())
    p = 1. * np.arange(len(intervals.values())) / (len(intervals.values()) - 1)
    plt.plot(sortedtime, p, **kwargs)
def _file_reader(self, filename_queue):
    # read file from queue
    reader = tf.WholeFileReader()
    _, img_bytes = reader.read(filename_queue)
    # decode it
    image_data = tf.image.decode_jpeg(img_bytes, channels=3)
    # preprocess it and return
    return preprocess(image_data, self.config)
def get_rnn_data(N_rows, bucket_size): parse_dates = [['Date', 'Time']] filename = "household_power_consumption.txt" df = preprocess(N_rows, parse_dates, filename) df = pd.DataFrame(bucket_avg(df["Global_active_power"], bucket_size)) df.dropna(inplace=True) x = np.array(range(df.shape[0])) y = np.array(df.Global_active_power) return x, y
def _process_document(self, abs_path):
    """Process a single document; return the document id and the number of
    occurrences of each word in the document."""
    document = doc(abs_path)
    content = document.article + document.abstract
    # tokenize, remove stop words, then lemmatize
    content_tokens = preprocess(content)
    return document.id_, Counter(content_tokens)
def __init__(self, query_string, topk=10):
    """Args:
        query_string: the query string
        topk: number of documents to return
    """
    self.query_string = query_string
    self.query_tokens = preprocess(self.query_string)
    self.topk = topk
def pre_process(self, top_k, file_names, word2vec_file_name, is_train):
    p = preprocess()
    new_file_name = p.new_file_name(word2vec_file_name, top_k)
    # if training, create a new word embedding file of the top k words
    if is_train:
        print("creating new word2vec file of top {} features".format(top_k))
        top_k_words = p.top_k_freq_words(file_names, top_k)
        p.top_k_word2vec(word2vec_file_name, top_k_words, embedding_dimension, new_file_name)
    print("loading word2vec file from ", new_file_name)
    self.read_word_embedding(new_file_name)
def infer_vectors(self, posDic, posLda, negDic, negLda):
    """Infer the topic vectors."""
    pos = ""
    neg = ""
    for (ratings, review) in self.pos_reviews:
        pos = pos + review
    for (ratings, review) in self.neg_reviews:
        neg = neg + review

    pos_tuple = posLda[posDic.doc2bow(preprocess(pos))]
    neg_tuple = negLda[negDic.doc2bow(preprocess(neg))]
    pos_repr = [0] * posLda.num_topics
    neg_repr = [0] * negLda.num_topics
    for k, v in pos_tuple:
        pos_repr[k] = v
    for k, v in neg_tuple:
        neg_repr[k] = v
    self.lda_repr = pos_repr + neg_repr
    return self.lda_repr
def show_results(file_name, items, colors, labels):
    img_show = util.preprocess(file_name, RGB2BGR=False)
    for r in items:
        # print(r)
        img_show = cv2.rectangle(img_show, (r[2], r[3]), (r[2] + r[4], r[3] + r[5]),
                                 color=colors[r[0]], thickness=1)
        font = cv2.FONT_HERSHEY_SIMPLEX
        img_show = cv2.putText(img_show, labels[r[0]], (r[2], r[3]), font, 0.6, colors[r[0]], 2)
        img_show = cv2.putText(img_show, str(r[1]), (r[2] + r[4], r[3] + r[5]), font, 0.3, colors[r[0]], 2)
    plt.imshow(img_show)
    plt.show()
    return img_show
def predict(cls, input_text):
    # get the model and vectorizer
    clf, vectorizer = cls.get_model()
    # clean the text the same way as during training
    preprocessed_text = preprocess(input_text)
    # convert the cleaned text into a vector
    vector = vectorizer.transform([preprocessed_text])
    # use the sklearn logistic regression predict function to make predictions
    return clf.predict(vector)[0], clf.predict_proba(vector)[0]
def perturb(self, x, y, sess):
    sess.run(self.new_vars_initializer)
    sess.run(self.xs.initializer)
    sess.run(self.do_clip_xs, {self.orig_xs: x})
    for i in range(self.num_steps):
        imgs = sess.run(self.xs)
        points = imgs.reshape((-1, 3))
        t = preprocess(imgs, self.codes)
        sess.run(self.train, feed_dict={self.ys: y, self.z: t})
        sess.run(self.do_clip_xs, {self.orig_xs: x})
    return sess.run(self.xs)
def get_style_images(content_img):
    _, ch, cw, cd = content_img.shape
    style_imgs = []
    for style_fn in args.style_imgs:
        path = os.path.join(args.style_imgs_dir, style_fn)
        # bgr image
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        check_image(img, path)
        img = img.astype(np.float32)
        img = cv2.resize(img, dsize=(cw, ch), interpolation=cv2.INTER_AREA)
        img = preprocess(img)
        style_imgs.append(img)
    return style_imgs
def eval(self, args, test_data):
    x_test_batch, y_test_batch = util.preprocess(test_data)
    checkpoint = tf.train.get_checkpoint_state(args.model_folder)
    with tf.Session(graph=self.graph) as sess:
        print("Init models")
        self.saver.restore(sess, checkpoint.model_checkpoint_path)
        acc = sess.run(self.accuracy, feed_dict={
            self.x_plh: x_test_batch,
            self.y_plh: y_test_batch
        })
        print('Accuracy on test data: %f' % acc)
def skills_section(lang):
    assert lang in ["zh", "en"]
    s = "\\section{{{}}}\n\n".format(_["skills"][lang])
    s += "\\vspace{0.618ex}\n"
    s += r"\begin{itemize}"
    s += "\n"
    for i in _["skills"]["details"][lang]:
        s += "\\item " + preprocess(i) + "\n"
    s += r"\end{itemize}"
    s += "\n"
    return s
def get_rnn_data(N_rows, bucket_size): parse_dates = [['Date', 'Time']] filename = "household_power_consumption1.txt" df = preprocess(N_rows, parse_dates, filename) global_power = pd.DataFrame( bucket_avg(df["Global_active_power"], bucket_size)) sub1 = pd.DataFrame(bucket_avg(df["Sub_metering_1"], bucket_size)) sub2 = pd.DataFrame(bucket_avg(df["Sub_metering_2"], bucket_size)) sub3 = pd.DataFrame(bucket_avg(df["Sub_metering_3"], bucket_size)) #df.dropna(inplace=True) #df.iloc[-1, :].index # last time step #2010-11-26 21:00:00 x = np.array(range(global_power.shape[0])) y = np.column_stack((sub1, sub2, sub3, global_power)) return x, y
def correct(sentence):
    sentence = preprocess(sentence)
    tokens = tokenize(sentence)
    print('segmented sentence is:', ''.join([str(token) for token in tokens]))
    seg_range = [[token[1], token[2]] for token in tokens]
    _, _, maybe_error_range = score_sentence(sentence)
    maybe_error_ranges = []
    if maybe_error_range:
        print('maybe error range:', maybe_error_range)
        maybe_error_ranges = merge_ranges(overlap_ranges(maybe_error_range, seg_range))
        for error_range in maybe_error_ranges:
            start_index, end_index = error_range
            print('maybe error words:', sentence[start_index:end_index])
            corrected_words = correct_chars(sentence, start_index, end_index)
            print('corrected words:', corrected_words)
            sentence = sentence[:start_index] + corrected_words + sentence[end_index:]
    return sentence, maybe_error_ranges
def run_process(n_click, algo, metric, gen, filename, label):
    # receives gen, filename, label, budget
    if n_click > 0:
        # remove previous files
        if path.exists('pipeline.csv'):
            os.remove('pipeline.csv')
        if path.exists('ref.csv'):
            os.remove('ref.csv')
        p_ref = pipelineRef(algo)
        X, y = preprocess(filename, label, algo)
        total_time, stat = long_process(gen, X, y, metric, algo)
        stat['status'] = 'Completed'
        print("waiting for 3 seconds to finish processes")
        time.sleep(3)
        return ' ', ' : ' + str(total_time) + " Seconds", True, True, True
    else:
        return ' ', ' ', False, False, False
def plot_lineages(filename, start_selector, end_selector, start_index=None, end_index=None, **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages, start_selector, end_selector,
                              start_index=start_index, end_index=end_index, **kwargs)
    items = intervals.items()
    items.sort(key=lambda i: int(i[0]))
    return plt.plot([i[0] for i in items], [i[1] for i in items], **kwargs)
def has_stabilized(frames):
    if len(frames) < 1:
        return
    preprocessed = [util.preprocess(x) for x in frames]
    sum_diff = 0
    # find sum of diff for each pair of consecutive frames
    for i in range(len(preprocessed)):
        if i + 1 > len(preprocessed) - 1:
            break
        diff = cv2.absdiff(preprocessed[i], preprocessed[i + 1])
        sum_diff += np.sum(diff)
    # normalize diff by number of pixels and frames
    movement = sum_diff / len(preprocessed) / preprocessed[0].size
    return movement < MOVEMENT_THRESHOLD
def get_content_image(content_img):
    path = os.path.join(args.content_img_dir, content_img)
    # bgr image
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    check_image(img, path)
    img = img.astype(np.float32)
    h, w, d = img.shape
    mx = args.max_size
    # resize if > max size
    if h > w and h > mx:
        w = (float(mx) / float(h)) * w
        img = cv2.resize(img, dsize=(int(w), mx), interpolation=cv2.INTER_AREA)
    if w > mx:
        h = (float(mx) / float(w)) * h
        img = cv2.resize(img, dsize=(mx, int(h)), interpolation=cv2.INTER_AREA)
    img = preprocess(img)
    return img
def get_canny(img):
    preprocessed = util.preprocess(img)
    canny = cv2.Canny(preprocessed, threshold1=200, threshold2=50)
    dilated = cv2.dilate(canny, (10, 10))
    return dilated
X_test = []
print "#documents: {}".format(len(glob(corpus_dir + '*.txt')))
for p in glob(corpus_dir + '*.txt'):
    doc_id = os.path.basename(p).split('.')[0]
    if doc_id not in train_doc_ids:
        X_test.append(open(p).read())

X_train = train_docs
label = LabelEncoder().fit(train_labels)
Y_train = label.transform(train_labels)

# preprocessing
X_train = [preprocess(doc) for doc in X_train]
X_test = [preprocess(doc) for doc in X_test]
print "#X_train: {}, #X_test: {}".format(len(X_train), len(X_test))

clf.fit(X_train, Y_train)
Y_test = clf.predict_proba(X_test)

print
for l, i in zip(label.classes_, xrange(Y_test.shape[1])):
    proba = Y_test[:, i]
    print "{}: {}".format(l, len(np.nonzero(proba >= 0.5)[0]))

weights = clf.get_params()['clf'].coef_[0, :]
def get_binary(img, thresh=150):
    preprocessed = util.preprocess(img)
    _, threshold = cv2.threshold(preprocessed, thresh=thresh, maxval=255, type=cv2.THRESH_BINARY)
    return threshold