def f1():
    """Smoke-test reading a ``.npy``, a ``.pkl`` and a ``.txt`` file via FileIO.

    The three reads are independent: each one sits in its own ``try`` block,
    so a failure is printed (status line, then the exception) and the next
    read is still attempted. Every path is ``root_dir`` plus a fixed suffix.
    """
    # NumPy array: print its length and the first ten entries of element 0.
    try:
        npy_path = root_dir + 'data/github/test_codevecs_npy/use.codevecs_0.npy'
        with FileIO(npy_path, mode="rb") as stream:
            vectors = np.load(stream)
            print('读取.npy成功', len(vectors))
            print(vectors[0][:10])
    except Exception as err:
        print('读取.npy失败')
        print(err)

    # Pickled mapping: print its size and the first three keys.
    try:
        pkl_path = root_dir + 'data/github/vocab.apiseq.pkl'
        with FileIO(pkl_path, mode="rb") as stream:
            vocab = pk.load(stream)
            print('读取.pkl成功', len(vocab))
            print(list(vocab.keys())[:3])
    except Exception as err:
        print('读取.pkl失败')
        print(err)

    # Text file: print the first three lines. A shorter file raises
    # IndexError part-way through, which is reported like any other failure.
    try:
        txt_path = root_dir + 'data/github/use.search.txt'
        with FileIO(txt_path, mode="r") as stream:
            rows = stream.readlines()
            print('读取.txt成功')
            for position in range(3):
                print(rows[position])
    except Exception as err:
        print('读取.txt失败')
        print(err)
def get_input():
    """Build shuffled train/test inputs and a pretrained embedding matrix.

    Reads ``imdb/texts.pkl``, ``glove/embedding.300d.npy`` and
    ``glove/words.pkl`` below ``FLAGS.buckets``. The tokenizer is fitted on
    the first 25000 texts only; all texts are then converted to padded id
    sequences (forward and reversed). A third representation interleaves,
    for every position ``j``, the ids at ``j .. j+3`` offset by
    ``0, 1, 2, 3`` times ``num_words``.

    Rows ``[0, 25000)`` form the training split and rows ``[25000, 50000)``
    the test split. In each split the first 12500 labels are 0 and the last
    12500 are 1. One random permutation is applied to every array of both
    splits.

    Returns:
        list: ``[x_train, x_train_reversed, x_train_ngram, x_test,
        x_test_reversed, x_test_ngram, y_train, y_test, embedding_matrix]``.
    """
    bucket = FLAGS.buckets

    # Binary mode: pickle and np.load need bytes, which text mode ('r+') does
    # not deliver on Python 3. ``with`` guarantees the handles are released.
    # NOTE(review): pickle.load executes arbitrary code; only read trusted files.
    with FileIO(os.path.join(bucket, "imdb/texts.pkl"), mode='rb') as f:
        texts = pickle.load(f)

    # ``nb_words`` is the legacy Keras spelling of ``num_words``; kept as-is
    # because the Keras version in use is not visible from here.
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]
    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)
    word_index = tokenizer.word_index

    with FileIO(os.path.join(bucket, "glove/embedding.300d.npy"), mode='rb') as f:
        wordX = np.load(f)
    with FileIO(os.path.join(bucket, "glove/words.pkl"), mode='rb') as f:
        allwords = pickle.load(f)

    # Positional indexing is kept on purpose: the container type stored in
    # words.pkl is not visible from here.
    embeddings_index = {allwords[i]: wordX[i, :] for i in range(len(allwords))}

    # Row i holds the pretrained vector of the word with tokenizer id i;
    # words without a pretrained vector (or with id >= num_words) stay zero.
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = 1.0
    y_test[12500:25000] = 1.0

    # Column j*4+k holds x[:, j+k] + k*num_words. ``np.int`` was removed from
    # NumPy; the builtin ``int`` is the exact equivalent dtype.
    n_windows = max_len - 3
    x_seq = np.zeros((50000, n_windows * 4), dtype=int)
    for k in range(4):
        x_seq[:, k::4] = x[:50000, k:k + n_windows] + num_words * k

    indice = np.arange(25000)
    np.random.shuffle(indice)
    return [
        x[:25000][indice],
        x_reverse[:25000][indice],
        x_seq[:25000][indice],
        x[25000:][indice],
        x_reverse[25000:][indice],
        x_seq[25000:][indice],
        y_train[indice],
        y_test[indice],
        embedding_matrix,
    ]
def prepare_train():
    """Build shuffled (document index, token id) training pairs.

    Loads the first 25000 entries of ``texts.pkl`` plus all of
    ``texts_unsup.pkl`` from ``FLAGS.buckets``, fits a tokenizer limited to
    ``vocab_size`` words and stores it in the module-level ``tokenizer``.
    Every token occurrence yields one pair: ``x`` is the index of the text
    it came from, ``y`` is its token id.

    Returns:
        tuple: ``(x, y, sum_words)`` where ``x`` and ``y`` are int32 arrays
        of shape ``(sum_words, 1)`` shuffled with the same permutation, and
        ``sum_words`` is the total number of tokens.
    """
    global tokenizer
    print("prepare training data")

    # Binary mode: pickle needs bytes, which text mode ('r+') does not
    # deliver on Python 3.
    # NOTE(review): pickle.load executes arbitrary code; only read trusted files.
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb') as f:
        texts = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, "texts_unsup.pkl"), mode='rb') as f:
        texts += pickle.load(f)

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)

    lengths = np.array([len(seq) for seq in sequence], dtype=np.int64)
    sum_words = int(lengths.sum())
    print('there are %d words' % (sum_words))

    # Vectorised equivalent of walking every token: repeat each document
    # index once per token and flatten the token ids in the same order.
    x = np.repeat(np.arange(len(sequence), dtype=np.int32), lengths).reshape(-1, 1)
    y = np.fromiter(
        (token for seq in sequence for token in seq),
        dtype=np.int32,
        count=sum_words,
    ).reshape(-1, 1)

    indice = np.arange(sum_words)
    np.random.shuffle(indice)
    return x[indice], y[indice], sum_words
def prepare_train():
    """Build next-token training data from the labelled and unlabelled texts.

    Loads the first 25000 entries of ``texts.pkl`` and all of
    ``texts_unsup.pkl`` from ``FLAGS.buckets``, tokenizes them without any
    character filtering, and pads/truncates (at the end) every sequence to
    ``MAX_DOCUMENT_LENGTH + 1`` ids.

    Returns:
        tuple: ``(x_1, seq_len, y_)`` where ``x_1`` is every padded sequence
        without its last id, ``y_`` is the same sequence shifted left by one,
        and ``seq_len`` is a list of the unpadded lengths clamped to
        ``MAX_DOCUMENT_LENGTH``.
    """
    print("prepare training data")

    # ``with`` releases the handle even if unpickling fails.
    # NOTE(review): pickle.load executes arbitrary code; only read trusted files.
    with FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb') as f:
        text1 = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb') as f:
        text2 = pickle.load(f)
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''  # keep every character, punctuation included
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)

    # One extra position so inputs and shifted targets both have
    # MAX_DOCUMENT_LENGTH columns.
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = [min(len(seq), MAX_DOCUMENT_LENGTH) for seq in sequence]

    x_1 = sequence_pad[:, :-1]
    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
def f3():
    """Round-trip a small array through FileIO.

    Saves ``[5, 4, 3, 2, 1]`` to ``a.npy`` under ``root_dir``, reads the
    file back and prints the loaded array.
    """
    sample = np.array([5, 4, 3, 2, 1])
    target = root_dir + 'a.npy'

    with FileIO(target, mode="wb") as sink:
        np.save(sink, sample)

    with FileIO(target, mode="rb") as source:
        restored = np.load(source)
        print('读取.npy成功', restored)
def is_database_created(username):
    """Load the user's CSV database into ``DATABASES``, creating it if absent.

    The database lives at ``gs://BUCKET_NAME/<username>.csv``. When
    ``exists_in_gcp`` reports the file, it is read with pandas. Otherwise an
    empty frame with the columns ``username, date, cause, spent`` is stored
    in ``DATABASES`` and written to that location.

    Returns:
        bool: True when the database had to be created, False when an
        existing one was loaded.
    """
    csv_name = "{}.csv".format(username)

    if exists_in_gcp(csv_name):
        with FileIO(os.path.join("gs://", BUCKET_NAME, csv_name), 'r') as handle:
            DATABASES[username] = pd.read_csv(handle)
        return False

    DATABASES[username] = pd.DataFrame(
        columns=["username", "date", "cause", "spent"])
    # NOTE(review): to_csv also writes the index column; confirm the readers
    # of this file expect it.
    with FileIO(os.path.join("gs://", BUCKET_NAME, csv_name), 'w') as handle:
        DATABASES[username].to_csv(handle)
    return True
def load_embeddings(vocab, dim, filename):
    """
    Read the embedding vectors of the words in `vocab` from a text file.

    Args:
        vocab: string->int map from words to their ids (the id is the row of
            the word's vector in the returned matrix). All ids > 0.
        dim: embedding vector dimension
        filename: file where each line is a word followed by `dim` floats,
            all space-separated

    Returns:
        (len(vocab)+1) x dim float32 numpy matrix. Row 0 is never written and
        stays all-zero, so it can serve as the padding vector. Rows of words
        missing from the file stay zero as well.
    """
    matrix = np.zeros((len(vocab) + 1, dim), dtype="float32")

    with FileIO(filename, "r") as stream:
        for lineno, raw in enumerate(stream):
            text = unidecode(raw)

            # Everything before the first space is the word.
            token, separator, _ = text.partition(' ')
            if not separator:
                print("malformed line, no space found: line", lineno)
                continue

            if token in vocab:
                row = vocab[token]
                matrix[row, :] = np.array(text.strip().split()[1:],
                                          dtype="float32")

    return matrix
def update_datasets(self, filter=None):
    """Register the datasets named in the input file list and apply a filter.

    ``self._input_source`` may be:
      * an object with a ``read`` attribute, used as-is and left open;
      * a ``gs://`` string, opened through TensorFlow's ``FileIO``;
      * anything else, passed to the builtin ``open``.

    Each line is stripped and its last ``/``-separated component is matched
    against ``self._re``. For matching lines the dataset is registered via
    ``self.update_dataset`` (path = ``self._prepend_path`` + line) and then
    removed again if ``filter`` returns a falsy value for it.

    Args:
        filter: callable ``(dataset_id, match_components, dataset)``.
            Defaults to ``self._filter``.
    """
    if filter is None:
        filter = self._filter

    close_file = True
    log.info("Updateing datasets from file list: %s", self._input_source)
    if hasattr(self._input_source, 'read'):
        # Caller-owned stream: never close it here.
        input_file = self._input_source
        close_file = False
    elif isinstance(self._input_source, str) and self._input_source.startswith("gs://"):
        log.info("Using tensorflow for IO")
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(self._input_source, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(self._input_source)

    # try/finally so a file opened here is released even when reading or
    # dataset registration raises.
    try:
        for line in input_file.readlines():
            fpath = line.strip()
            file_name = fpath.split("/")[-1]
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
    finally:
        if close_file:
            input_file.close()
def save_model(model, file):
    """
    Persist `model` at `file`, which may live on Google storage.

    The model is first saved to a randomly named local ``.h5`` file, whose
    bytes are then copied to `file` through FileIO. The local file is
    removed afterwards, whether or not the copy succeeded.

    :param model: model
    :param file: output file
    """
    print('Saving model to file {}.'.format(file))

    scratch_path = 'temp_model_{}.h5'.format(randint(0, 100000000))
    model.save(scratch_path)
    try:
        # copy model to google storage
        with FileIO(scratch_path, mode='rb') as src, \
                FileIO(file, mode='wb') as dst:
            payload = src.read()
            dst.write(payload)
    finally:
        remove(scratch_path)
def potential_model(params, **kwargs):
    """Shortcut for generating a potential model from parameters.

    When creating the model from a dictionary, the parameters are dumped to
    ``params.yml`` in ``params['model_dir']`` (the directory is created if
    needed). If a ``params.yml`` with different content already exists, it
    is first renamed with a ``.%y%m%d%H%M`` timestamp suffix.

    The potential model can also be initiated with the model_dir; in that
    case ``params.yml`` must be located in model_dir, and all parameters are
    loaded from it.

    Args:
        params (str or dict): parameter dictionary or the model_dir
        **kwargs: additional options for the estimator, e.g. config

    Returns:
        tf.estimator.Estimator using ``_potential_model_fn``.

    Raises:
        AssertionError: if a model_dir is given and it has no ``params.yml``.
    """
    import os
    import yaml
    from tensorflow.python.lib.io.file_io import FileIO
    from datetime import datetime

    if isinstance(params, str):
        model_dir = params
        assert tf.gfile.Exists('{}/params.yml'.format(model_dir)),\
            "Parameters files not found."
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only point this at trusted model directories.
        with FileIO(os.path.join(model_dir, 'params.yml'), 'r') as f:
            params = yaml.load(f, Loader=yaml.Loader)
    else:
        model_dir = params['model_dir']
        # Dump repeated objects in full instead of as YAML anchors/aliases.
        # This patches the Dumper class itself, i.e. process-wide.
        yaml.Dumper.ignore_aliases = lambda *args: True
        to_write = yaml.dump(params)
        params_path = os.path.join(model_dir, 'params.yml')
        if not tf.gfile.IsDirectory(model_dir):
            tf.gfile.MakeDirs(model_dir)
        if tf.gfile.Exists(params_path):
            with FileIO(params_path, 'r') as f:
                original = f.read()
            if original != to_write:
                # Keep the previous parameters under a timestamped name.
                tf.gfile.Rename(
                    params_path,
                    params_path + '.' + datetime.now().strftime('%y%m%d%H%M'))
        # Closing the handle explicitly guarantees the content is flushed
        # before the estimator is created.
        with FileIO(params_path, 'w') as f:
            f.write(to_write)

    model = tf.estimator.Estimator(model_fn=_potential_model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   **kwargs)
    return model
def get_input():
    """Build shuffled 4-gram id features and labels for train and test.

    Reads ``texts.pkl`` from ``FLAGS.buckets`` and fits a tokenizer on the
    first 25000 texts. The first 50000 texts are lower-cased, split on
    single spaces and mapped through the tokenizer's ``word_index``; unknown
    words and ids ``>= num_words`` become 0. Texts ``[0, 25000)`` are the
    training split, ``[25000, 50000)`` the test split, each padded to
    ``max_len``.

    For every position ``j`` the feature row interleaves the ids at
    ``j .. j+3``, offset by ``0, 1, 2, 3`` times ``num_words``. In each
    split the first 12500 labels are 0 and the last 12500 are 1. One random
    permutation is applied to both splits.

    Returns:
        tuple: ``(Xtrain, Ytrain, Xtest, Ytest)``.
    """
    # Binary mode: pickle needs bytes, which text mode ('r+') does not
    # deliver on Python 3. ``with`` releases the handle even on failure.
    # NOTE(review): pickle.load executes arbitrary code; only read trusted files.
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb') as f:
        texts = pickle.load(f)

    # ``nb_words`` is the legacy Keras spelling of ``num_words``; kept as-is
    # because the Keras version in use is not visible from here.
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    word_index = tokenizer.word_index

    # Manual tokenisation (plain split on ' ') instead of
    # tokenizer.texts_to_sequences, so out-of-vocabulary words keep a slot (0).
    sequences = []
    for i in range(50000):
        ids = (word_index.get(token, 0) for token in texts[i].lower().split(' '))
        sequences.append([index if index < num_words else 0 for index in ids])
    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)

    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = 1.0
    Ytest[12500:25000] = 1.0

    # Column j*4+k holds data[:, j+k] + k*num_words. ``np.int`` was removed
    # from NumPy; the builtin ``int`` is the exact equivalent dtype.
    n_windows = max_len - 3
    Xtrain = np.zeros((25000, n_windows * 4), dtype=int)
    Xtest = np.zeros((25000, n_windows * 4), dtype=int)
    for k in range(4):
        Xtrain[:, k::4] = data1[:, k:k + n_windows] + num_words * k
        Xtest[:, k::4] = data2[:, k:k + n_windows] + num_words * k

    indice = np.arange(25000)
    np.random.shuffle(indice)
    return Xtrain[indice], Ytrain[indice], Xtest[indice], Ytest[indice]
def copy(source, dest):
    """
    Copy `source` (opened through FileIO) to the local file `dest`.

    If `dest` contains a ``/`` and its parent directory is missing, the
    directory (with all intermediate ones) is created first. Data is moved
    in 1 MiB chunks.

    :param source: source file
    :param dest: dest file
    """
    chunk_size = 1024 * 1024

    with FileIO(source, mode='rb') as input_f:
        if '/' in dest:
            parent = dirname(dest)
            if not isdir(parent):
                makedirs(parent)

        with open(dest, mode='wb') as output_f:
            chunk = input_f.read(chunk_size)
            while chunk:
                output_f.write(chunk)
                chunk = input_f.read(chunk_size)
def get_input():
    """Build shuffled bigram id features and labels for train and test.

    Reads ``texts.pkl`` from ``FLAGS.buckets``, fits a tokenizer on the
    first 25000 texts and converts all texts to id sequences. Sequences
    ``[0, 25000)`` are the training split, ``[25000, 50000)`` the test
    split, each padded to ``max_len``.

    For every position ``j`` the feature row interleaves the id at ``j``
    with the id at ``j+1`` offset by ``num_words``. In each split the first
    12500 labels are 0 and the last 12500 are 1. One random permutation is
    applied to both splits.

    Returns:
        tuple: ``(Xtrain, Ytrain, Xtest, Ytest)``.
    """
    # Binary mode: pickle needs bytes, which text mode ('r+') does not
    # deliver on Python 3. ``with`` releases the handle even on failure.
    # NOTE(review): pickle.load executes arbitrary code; only read trusted files.
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb') as f:
        texts = pickle.load(f)

    # ``nb_words`` is the legacy Keras spelling of ``num_words``; kept as-is
    # because the Keras version in use is not visible from here.
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)

    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = 1.0
    Ytest[12500:25000] = 1.0

    # Column j*2+k holds data[:, j+k] + k*num_words. ``np.int`` was removed
    # from NumPy; the builtin ``int`` is the exact equivalent dtype.
    n_windows = max_len - 1
    Xtrain = np.zeros((25000, n_windows * 2), dtype=int)
    Xtest = np.zeros((25000, n_windows * 2), dtype=int)
    for k in range(2):
        Xtrain[:, k::2] = data1[:, k:k + n_windows] + num_words * k
        Xtest[:, k::2] = data2[:, k:k + n_windows] + num_words * k

    indice = np.arange(25000)
    np.random.shuffle(indice)
    return Xtrain[indice], Ytrain[indice], Xtest[indice], Ytest[indice]
def load_data(path, vocab, pad=32, numfiles=0, lowercase=False):
    """Read every file below `path` and concatenate it into one id stream.

    Each file is converted with ``Text2Seq.toseq`` into a sequence plus an
    auxiliary sequence; after each file `pad` zero entries (``0`` and
    ``[0, 0]`` respectively) are appended to both streams.

    Args:
        path: root handed to ``recursively_list_files``.
        vocab: vocabulary passed to ``Text2Seq``.
        pad: number of zero entries appended after each file.
        numfiles: if > 0, stop after this many files.
        lowercase: forwarded as ``vocab_is_lowercase``.

    Returns:
        tuple: ``(X, Xu)`` as int32 and float32 numpy arrays.
    """
    token_ids, aux_feats = [], []
    converter = Text2Seq(vocab, vocab_is_lowercase=lowercase)
    file_names = recursively_list_files(path)

    progress = tqdm(file_names, ascii=True, mininterval=0.5)
    for seen, fname in enumerate(progress, start=1):
        # Process at most `numfiles` files
        if 0 < numfiles and seen > numfiles:
            break

        with FileIO(fname, "r") as handle:
            content = handle.read()
        seq, aux = converter.toseq(content)

        token_ids += seq
        aux_feats += aux
        token_ids += [0] * pad
        aux_feats += [[0, 0]] * pad

    return (np.array(token_ids, dtype="int32"),
            np.array(aux_feats, dtype="float32"))
def load_data_sequences(path, vocab, seqlen, stride, numfiles=0):
    """Read every file below `path` and cut it into windowed sequences.

    Each file is converted with ``Text2Seq.toseq`` into two sequences, and
    both are windowed with ``seqwindows(…, seqlen, stride)`` (the second one
    as float32). The per-file results are concatenated.

    Args:
        path: root handed to ``recursively_list_files``.
        vocab: vocabulary passed to ``Text2Seq``.
        seqlen: forwarded to ``seqwindows``.
        stride: forwarded to ``seqwindows``.
        numfiles: if > 0, stop after this many files.

    Returns:
        tuple: ``(X, Y, Xu, Yu)`` numpy arrays.
    """
    collected = {"X": [], "Y": [], "Xu": [], "Yu": []}
    converter = Text2Seq(vocab)
    file_names = recursively_list_files(path)

    for seen, fname in enumerate(tqdm(file_names, ascii=True), start=1):
        # Process at most `numfiles` files
        if 0 < numfiles and seen > numfiles:
            break

        with FileIO(fname, "r") as handle:
            seq, unk = converter.toseq(handle.read())

        x_win, y_win = seqwindows(seq, seqlen, stride)
        xu_win, yu_win = seqwindows(unk, seqlen, stride, dtype="float32")
        collected["X"].append(x_win)
        collected["Y"].append(y_win)
        collected["Xu"].append(xu_win)
        collected["Yu"].append(yu_win)

    X = np.concatenate(collected["X"])
    Y = np.concatenate(collected["Y"])
    Xu = np.concatenate(collected["Xu"])
    Yu = np.concatenate(collected["Yu"])
    return X, Y, Xu, Yu
def train():
    """Train the ``embedding_doc`` table with an NCE loss and save it.

    Builds a TF1 graph in which the ids fed through ``x_place`` are looked
    up in a ``[num_ngram, 300]`` embedding table and trained, via
    ``tf.nn.nce_loss``, against the ids fed through ``y_place``. The NCE
    weights are initialised from the matrix returned by ``prepare_data()``.

    Runs 1,000,000 Adam steps on batches from ``get_input(x, y, start)``,
    printing the loss every 300 steps. Afterwards the learned table is
    written to ``ngram_embedding.npy`` under ``FLAGS.buckets``.
    """
    x_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    y_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))

    with tf.device("/cpu:0"):
        embedding_doc = tf.Variable(
            tf.random_uniform([num_ngram, 300], -0.5, 0.5))
        nce_weights = tf.get_variable('nce_weights_words', [num_words, 300],
                                      trainable=True)
        nce_biases = tf.Variable(tf.zeros([num_words]), trainable=True)
        input_1 = tf.nn.embedding_lookup(embedding_doc, x_place)
        # Drop the singleton axis introduced by the (batch_size, 1) ids.
        input_2 = tf.reshape(input_1, [-1, 300])

    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=y_place,
                       inputs=input_2,
                       num_sampled=num_sampled,
                       num_classes=num_words))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    init = tf.global_variables_initializer()

    sess = tf.Session()
    try:
        sess.run(init)

        x, y, embedding_metrix = prepare_data()
        # Overwrite the freshly initialised NCE weights with the prepared matrix.
        sess.run(tf.assign(nce_weights, embedding_metrix))

        start = 0
        for i in range(1000000):
            x_1, _y, start = get_input(x, y, start)
            _loss, _ = sess.run([loss, optimizer], feed_dict={
                x_place: x_1,
                y_place: _y
            })
            if i % 300 == 0:
                print(i, " loss ", _loss)

        # ``with`` closes the handle so the array is flushed to storage
        # before the function returns.
        out_path = os.path.join(FLAGS.buckets, "ngram_embedding.npy")
        with FileIO(out_path, mode='w+') as f:
            np.save(f, embedding_doc.eval(sess))
    finally:
        # Release the session's resources even if training fails.
        sess.close()