Example #1
def f1():
    try:
        file_path = root_dir + 'data/github/test_codevecs_npy/use.codevecs_0.npy'
        with FileIO(file_path, mode="rb") as fio:
            code_vec = np.load(fio)
            print('Loaded .npy successfully:', len(code_vec))
            print(code_vec[0][:10])
    except Exception as e:
        print('Failed to load .npy')
        print(e)

    try:
        file_path = root_dir + 'data/github/vocab.apiseq.pkl'
        with FileIO(file_path, mode="rb") as fio:
            api_seq_vocab = pk.load(fio)
            print('Loaded .pkl successfully:', len(api_seq_vocab))
            print(list(api_seq_vocab.keys())[:3])
    except Exception as e:
        print('Failed to load .pkl')
        print(e)

    try:
        file_path = root_dir + 'data/github/use.search.txt'
        with FileIO(file_path, mode="r") as fio:
            lines = fio.readlines()
            print('Loaded .txt successfully')
            print(lines[0])
            print(lines[1])
            print(lines[2])
    except Exception as e:
        print('Failed to load .txt')
        print(e)
Example #2
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='rb'))
    allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='rb'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)

    x_seq = np.zeros((50000, (max_len - 3) * 4), dtype=np.int64)
    for i in range(50000):
        for j in range(max_len - 3):
            x_seq[i, j * 4] = x[i, j]
            x_seq[i, j * 4 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 4 + 2] = x[i][j + 2] + num_words * 2
            x_seq[i, j * 4 + 3] = x[i][j + 3] + num_words * 3

    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_train_2 = x_seq[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    x_test_2 = x_seq[25000:]

    result = []

    indice = np.arange(25000)
    np.random.shuffle(indice)
    result.append(x_train_0[indice])
    result.append(x_train_1[indice])
    result.append(x_train_2[indice])
    result.append(x_test_0[indice])
    result.append(x_test_1[indice])
    result.append(x_test_2[indice])
    result.append(y_train[indice])
    result.append(y_test[indice])
    
    result.append(embedding_matrix)
    return result
Example #3
def prepare_train():
    global tokenizer
    print("prepare training data")
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, "texts_unsup.pkl"), mode='r+') as f:
        texts += pickle.load(f)

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sum_words = sum([len(seq) for seq in sequence])
    print('there are %d words' % (sum_words))
    x = np.zeros((sum_words, 1), dtype=np.int32)
    y = np.zeros((sum_words, 1), dtype=np.int32)
    index = 0
    for i, seq in enumerate(sequence):
        for s in seq:
            x[index] = i
            y[index] = s
            index += 1
    indice = np.arange(sum_words)
    np.random.shuffle(indice)
    x = x[indice]
    y = y[indice]
    return x, y, sum_words
Example #4
def prepare_train():
    print("prepare training data")
    f = FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    x_1 = sequence_pad[:, :-1]

    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
Example #5
def f3():
    a = np.array([5, 4, 3, 2, 1])
    file_path = root_dir + 'a.npy'
    with FileIO(file_path, mode="wb") as fio:
        np.save(fio, a)
    with FileIO(file_path, mode="rb") as fio:
        code_vec = np.load(fio)
    print('Loaded .npy successfully:', code_vec)
Example #6
def is_database_created(username):
    filename = "{}.csv".format(username)
    file_exists = exists_in_gcp(filename)
    if file_exists:
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'r') as f:
            DATABASES[username] = pd.read_csv(f)
    else:
        DATABASES[username] = pd.DataFrame(
            columns=["username", "date", "cause", "spent"])
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'w') as f:
            DATABASES[username].to_csv(f)
    return not file_exists
Example #7
def load_embeddings(vocab, dim, filename):
    """
    Load a subset of embedding vectors from file corresponding to vocabulary provided.
    Args:
        vocab: string->int map from words to their ids (id corresponds to vector's row in the resulting embedding
             matrix). All ids > 0.
        dim: embedding vector dimension
        filename: file where each line is a word followed by `dim` floats, all space-separated

    Returns:
        MxN = (len(vocab)+1) x dim numpy embedding matrix.
        The +1 for M is because 0th vector is a zero vector for padding.
    """
    em = np.zeros((len(vocab) + 1, dim), dtype="float32")

    # with FileIO(filename, "r", encoding="utf-8") as f:
    with FileIO(filename, "r") as f:
        for linenum, line in enumerate(f):
            line = unidecode(line)
            idx = line.find(' ')
            if idx < 0:
                print("malformed line, no space found: line", linenum)
                continue
            word = line[:idx]
            if word not in vocab:
                continue
            i = vocab[word]

            em[i, :] = np.array(line.strip().split()[1:], dtype="float32")

    return em
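
A minimal usage sketch for load_embeddings; the vocabulary and the GCS path below are hypothetical placeholders, and the file is assumed to follow the word-then-floats format described in the docstring:

# Hypothetical example: ids start at 1 so that row 0 stays the zero padding vector.
vocab = {"the": 1, "cat": 2, "sat": 3}
embedding_matrix = load_embeddings(vocab, dim=300,
                                   filename="gs://my-bucket/glove.300d.txt")
print(embedding_matrix.shape)  # (len(vocab) + 1, 300) == (4, 300)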
Example #8
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        close_file = True
        log.info("Updateing datasets from file list: %s", self._input_source)
        if hasattr(self._input_source, 'read'):
            input_file = self._input_source
            close_file = False
        elif isinstance(self._input_source, str) and self._input_source.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._input_source, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._input_source)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        if close_file:
            input_file.close()
Example #9
def save_model(model, file):
    """
    Save model to the given file (potentially Google storage).

    :param model: model
    :param file: output file
    """
    print('Saving model to file {}.'.format(file))
    temp_file = 'temp_model_{}.h5'.format(randint(0, 100000000))
    model.save(temp_file)
    try:
        # copy model to google storage
        with FileIO(temp_file, mode='rb') as input_f:
            with FileIO(file, mode='wb') as output_f:
                output_f.write(input_f.read())
    finally:
        remove(temp_file)
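
A usage sketch for save_model, assuming a small compiled Keras model and a hypothetical Cloud Storage destination:

from tensorflow import keras

# Hypothetical model and bucket path; any trained Keras model would work here.
model = keras.Sequential([keras.layers.Dense(1, input_shape=(10,))])
model.compile(optimizer='adam', loss='mse')
save_model(model, 'gs://my-bucket/models/model.h5')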
Example #10
def potential_model(params, **kwargs):
    """Shortcut for generating potential model from paramters

    When creating the model, a params.yml is automatically created 
    in model_dir containing network_params and model_params.

    The potential model can also be initiated with the model_dir, 
    in that case, params.yml must locate in model_dir from which
    all parameters are loaded

    Args:
        params(str or dict): parameter dictionary or the model_dir
        **kwargs: additional options for the estimator, e.g. config
    """
    import os
    import yaml
    from tensorflow.python.lib.io.file_io import FileIO
    from datetime import datetime

    if isinstance(params, str):
        model_dir = params
        assert tf.gfile.Exists('{}/params.yml'.format(model_dir)),\
            "Parameters files not found."
        with FileIO(os.path.join(model_dir, 'params.yml'), 'r') as f:
            params = yaml.load(f, Loader=yaml.Loader)
    else:
        model_dir = params['model_dir']
        yaml.Dumper.ignore_aliases = lambda *args: True
        to_write = yaml.dump(params)
        params_path = os.path.join(model_dir, 'params.yml')
        if not tf.gfile.IsDirectory(model_dir):
            tf.gfile.MakeDirs(model_dir)
        if tf.gfile.Exists(params_path):
            original = FileIO(params_path, 'r').read()
            if original != to_write:
                tf.gfile.Rename(
                    params_path,
                    params_path + '.' + datetime.now().strftime('%y%m%d%H%M'))
        with FileIO(params_path, 'w') as f:
            f.write(to_write)

    model = tf.estimator.Estimator(model_fn=_potential_model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   **kwargs)
    return model
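
A usage sketch for the two construction paths the docstring describes; the directory and parameter contents below are hypothetical placeholders:

# Hypothetical: build from a parameter dict; params.yml is written into model_dir.
params = {'model_dir': '/tmp/potential_model',
          'network_params': {},  # placeholder network settings
          'model_params': {}}    # placeholder model settings
model = potential_model(params)

# Or re-open an existing model_dir whose params.yml was written earlier.
model = potential_model('/tmp/potential_model')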
Example #11
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)

    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)
    Xtest = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 3):
            Xtrain[i, j * 4] = data1[i, j]
            Xtrain[i, j * 4 + 1] = data1[i][j + 1] + num_words
            Xtrain[i, j * 4 + 2] = data1[i][j + 2] + num_words * 2
            Xtrain[i, j * 4 + 3] = data1[i][j + 3] + num_words * 3
    for i in range(25000):
        for j in range(max_len - 3):
            Xtest[i, j * 4] = data2[i, j]
            Xtest[i, j * 4 + 1] = data2[i][j + 1] + num_words
            Xtest[i, j * 4 + 2] = data2[i][j + 2] + num_words * 2
            Xtest[i, j * 4 + 3] = data2[i][j + 3] + num_words * 3

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
Example #12
def copy(source, dest):
    """
    Copy from source to dest, creating all necessary dirs.

    :param source: source file
    :param dest: dest file
    """
    with FileIO(source, mode='rb') as input_f:
        if '/' in dest and not isdir(dirname(dest)):
            makedirs(dirname(dest))
        with open(dest, mode='wb') as output_f:
            while 1:
                buf = input_f.read(1024 * 1024)
                if not buf:
                    break
                output_f.write(buf)
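
A usage sketch for copy with hypothetical paths; FileIO reads the gs:// source transparently while the destination is written with plain open(), so the target must be local:

# Hypothetical paths: stream a model file from Cloud Storage to local disk.
copy('gs://my-bucket/checkpoints/model.h5', '/tmp/checkpoints/model.h5')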
Example #13
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    Xtest = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain[i, j * 2] = data1[i, j]
            Xtrain[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest[i, j * 2] = data2[i, j]
            Xtest[i, j * 2 + 1] = data2[i][j + 1] + num_words

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
Example #14
def load_data(path, vocab, pad=32, numfiles=0, lowercase=False):
    X, Xu = [], []
    t2s = Text2Seq(vocab, vocab_is_lowercase=lowercase)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True, mininterval=0.5)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        # with FileIO(fname, "r", encoding="utf-8") as f:
        with FileIO(fname, "r") as f:
            text = f.read()
            seq, aux = t2s.toseq(text)
            X.extend(seq)
            Xu.extend(aux)
            X.extend([0] * pad)
            Xu.extend([[0, 0]] * pad)

    X = np.array(X, dtype="int32")
    Xu = np.array(Xu, dtype="float32")
    return X, Xu
Example #15
def load_data_sequences(path, vocab, seqlen, stride, numfiles=0):
    XX, YY, XXu, YYu = [], [], [], []
    t2s = Text2Seq(vocab)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        with FileIO(fname, "r") as f:
            seq, unk = t2s.toseq(f.read())
            Xi, Yi = seqwindows(seq, seqlen, stride)
            Xui, Yui = seqwindows(unk, seqlen, stride, dtype="float32")
            XX.append(Xi)
            YY.append(Yi)
            XXu.append(Xui)
            YYu.append(Yui)
    X = np.concatenate(XX)
    Y = np.concatenate(YY)
    Xu = np.concatenate(XXu)
    Yu = np.concatenate(YYu)
    return X, Y, Xu, Yu
Example #16
def train():
    x_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    y_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    with tf.device("/cpu:0"):
        embedding_doc = tf.Variable(
            tf.random_uniform([num_ngram, 300], -0.5, 0.5))
        nce_weights = tf.get_variable('nce_weights_words', [num_words, 300],
                                      trainable=True)
        nce_biases = tf.Variable(tf.zeros([num_words]), trainable=True)

        input_1 = tf.nn.embedding_lookup(embedding_doc, x_place)
    input_2 = tf.reshape(input_1, [-1, 300])
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=y_place,
                       inputs=input_2,
                       num_sampled=num_sampled,
                       num_classes=num_words))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)
    x, y, embedding_matrix = prepare_data()
    init_nce = tf.assign(nce_weights, embedding_matrix)
    sess.run(init_nce)
    start = 0
    for i in range(1000000):
        x_1, _y, start = get_input(x, y, start)
        # _loss, _ = sess.run([loss, optimizer], feed_dict={x1_place: x_1, x2_place: x_2, y_place: _y})
        _loss, _ = sess.run([loss, optimizer],
                            feed_dict={
                                x_place: x_1,
                                y_place: _y
                            })
        if i % 300 == 0:
            print(i, " loss ", _loss)
    np.save(
        FileIO(os.path.join(FLAGS.buckets, "ngram_embedding.npy"), mode='w+'),
        embedding_doc.eval(sess))