Example #1
import re

import numpy as np
import jieba
from gensim.models import Word2Vec

sentences = []

with open("/home/siyuan/data/ner_sample.txt", "r") as f:
    cont = f.read()
    lines = cont.split("\n")
    idx = np.random.permutation(len(lines))
    for i in idx[:50000]:
        sentences.append(list(jieba.cut(lines[i], HMM=True)))

print(sentences)

model = Word2Vec(sentences,
                 sg=1,
                 size=100,
                 window=5,
                 min_count=5,
                 negative=3,
                 sample=0.001,
                 hs=1,
                 workers=4)

model.save("./word2vec_gensim.model")


def isstopword(word):
    if word == "_":
        return True
    if len(re.findall("(.先生|.女士)", word)) > 0:
        return True
    else:
        return False
Example #2
    f = s.replace("\n", " ")

    # iterate through each sentence in the file
    for i in sent_tokenize(f):
        temp = []

        # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())

        data.append(temp)

sentences = data

# train model
model = Word2Vec(sentences, min_count=1)

# summarize the loaded model
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print(words)
#model.save('model.bin')

X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

plt.scatter(result[:, 0], result[:, 1])
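# Not part of the original snippet: a minimal follow-up sketch, assuming the same
# matplotlib (plt), PCA result, and `words` list defined above, that labels each
# projected word and displays the figure.
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()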
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 18 16:04:35 2019

@author: chenjiannan
"""

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

path = get_tmpfile("word2vec.model")

model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

model.build_vocab([["hello", "world"]], update=True)
model.train([["hello", "world"]], total_examples=1, epochs=1)


model.wv['hello']


from gensim.test.utils import datapath
Example #4
tokenized_file = "tokenized_dataset.pickle"

# load our tokenized dataset
ids, questions, answers, all_answers = dl.get_dataset_tokens_loaded(dataset_file, tokenized_file)

# we want all vocabulary
alldocs = questions + all_answers

# for reshuffling per pass
doc_list = alldocs[:]

print('Input %d docs in total' % (len(doc_list)))

assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model = Word2Vec(size=word_dim, window=10, min_count=1, workers=32)
model.build_vocab(alldocs)
# We only want to train on new words, so intersect with google pre-trained.
model.intersect_word2vec_format(g_pretrain_bin_file, binary=True, lockf=0.0)
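
# Sketch, not in the original snippet: the training step that would typically follow,
# written against the gensim 3.x API used throughout these examples. Because
# lockf=0.0 locks the intersected pre-trained vectors, only words missing from the
# Google model are actually updated here.
model.train(alldocs, total_examples=len(alldocs), epochs=5)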


@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end - start


def cwidvec2str(cwid, vec):
Example #5
    with open(pathname) as file:
        noisy_words = [word.replace("\n", "") for word in file.readlines()]

    for odd_word in noisy_words:
        # newlines were already stripped from noisy_words above
        text = text.replace(" {} ".format(odd_word), " ")
    return text

df = pd.read_csv("data/clean_qa.csv", sep='\t')

noisy_words_filepath = "res/noisy_words.txt"
sentences = [
        [Porter.stem(word) for word in delete_noisy_words(sentence.lower(), noisy_words_filepath).split() if word]
        for sentence in df["Question"] if sentence]

model = Word2Vec(sentences, size=100, batch_words=5, window=4, min_count=5)
word_vectors = KeyedVectors.load("model/word2vec.model")

vocab = word_vectors.wv.vocab

def preprocessQuery(sentence):
    """Splits sentence into words and intersects it with vocabulaty of the trained model"""
    preprocessed = [Porter.stem(word) for word in
                    delete_noisy_words(sentence.lower(), noisy_words_filepath).split() if word]
    return preprocessed


def intersectWithVocab(sentence):
    sentence = [word for word in sentence if word in vocab]
    return sentence
Example #6
def wmd_query_k_con_4_dia(train_dia, train_con, val_dia, val_con, infer_dia,
                          infer_con, k_set, out_path):
    """
    使用BM25算法,对给定的诊断文本,查询k个候选概念
    :param train_dia:
    :param train_con:
    :param val_dia:
    :param val_con:
    :param infer_dia:
    :param infer_con:
    :param k_set:
    :param out_path:
    :return:
    """
    start_time = time.time()

    train_dia_f = open(train_dia, 'r', encoding='utf-8').readlines()
    train_dia_f = [i.split('\t')[0].rstrip('\n') for i in train_dia_f]
    val_dia_f = open(val_dia, 'r', encoding='utf-8').readlines()
    val_dia_f = [i.split('\t')[0].rstrip('\n') for i in val_dia_f]
    infer_dia_f = open(infer_dia, 'r', encoding='utf-8').readlines()
    infer_dia_f = [i.split('\t')[0].rstrip('\n') for i in infer_dia_f]

    train_con_f = open(train_con, 'r', encoding='utf-8').readlines()
    train_con_f = [i.rstrip('\n') for i in train_con_f]
    val_con_f = open(val_con, 'r', encoding='utf-8').readlines()
    val_con_f = [i.rstrip('\n') for i in val_con_f]
    infer_con_f = open(infer_con, 'r', encoding='utf-8').readlines()
    infer_con_f = [i.rstrip('\n') for i in infer_con_f]

    all_dia = train_dia_f + val_dia_f + infer_dia_f
    all_dia = list(set(all_dia))
    all_dia_num = len(all_dia)
    all_con = train_con_f + val_con_f + infer_con_f
    all_con = list(set(all_con))
    all_con_num = len(all_con)
    all_infer_con = list(set(infer_con_f))
    all_infer_con_num = len(all_infer_con)

    corpus = all_dia + all_con
    corpus = [i.split(' ') for i in corpus]  # TODO: load stop words
    corpus_dict = {}
    for idx, line in enumerate(corpus):
        corpus_dict[idx] = line

    w2v_model = Word2Vec(corpus, size=512, min_count=1, window=3,
                         sg=0)  # sg=0: CBOW; sg=1: skip-gram
    wmd_model = WmdSimilarity(corpus=corpus,
                              w2v_model=w2v_model,
                              num_best=len(corpus))

    y_pred_acc = [0 for _ in range(len(infer_con_f))]
    y_pred_acc_5 = [0 for _ in range(len(infer_con_f))]
    y_pred_acc_10 = [0 for _ in range(len(infer_con_f))]
    y_true = [1 for _ in range(len(infer_con_f))]
    result = open(out_path + 'result.txt', 'w', encoding='utf-8')
    for idx, query_dia in enumerate(
            tqdm(infer_dia_f, desc='Inferring...', leave=False)):
        query_result = wmd_model[query_dia.split(' ')]

        candidate_con = {}
        for idx_value in query_result:
            if idx_value[0] > all_dia_num - 1:
                candidate_con[idx_value[0]] = idx_value[1]

        assert len(candidate_con) == all_con_num, \
            'Number of extracted concept texts is not {}!'.format(all_con_num)

        candidate_txt_value = {}
        for key, value in candidate_con.items():
            candidate_txt_value[' '.join(corpus_dict[key])] = value

        assert len(candidate_con) == all_con_num, 'Number of retrieved candidate concepts does not match the actual number of candidates!'
        sort_candidate_txt_value = sorted(candidate_txt_value.items(),
                                          key=lambda x: x[1],
                                          reverse=True)

        max_10_con = [txt_value[0] for txt_value in sort_candidate_txt_value]

        if infer_con_f[idx] == max_10_con[0]:
            y_pred_acc[idx] = 1
        if infer_con_f[idx] in max_10_con[:5]:
            y_pred_acc_5[idx] = 1
        if infer_con_f[idx] in max_10_con[:10]:
            y_pred_acc_10[idx] = 1

    acc = accuracy_score(y_true, y_pred_acc)
    acc_5 = accuracy_score(y_true, y_pred_acc_5)
    acc_10 = accuracy_score(y_true, y_pred_acc_10)
    f1 = f1_score(y_true, y_pred_acc)

    end_time = time.time()
    take_time = end_time - start_time
    per_item_time = take_time / len(infer_con_f)

    print('Dataset {}: Acc={}, Acc@5={}, Acc@10={}, F1={}, total time {}, avg time per item {}'.format(
        data_set, round(acc, 3), round(acc_5, 3), round(acc_10, 3),
        round(f1, 3), take_time, round(per_item_time, 2)))

    result.write(
        'Dataset {}: Acc={}, Acc@5={}, Acc@10={}, F1={}, total time {}, avg time per item {}'.format(
            data_set, acc, acc_5, acc_10, f1, take_time, per_item_time))
Example #7
    english[english.label == 1][test.columns]
]).reset_index()

# Spanish question 1
data['spa_qura_list_1'] = data['spa_qura1'].apply(lambda x: x.split(' '))
# Spanish question 2
data['spa_qura_list_2'] = data['spa_qura2'].apply(lambda x: x.split(' '))
# collect both Spanish question columns into a single list
spa_list = list(data['spa_qura_list_1'])
spa_list.extend(list(data['spa_qura_list_2']))
# train Word2Vec on the Spanish questions
model = Word2Vec(spa_list,
                 sg=1,
                 size=30,
                 window=5,
                 min_count=1,
                 negative=3,
                 sample=0.001,
                 hs=1,
                 workers=8)


def seq_to_w2v(seq, model):
    words = []
    default = [0 for x in range(30)]
    for i in range(30):
        if i < len(seq):
            words.extend(model[seq[i]])
        else:
            words.extend(default)
    return words
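
# Sketch usage, not in the original: seq_to_w2v pads or truncates each question to 30
# tokens and concatenates their 30-dimensional vectors into one 900-dimensional row.
# The feature column name below is illustrative only.
# data['spa_feat_1'] = data['spa_qura_list_1'].apply(lambda seq: seq_to_w2v(seq, model))
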
text = re.sub(
    r'\d', ' ', text
)  ## Matches any Unicode digit (which includes [0-9], and also many other digit characters)
text = re.sub(
    r'\s+', ' ', text
)  ## Matches Unicode whitespace characters (which includes [ \t\n\r\f\v], and also many other characters)

## Preparing dataset
sentences = nltk.sent_tokenize(text)  # paragraph into sentences

sentences = [nltk.word_tokenize(sentence)
             for sentence in sentences]  # sentences into words

for i in range(len(sentences)):
    sentences[i] = [
        word for word in sentences[i]
        if word not in set(stopwords.words('english'))
    ]

## Training Word2Vec model
model = Word2Vec(sentences,
                 min_count=1)  # keep every word that appears at least once

words = model.wv.vocab  ## vocabularies in Word2Vec model

## Finding word vectors
vector = model.wv[
    'college']  # 100-dimensional vector for the word 'college'.

## Most similar words
similar = model.wv.most_similar('college')  # words most similar to 'college'
def word2vec_model(train_data):
    model = Word2Vec(train_data, size=30, window=3, min_count=1, iter=20)
    return model
Example #10
# on_the            1052
# at_the            1035
# we_'re            1033
# i_was             1018
# of_the            1014
# ca_n't            1010
# are_you            994

bigram_model = Word2Vec(bigram[sentences], size=100)
bigram_model_counter = Counter()
for key in bigram_model.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_model_counter[key] += bigram_model.vocab[key].count

for key, counts in bigram_model_counter.most_common(50):
    print('{0: <20} {1}'.format(key, counts))

# Sample output (truncated):
# do_n't            2436
# gon_na            1576
# ca_n
def main(argv):

    # ======================= +
    #                        /
    #    D E F A U L T S    /
    #                      /
    # ------------------- +

    # Default log level.
    logging.basicConfig(level=logging.INFO)

    # Default data directory.
    data_dir = ''

    # Default output directory.
    output_dir = 'output'

    # ================================= +
    #                                  /
    #    P A R S E  C L I  A R G S    /
    #                                /
    # ----------------------------- +

    # Parse cli args.
    try:
        opts, args = getopt.getopt(argv, "hvd:o:", ['data_dir=', 'output_dir='])
    except getopt.GetoptError:
        print('data_prep_jigsaw.py -d <data_dir> -o <output_dir>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('data_prep_jigsaw.py -d <data_dir> -o <output_dir>\n' +
                  'Defaults:\n' + '  data_dir=\t\t' +
                  '  output_dir=\t\toutput')
            sys.exit()
        elif opt == '-v':
            logging.getLogger().setLevel(logging.DEBUG)
        elif opt == '-d':
            data_dir = str(arg)
        elif opt == '-o':
            output_dir = str(arg)

    # Set vars with default or passed-in values.

    # Path to data directory.
    data_path = Path(data_dir)

    # Output path.
    output_path = Path(output_dir)

    # Set vars with default or passed-in values.

    # Get the data, create dataframes from the CSVs.
    train_path = data_path / 'train.csv'
    train_df = pd.read_csv(train_path, header=0)

    test_path = data_path / 'test.csv'
    test_df = pd.read_csv(test_path, header=0)

    test_labels_path = data_path / 'test_labels.csv'
    test_labels_df = pd.read_csv(test_labels_path, header=0)

    # Print heads if debug.
    logging.debug(train_df.head())
    logging.debug(test_df.head())
    logging.debug(test_labels_df.head())

    # ============================== +
    #                               /
    #    P R E P A R E  D A T A    /
    #                             /
    # --------------------------- +

    # Drop everything except for comment_text and the labels.
    train_df.drop(['id'], axis=1, inplace=True)

    # Merge test with test labels.
    merged_test_df = pd.merge(test_df, test_labels_df, on='id')

    # Get list of records with -1 for labels (they weren't used in kaggle evaluation).

    # Drop them.
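    # Sketch, not in the original (which leaves this step as comments only): assuming
    # the standard Jigsaw label columns, the unused records could be dropped like so.
    # unused_idx = merged_test_df[merged_test_df['toxic'] == -1].index
    # merged_test_df = merged_test_df.drop(unused_idx)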

    # Create output dir if it doesn't exist.
    try:
        os.makedirs(output_path)
    except FileExistsError:
        logging.info('Output directory already exists.')

    # Write features and labels to disk.
    csv_path = output_path / 'raw_train_set.csv'
    train_df.to_csv(csv_path)

    # =================================================== +
    #                                                    /
    #    P R E P A R E  W O R D  E M B E D D I N G S    /
    #                                                  /
    # ----------------------------------------------- +

    # Build vocabulary and word embeddings from source if needed.

    # Store records
    all_labels = []
    tokens = []
    maxsentlen = 0
    maxdoclen = 0
    num_dropped = 0

    # Process csv one line at a time
    with open(csv_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        lineno = 0
        idx = 0
        for line in csv_reader:
            # Count lines (csv.DictReader has already consumed the header row).
            lineno += 1
            sys.stdout.write("Processing line %i     \r" % lineno)
            sys.stdout.flush()

            # Begin at index 1.
            idx += 1

            # TODO This is coupled to this field. Change to arg?
            text = line['comment_text']

            # Process raw text.

            # Force lowercase.
            text = text.lower()

            # Remove unwanted tokens.
            text = re.sub("\\n", ' ', text)
            text = re.sub("\\t", ' ', text)

            # Remove single and double backticks.
            text = re.sub("`", '', text)

            # Remove single quotes.
            text = re.sub("'", '', text)

            # Replace multiple periods in sequence with one period.
            text = re.sub("\.{2,}", '.', text)

            # Replace everything except words, '.', '|', '?', and '!' with space.
            text = re.sub('[^\w_|\.|\?|!]+', ' ', text)

            # Replace periods with ' . '.
            text = re.sub('\.', ' . ', text)

            # Replace '?' with ' ? '.
            text = re.sub('\?', ' ? ', text)

            # Replace '!' with ' ! '.
            text = re.sub('!', ' ! ', text)

            # Tokenize by splitting on whitespace.
            # No leading or trailing whitespace is kept.
            # Consecutive spaces are treated as a single space.
            text = text.split()

            # Drop empty reviews.
            if len(text) == 0:
                num_dropped += 1
                continue

            # Split into sentences.
            sentences = []
            sentence = []
            for t in text:
                # Use '.', '!', '?' as markers of end of sentence.
                if t not in ['.', '!', '?']:
                    # Not at end of a sentence.
                    sentence.append(t)
                else:
                    # At end of a sentence.
                    sentence.append(t)

                    # Add sentence to sentences.
                    sentences.append(sentence)

                    # Track longest sentence.
                    if len(sentence) > maxsentlen:
                        maxsentlen = len(sentence)

                    # Reset sentence list.
                    sentence = []

            # If sentence has word, add to list of sentences.
            if len(sentence) > 0:
                sentences.append(sentence)

            # Add split sentences to tokens.
            tokens.append(sentences)

            # Track longest document.
            if len(sentences) > maxdoclen:
                maxdoclen = len(sentences)

            # Build list of labels for record.
            doc_labels = []
            doc_labels.append(line['toxic'])
            doc_labels.append(line['severe_toxic'])
            doc_labels.append(line['obscene'])
            doc_labels.append(line['threat'])
            doc_labels.append(line['insult'])
            doc_labels.append(line['identity_hate'])

            # Add list of labels to list of all labels.
            all_labels.append(doc_labels)

    # Use all processed raw text to train word2vec.
    allsents = [sent for doc in tokens for sent in doc]
    # TODO Make embedding size a cli arg w/ default of 300.
    embedding_size = 300
    model = Word2Vec(allsents,
                     min_count=5,
                     size=embedding_size,
                     workers=4,
                     iter=5)
    model.init_sims(replace=True)

    # Save all word embeddings to matrix
    vocab = np.zeros((len(model.wv.vocab) + 1, embedding_size))
    word2id = {}

    # First row of embedding matrix isn't used so that 0 can be masked.
    for key, val in model.wv.vocab.items():
        # Begin indexes with offset of 1.
        idx = val.__dict__['index'] + 1

        # Build 2D np array (idx, vector)
        vocab[idx, :] = model[key]

        # Dictionary mapping word to index.
        word2id[key] = idx

    # Switch keys/values and store id2word dictionary (for decoding examples).
    id2word = {y: x for x, y in word2id.items()}

    # Normalize embeddings.
    vocab -= vocab.mean()
    vocab /= (vocab.std() * 2)

    # Reset first row to 0.
    vocab[0, :] = np.zeros(embedding_size)

    # Add additional word embedding for unknown words.
    vocab = np.concatenate((vocab, np.random.rand(1, embedding_size)))

    # Index for unknown words.
    unk = len(vocab) - 1

    # Convert words to word indices.
    data = {}
    for idx, doc in enumerate(tokens):
        sys.stdout.write('processing %i of %i records       \r' %
                         (idx + 1, len(tokens)))
        sys.stdout.flush()
        dic = {}

        # Get label for each index.
        dic['labels'] = all_labels[idx]

        # Get text of each document.
        dic['text'] = doc

        # Build list of indices representing the words of each sentence;
        # if a word is not in the word2id mapping, use unk (i.e. vocab[len(vocab)-1]).
        indicies = []
        for sent in doc:
            indicies.append(
                [word2id[word] if word in word2id else unk for word in sent])

        # Add indices to dictionary.
        dic['idx'] = indicies

        # Add dictionary containing label, text, indices to data dictionary at index.
        data[idx] = dic

    # Write data dictionary to file.
    data_output_path = output_path / 'jigsaw-WM-Gao-data.bin'
    with open(data_output_path, 'wb') as f:
        msgpack.pack(data, f)

    # Write embeddings to file in numpy binary format.
    embeddings_output_path = output_path / 'jigsaw-WM-EMB-Gao-300'
    np.save(embeddings_output_path, vocab)

    # Write id2word dict to file.
    id2word_output_path = output_path / 'jigsaw-WM-EMB-Gao-id2word.bin'
    with open(id2word_output_path, 'wb') as f:
        msgpack.pack(id2word, f)
        # split by line
        lines = cleaned_txt.split('\n')

        # tokenize by line with jieba
        for line in lines:
            if line:
                line_tokens = jieba.cut(line, cut_all=False, HMM=True)
                tokenized_sentence = [token for token in line_tokens]
                tokenized_sentences.append(tokenized_sentence)
    # get time information
    t2 = time.time()
    book_process_time = t2 - t1
    list_times.append(book_process_time)
n_books = n + 1
print("\nSentences tokenized !")
print("{} seconds in total and {} seconds per book".format(
    sum(list_times),
    sum(list_times) / n_books))

print('\nComputing Word2Vec ...')
model = Word2Vec(tokenized_sentences, window=5)
print('\nWord2Vec is computed !')

word_vectors = model.wv
name_model = 'sample_mandarin_embeddings_{}book_model.tsv'.format(n_books)
path_model = os.path.join(path_embeddings, name_model)
word_vectors.save_word2vec_format(path_model)

print('model saved at path : {}'.format(path_model))
    file = open(filename, 'r', encoding="utf8")
    doc = csv.reader(file, delimiter=',')
    for i, row in enumerate(doc):
        if (i >= train_size):
            break
        tweet = row[2]
        line = doc_to_clean_lines(tweet, vocab)
        lines += line
    return lines


# load the vocabulary
file = "sentiment_train.csv"
vocab_filename = file + '_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load training data
sentences = load_sentences(file, vocab, True)
print('Total training sentences: %d' % len(sentences))

# train word2vec model (workers=cpu cores, window= number of neighbor words considered)
model = Word2Vec(sentences, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

# save model in ASCII (word2vec) format
filename = file + '_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)
Example #14
    lambda x: x.replace(" ", ""))
model_dataframe["separates"] = model_dataframe["sentences"].apply(
    lambda x: x.replace(",", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace(";", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace("\"", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace('"', ''))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.split())

# -- Train Word2Vec on the per-sentence token lists
model = Word2Vec(model_dataframe["separates"],
                 sg=1,
                 size=300,
                 min_count=1,
                 iter=10)
'''
sg = 0 for CBOW, 1 for skip-gram
min_count = 5  (words that appear fewer than 5 times are ignored)
size = 300 (embed into a 300-dimensional vector space)
iter: number of passes over the corpus, roughly the "epochs" of deep learning
workers: number of worker threads, enabling multi-threaded training depending on CPU core count
alpha: initial learning rate; min_alpha: the minimum value the learning rate decays to linearly during training
'''
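
# Not in the original: a reference sketch (same gensim 3.x API) showing how the
# remaining options described above would be passed, plus a quick sanity query.
# The name ref_model and the parameter values are illustrative only.
ref_model = Word2Vec(model_dataframe["separates"],
                     sg=1, size=300, min_count=5, iter=10,
                     workers=4, alpha=0.025, min_alpha=0.0001)
print(ref_model.wv.most_similar(ref_model.wv.index2word[0], topn=5))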

count = 0
sum = 0.0
average = 0.0
Example #15
import os
import sys
import logging
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    inp = '../policy.seg'
    outp1 = '../policy.model'
    outp2 = '../policy.vector'

    model = Word2Vec(LineSentence(inp),
                     size=300,
                     window=5,
                     min_count=5,
                     workers=multiprocessing.cpu_count(),
                     iter=3)

    # trim unneeded model memory = use(much) less RAM
    #model.init_sims(replace=True)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
    print('OK')
Example #16
    # unlabeled.fnames = []
    for m in tar.getmembers():
        if ".txt" in m.name:
            # unlabeled.fnames.append(m.name)
            unlabeled.data.append(read_instance(tar, m.name))
    tar.close()
    return unlabeled.data


def read_instance(tar, ifname):
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    content = ifile.read().strip()
    return content


if __name__ == "__main__":
    print("Reading files")
    tarfname = "data/speech.tar.gz"
    docs = read_files(tarfname)
    lmtzr = WordNetLemmatizer()
    print("Lemmatizing and Tokenizing")
    lemmatized = [[
        lmtzr.lemmatize(word).lower().strip('.,;:')
        for word in word_tokenize(d.decode("utf-8"))
        if len(word) >= g.min_length
    ] for d in docs]
    print("Computing Word2Vec Matrix")
    wv = Word2Vec(lemmatized, workers=g.num_jobs)
    wv.save("word2vec.model")
Example #17
        resp = resp.json()
        data = resp['items']
        start += len(data)

        # DataFrame
        df = pd.DataFrame(data=data)
        df['title'] = df['title'].apply(preprocessing)
        df['description'] = df['description'].apply(preprocessing)
        df_list.append(df)

        # break
        if len(data) != 100:
            break

    return pd.concat(df_list)


if __name__ == "__main__":
    df = request_book_by_query("파이썬")
    target = df['title'] + ' ' + df['description']
    target = target.apply(get_nouns)
    target = target.str.split()

    # Training
    model = Word2Vec(target.to_list(), size=300, window=10, min_count=1)
    model.init_sims(replace=True)

    # Test
    result = model.wv.most_similar("알고리즘", topn=10)
    print("/".join([x[0] for x in result]))
Example #18
def build_word2vec_model(data, embedding_size=2, save=True):
    model = Word2Vec(data, min_count=0, size=embedding_size)
    if save:
        model.save_word2vec_format('output')
    return model
Example #19
    if (s in frequency):
        store_total = store_total + frequency[s]
# print(len(store))
print("Total number of words appearing at least", a, "times:", store_total)

# Apply Word2Vec
# CBOW model
# size: number of dimensions of the embeddings (default = 100)
# window: maximum distance between the target word and the words around it (default = 5)
# min_count: words with a frequency below this value are ignored (default = 5)
# workers: number of worker partitions used during training (default = 3)

minCount = 20
s = 250
w = 6
cbow_model = Word2Vec(data, min_count=minCount, iter=5, size=s, window=w)

cbow_model.save('CBOWModelFile')
print("size = %d" % s)
print("window = %d" % w)

cModel = g.Doc2Vec.load('CBOWModelFile')
vocab = list(cModel.wv.vocab)

# Plot the vectors on a 2D plane
cModel = g.Doc2Vec.load('CBOWModelFile')
vocab = list(cModel.wv.vocab)
X = cModel[vocab]

# Project to two dimensions with t-SNE
tsne = TSNE(n_components=2)
Example #20
from wikipedia import page
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import re

#Get data from wiki : https://codesachin.wordpress.com/2015/10/09/generating-a-word2vec-model-from-a-block-of-text-using-gensim-python/
title = "Word2vec"
wikipage = page(title)
raw_content = wikipage.content
alphanumeric_content = re.sub('[^0-9a-zA-Z ]+', ' ', raw_content)

text_file = open("Output.txt", "w")
text_file.write(alphanumeric_content)
text_file.close()

sentences = LineSentence("Output.txt", max_sentence_length=10)
print(sentences)
#exit()

min_count = 2
size = 50
window = 4
model = Word2Vec(sentences, min_count=min_count, size=size, window=window)
#print(model.wv.vocab)

for i in model.wv.vocab:
    print(i)
    print(model[i])

#print(model[page_list[0]])
#print(model.batch_words)
def train(self, min_count=3, workers=1):
    self.model = Word2Vec(self.corpus, min_count=min_count,
                          workers=workers)
Example #22
Created on Thu Aug  9 15:53:53 2018
@author: jjuppuluri13
"""

import gensim
from gensim.models import Word2Vec
#import data_proc

print(df.sentence[9])

from nltk.tokenize import word_tokenize

df.sentence = df.sentence.apply(lambda x: word_tokenize(x))

sentences = df.sentence
model = Word2Vec(sentences, size=200, min_count=1)
print(model)
words = list(model.wv.vocab)
#print(words)
#print(model['venlafaxine'])

#####               Export X train/test data
sequence = []
i = 0
while i < 5457:
    sequence.append(model[df.sentence[i]])
    i += 1

thefile = open('train_word.txt', 'w')
for item in sequence:
    thefile.write("%s\n" % item)
Example #23
def run(dataset="biorxiv_medrxiv", test=False):
    millis = int(round(time() * 1000))
    logging.basicConfig(filename=f"{defaultpath}/results/info_{millis}.log",
                        filemode='a',
                        format="%(levelname)s - %(asctime)s: %(message)s",
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    filepath = f"{defaultpath}/processed/{dataset}/body.csv"
    logging.info(f"Reading {filepath}")
    df = pd.read_csv(filepath, sep="\t")
    if test:
        df = df[:100]
    logging.info(f"Dataset size: {len(df)}")
    logging.info(df.shape)
    df = df.dropna().reset_index(drop=True)
    logging.info("Number of Null lines")
    logging.info(df.isnull().sum())

    nlp = spacy.load('en', disable=['ner', 'parser'])

    brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower()
                      for row in df['body'])

    t = time()
    logging.info("Cleanning")
    txt = [
        cleaning(doc)
        for doc in nlp.pipe(brief_cleaning, batch_size=8, n_threads=-1)
    ]

    logging.info(printTime(t))

    df_clean = pd.DataFrame({'clean': txt})
    df_clean = df_clean.dropna().drop_duplicates()
    logging.info(df_clean.shape)
    sent = [row.split() for row in df_clean['clean']]
    phrases = Phrases(sent, min_count=30, progress_per=10000)
    sentences = phrases[sent]

    word_freq = defaultdict(int)
    for sent in sentences:
        for i in sent:
            word_freq[i] += 1
    print(len(word_freq))

    sorted(word_freq, key=word_freq.get, reverse=True)

    cores = multiprocessing.cpu_count()

    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)

    t = time()
    w2v_model.build_vocab(sentences, progress_per=10000)
    logging.info(printTime(t))

    t = time()
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    logging.info(printTime(t))

    w2v_model.init_sims(replace=True)

    w2v_model.save(f"{defaultpath}/results/word2Vec.model")
    logging.info("Finished")
Example #24
def train(sens, modelfp):
    model = Word2Vec(size=100, window=15, sg=1, min_count=1, workers=4)
    model.build_vocab(sens)
    model.train(sens, total_examples=len(sens), epochs=10)
    model.wv.save(modelfp)
Example #25
df['verse'] = df['verse'].str.split()

# Remove Arabic stop words
df['verse'] = df['verse'].map(lambda x: [w for w in x if w not in arb_stopwords])

# Exclude these words from the stemmer
stem_not = ['الله', 'لله', 'إلهكم', 'اله', 'لله', 'إلهكم', 'إله', 'بالله', 'ولله']

# [On/Off] Stemming the words to reduce dimensionality except stem_not list
# df['verse'] = df['verse'].map(lambda x: [w if w in stem_not else st.stem(w) for w in x])

# You can filter for one surah too if you want!
verses = df['verse'].values.tolist()

# train model
model = Word2Vec(verses, min_count=15, window=7, workers=8, alpha=0.22)
# summarize the loaded model

# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

# Pass list of words as an argument
# disable for now in order to show the one below
# for i, word in enumerate(words):
   # reshaped_text = arabic_reshaper.reshape(word)
   # artext = get_display(reshaped_text)
wordsDocList = []
targetFolder = 'all'
trainFolder = 'buildDataSet/' + targetFolder + '/google code'
pathDir = os.listdir(trainFolder)

sen2List(pathDir, wordsDocList)
trainFolder = 'buildDataSet/' + targetFolder + '/github'
pathDir = os.listdir(trainFolder)

sen2List(pathDir, wordsDocList)


############## this is the API of doc2vec
#documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(wordsDocList)]

model = Word2Vec(wordsDocList, size=vectorsize, window=10, min_count=0, workers=10, iter=10)
model.save("my_word2vec_model_" + str(vectorsize) + "_noPun")
'''
word_vectors = model.wv
if 'word' in word_vectors.vocab:
    print("bingo")

print("not")
'''
'''
w1 = ["click"]
aa = model.wv.most_similar(positive=w1, topn=6)
print(aa)

vector = model.wv['computer']
print(vector)
Example #27
train_data = Dataset(train_sents, cate2idx=ent2idx)
train_data.build_vocab_dict(vocab_size=vocab_size)

with open('word2idx.json', 'w') as f:
    f.write(str(train_data.word2idx))

test_data = Dataset(test_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]

vocab_size = len(train_data.word2idx)

w2v_train_sents = []
for doc in docs:
    w2v_train_sents.append(list(doc.text))
w2v_model = Word2Vec(w2v_train_sents, size=emb_size)

w2v_embeddings = np.zeros((vocab_size, emb_size))
for char, char_idx in train_data.word2idx.items():
    if char in w2v_model.wv:
        w2v_embeddings[char_idx] = w2v_model.wv[char]

np.save("w2v_embeddings.npy", w2v_embeddings)

seq_len = sent_len + 2 * sent_pad
model = build_lstm_crf_model(num_cates, seq_len=seq_len, vocab_size=vocab_size,
                             model_opts={'emb_matrix': w2v_embeddings, 'emb_size': emb_size, 'emb_trainable': False})
print(model.summary())

train_X, train_y = train_data[:]
print('train_X.shape', train_X.shape)
Example #28
# coding:utf-8

import sys
import gensim
import sklearn
import numpy as np

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    # train the model
    model = Word2Vec(LineSentence("train_set.txt"),
                     size=100,
                     window=2,
                     min_count=0,
                     workers=4)
    model.wv.save_word2vec_format('address_word2vec_model')
    print('word2vec model get!')
    Model = gensim.models.KeyedVectors.load_word2vec_format(
        'address_word2vec_model')
    print(Model.wv['霄云路'])
    print(Model.wv.similarity('霄云路', '霄云路'))
from __future__ import print_function
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    inputFile = sys.argv[1]  # path to the wiki.tr.txt file produced by preprocess.py, passed in on the command line
    outputFile = "trmodel"
    model = Word2Vec(LineSentence(inputFile),
                     size=400,
                     window=5,
                     min_count=5,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(outputFile, binary=True)
Example #30
Author:     liuyao8
Description:
"""

from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.test.utils import common_texts, get_tmpfile, datapath
from gensim.scripts.glove2word2vec import glove2word2vec

# 0. Common data and helper functions
# common_texts: list of lists; each inner list is the tokenized form of one document/sentence
# get_tmpfile(fname): joins fname onto the temp directory, e.g. C:\\Users\\liuyao8\\AppData\\Local\\Temp\\<fname>
# datapath(fname): os.path.join(module_path, 'test_data', fname), i.e. a file under the module's test-data directory

# 1. Train word embeddings
# 1.1 Initial training
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
path = get_tmpfile("word2vec.model")
model.save(path)

# 1.2 Load the model and continue training (streamed training, reading data from disk on-the-fly)
model = Word2Vec.load(path)
model.train([["hello", "world"]], total_examples=1, epochs=1)

# 1.3 Get the trained word embeddings
word2vector = model.wv  # KeyedVectors
vector = word2vector['computer']  # numpy vector of shape (100, )

path = get_tmpfile("wordvectors.kv")
word2vector.save(path)
word2vector = KeyedVectors.load(path, mmap='r')
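
# 1.4 Not in the original (the snippet is cut off here): the imports above also bring in
# datapath and glove2word2vec; a typical combination, using gensim's bundled test file
# as a placeholder input, looks like the sketch below.
glove_file = datapath('test_glove.txt')          # small GloVe file shipped with gensim's test data
tmp_file = get_tmpfile('glove_as_word2vec.txt')
glove2word2vec(glove_file, tmp_file)             # rewrite the GloVe file in word2vec text format
glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)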