Example No. 1
def runWord2Vec(condition_one, condition_two):
	num_features = 300
	
	try:
		model = Word2Vec.load("../models/W2V/"+ str(num_features) + "features_20minwords_10context")
	except Exception:
		Create_W2V_model.trainWorld2Vec(num_features)
		model = Word2Vec.load("../models/W2V/"+ str(num_features) + "features_20minwords_10context")
Example No. 2
def _load_vector_space_mapper(model_1_path, model_2_path, bilingual_path):
    """Build a vector space mapper from model 1,2 and bilingual dict."""
    model_1 = Word2Vec.load(model_1_path)
    model_2 = Word2Vec.load(model_2_path)
    bilingual_dict = bg.load_bilingual_dictionary(bilingual_path)
    tvecs_vm = VectorSpaceMapper(model_1, model_2, bilingual_dict)
    tvecs_vm.map_vector_spaces()
    return tvecs_vm
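A minimal call sketch for the helper above; the two model paths and the bilingual dictionary path are hypothetical stand-ins:

# hypothetical paths for illustration only
tvecs_vm = _load_vector_space_mapper('models/english.model',
                                     'models/hindi.model',
                                     'data/english_hindi.dict')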
Example No. 3
def query(question):
	model = Word2Vec.load('/home/jcoreyes/news_model')
	extractor = Rake()
	words = extractor.run(question)
	keywords = [words[i][0] for i in xrange(len(words))]

	return model.most_similar(positive=keywords)[0][0]
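A quick usage sketch (the question is made up): Rake's run() yields (phrase, score) pairs, the comprehension keeps only the phrases, and the function returns the single word most similar to them:

print(query("What did the senate vote on today?"))  # hypothetical question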
Example No. 4
def out_of_core_x_normalisation(data_dir=HEP_TRAIN_PATH, batch_size=1024,
                                persist=False):
    """ Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
     This scaler can be used afterwards for normalizing feature matrices. """
    doc_generator = get_documents(data_dir=data_dir)
    word2vec_model = Word2Vec.load(WORD2VEC_MODELPATH)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in xrange(batch_size):
            try:
                batch.append(doc_generator.next())
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print "Matrix shape: {}".format(matrix.shape)

        scaler.partial_fit(matrix)

    if persist:
        save_to_disk(SCALER_PATH, scaler)

    return scaler
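Once fitted, the scaler standardizes any feature matrix built from the same word2vec vectors; a minimal sketch, where X is assumed to be such a 2D matrix:

scaler = out_of_core_x_normalisation(persist=True)
X_scaled = scaler.transform(X)  # X is an assumed 2D feature matrix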
Example No. 5
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS,
          verbose=1):
    model = MagpieModel(
        word2vec_model=Word2Vec.load(WORD2VEC_PATH),
        scaler=load_from_disk(SCALER_PATH),
    )

    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.train(
        train_dir,
        get_labels(no_of_labels),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history, model.keras_model, persist=persist)

    return history, model
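A hedged invocation of the training entry point above; the directory layout and epoch count are assumptions:

history, magpie = train('data/hep-train', test_dir='data/hep-test',
                        nn='cnn', nb_epochs=20, persist=True)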
Example No. 6
def build_model_for_corpus(corpus):
    """ Build an appropriate Keras NN model depending on the corpus """
    if corpus == 'keywords':
        keras_model = cnn(embedding_size=100, output_length=10000)
    elif corpus == 'categories':
        keras_model = cnn(embedding_size=100, output_length=14)
    elif corpus == 'experiments':
        keras_model = cnn(embedding_size=100, output_length=500)
    else:
        raise ValueError('The corpus is not valid')

    model_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(model_path)

    w2v_model = Word2Vec.load(WORD2VEC_PATH)
    scaler = load_from_disk(SCALER_PATH)
    labels = get_labels(keras_model.output_shape[1])

    model = MagpieModel(
        keras_model=keras_model,
        word2vec_model=w2v_model,
        scaler=scaler,
        labels=labels,
    )

    return model
Example No. 7
	def test_ofm_word2vec_cosine_selection(self):
		model = Word2Vec.load(self.brownFilePath)
		ofmPredictor = OFMPredictions()
		testData = self.getOFMTestData()
		pred = ofmPredictor.word2VecSimilaritySelectionCosine(testData, model)
		optionSentences = [option['sent'] for option in testData['word1']['options']]
		self.assertTrue(pred['word1']['solution'] in optionSentences)
Example No. 8
def word2vec_model(argument):
    try:
        return Word2Vec.load(argument)
    except:
        raise ArgumentTypeError(
            'Could not read embeddings from {}'.format(argument)
        )
Example No. 9
def mineAbbreviation():
	print 'mining abbreviation'
	jieba.load_userdict("../../../data/jieba_userdict.txt")
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	word2vec_model = Word2Vec.load('../../../data/word2vec.model')
	word_set = getWords()
	word_syn_dict = {}
	for word in word_set:
		word_syn_dict.setdefault(word,set([word]))
		if len(word) != 2:
			continue
		try:
			for simi_word_tuple in word2vec_model.most_similar(positive=[word],topn=20):
				simi_word = simi_word_tuple[0]
				simi_value = simi_word_tuple[1]
				reverse_word = word[1]+word[0]
				if reverse_word == simi_word:
					pass
				else:	
					if len(set(word)&set(simi_word)) != len(word) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
						continue
				word_syn_dict[word].add(simi_word)
		except:
			pass
			# print word

	outfile = open('abbreviation.txt','wb')
	for word in word_syn_dict.keys():
		if len(word_syn_dict[word])>=2:
			outfile.write(word+'@'+','.join(word_syn_dict[word])+'\r\n')	
Example No. 10
def __main__():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-f', action='store', dest='filename', help='Data filename')
    parser.add_argument('-d', action='store', nargs="+", dest='dataset', help='Dataset name')
    parser.add_argument('-c', action='store', nargs="+", dest='categories', help='Category names')
    parser.add_argument('--topn', action='store', nargs="+", dest='topn', default='0', help='Top-n values')
    parser.add_argument('--model', action='store', nargs="+", dest='modelname', help='Similarity dictionary name')
    parser.add_argument('--lda', action='store_true', dest='test_lda', help='If on test lda features')
    parser.add_argument('--sd', action='store_true', dest='test_simdict', help='knn similarity')
    parser.add_argument('--w2v', action='store_true', dest='test_w2v', help='If on test w2v features')
    parser.add_argument('--w2v-topn', action='store_true', dest='test_w2v_topn', help='If on test w2v features')
    parser.add_argument('--pword', action='store_true', dest='perword', help='whether similar words taken per word')
    parser.add_argument('--kt', action='store_true', dest='kt', help='kenyan twits')
    arguments = parser.parse_args()

    print arguments

    datasets, filenames = prep_arguments(arguments)
    topns = map(int, arguments.topn)
    perword = arguments.perword

    if arguments.modelname is not None and not arguments.test_simdict:
        w2v_model_name = arguments.modelname[0]
        print w2v_model_name

        w2v_model = Word2Vec.load(w2v_model_name)
        w2v_model.init_sims(replace=True)
    else:
        w2v_model = None

    for dataset, filename in zip(datasets, filenames):
        for topn in topns:
            print dataset, filename, topn
            test_one_file(filename, dataset, topn, perword, w2v_model, arguments)
Example No. 11
def train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, verbose=1):
    # Figure out whether we're predicting categories or keywords
    if NO_OF_LABELS == 14:
        scaler_path = CATEGORY_SCALER
        w2v_path = CATEGORY_WORD2VEC
    else:
        scaler_path = KEYWORD_SCALER
        w2v_path = KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )

    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, model
Example No. 12
    def __init__(self,
                 model_file_path=''.join(config['model_file_path']),
                 label_file_path=''.join(config['label_file_path']),
                 word2vec_to_solve_oov=config['word2vec_to_solve_oov']
                 ):
        '''

        :param word2vec_to_solve_oov: whether to use word2vec to look up OOV words
        '''
        self._word2vec_to_solve_oov = word2vec_to_solve_oov
        self._model_file_path = model_file_path
        self._full_mode = config['full_mode']
        logging.debug('use full segmentation mode? %s...' % (self._full_mode))

        logging.debug('=' * 20)
        logging.debug('loading the classifier model and encoders...')
        model_in_file = open(model_file_path, 'rb')

        # successive pickle.load calls read successive objects from one file
        self._model = pickle.load(model_in_file)
        self._bow_encoder = pickle.load(model_in_file)
        self._cnn_encoder = pickle.load(model_in_file)

        self._index_to_label = np.load(open(label_file_path, 'rb'))
        self._keywords = self._bow_encoder.get_feature_names()

        # test
        logging.debug('=' * 20)
        logging.debug('testing...')
        logging.debug('-' * 20)
        logging.debug('loading the word2vec model...')
        logging.debug('-' * 20)
        logging.debug('=' * 20)

        if config['word2vec_to_solve_oov']:
            self._word2vec_model = Word2Vec.load(config['word2vec_model_file_path'])
Example No. 13
def load(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if filename[:-6] == "bin.gz":
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    else:
        model = Word2Vec.load(filename)
    return model
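A usage sketch of the dispatch above; both file names are hypothetical:

m_binary = load('GoogleNews-vectors-negative300.bin.gz')  # C binary format
m_native = load('my_corpus.model')                        # native gensim save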
Example No. 14
def main(nouns_loc, word2vec_loc, n_nouns, out_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load trained Word2Vec model
    model = Word2Vec.load(word2vec_loc)
    logger.info('Word2Vec object loaded')

    logger.info('Keeping %s nouns', n_nouns)
    # Empty dictionary for noun to vector mapping
    noun_to_vect_dict = {}
    # Counter to know when to stop
    counter = 0
    with open(nouns_loc, 'r') as f:
        while counter < int(n_nouns):
            line = make_tuple(f.readline())
            # Add noun and vector to mapping dictionary
            noun = line[0]
            noun_to_vect_dict[noun] = model[noun]
            # Increment counter
            counter += 1

    logger.info('Pickling noun to vector dictionary')
    # Pickle dictionary
    with open(path.join(out_loc, 'noun_to_vect_dict_' + n_nouns + '.pkl'), 'w') as f:
        pickle.dump(noun_to_vect_dict, f)
Example No. 15
def get_model():
    ''' lazy initialization for w2v model so it works in pool '''
    global model
    if model is None:
        print 'loading the w2v model...'
        model = Word2Vec.load('w2v/lemma_stopwords')
    return model
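The lazy initialization matters with multiprocessing.Pool: each worker loads the model once on first use instead of receiving it pickled from the parent. A minimal sketch, assuming model = None at module level as above:

from multiprocessing import Pool

def neighbours(word):
    m = get_model()  # first call inside each worker loads the model
    return m.most_similar(word, topn=3)

if __name__ == '__main__':
    pool = Pool(processes=4)
    print(pool.map(neighbours, ['cat', 'dog']))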
Example No. 16
    def write(self, model_path, dim):
        model = W.load(model_path)
        words = model.vocab.keys()
        conn = self.conn
        cur = self.cur

        create_q = "CREATE TABLE %s (word text," % 'array'
        for i in range(dim):
            create_q += "D%d real," % (i)
        create_q = create_q[:-1]+")"

        insert_q = "INSERT INTO %s VALUES (" % 'array'
        for i in range(dim+1):insert_q += "?,"
        insert_q = insert_q[:-1]+")"

        cur.execute(create_q)
        conn.commit()
        inp = []
        for idx, word in enumerate(words):
            inp.append((word,) + tuple(model[word].tolist()))
            if idx % 10000 == 0:
                # executemany is required for a list of row tuples
                cur.executemany(insert_q, inp)
                conn.commit()
                inp = []
        if inp:
            # flush the rows left over after the last batch
            cur.executemany(insert_q, inp)
            conn.commit()
        cur.execute("CREATE UNIQUE INDEX idx_word ON %s(word)" % 'array')
        cur.close()
        conn.close()
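Reading a vector back from the resulting table is a single indexed lookup; a sketch, where the database file name is an assumption:

import sqlite3

conn = sqlite3.connect('vectors.db')  # file name is an assumption
cur = conn.cursor()
cur.execute("SELECT * FROM array WHERE word = ?", ('king',))
row = cur.fetchone()
vector = row[1:]  # drop the leading word column
conn.close()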
Example No. 17
File: w2v.py Project: Suluo/Kaggle
def train():
    t0 = time.time()
    filename = './data/seg20180327.txt'
    if not os.path.exists(filename):
        for tsv in ['labeledTrainData.tsv', 'unlabeledTrainData.tsv', 'testData.tsv']:
            logger.info("loading %s ...." % tsv)
            load_tsv('./data/' + tsv, filename)
    sents = word2vec.Text8Corpus(filename)
    t1 = time.time()
    logger.info("load text taks %s" % (time.time()-t0))

    model_path = './data/model.w2v'
    if not os.path.exists(model_path):
        num_features, num_workers = 300, 4
        min_word_count, context = 20, 10
        downsampling = 1e-3
        model = word2vec.Word2Vec(
            sents, workers=num_workers,
            size=num_features, min_count=min_word_count,
            window=context, sample=downsampling
        )
        model.init_sims(replace=True)
    else:
        model = Word2Vec.load(model_path)
        # model.save_word2vec_format(output_vec, binary=False)
        model.build_vocab(sents, update=True)
        model.train(sents, total_examples=model.corpus_count, epochs=model.iter)
    # the generated vocabulary: model.vocab
    logger.info('w2v training took %s' % (time.time()-t1))
    model.save('./data/model.w2v')
Example No. 18
def predict(algorithm='rf'):
    train = get_reviews('data/imdb/train_data.csv')

    if not os.path.exists(model_name):
        #unlabeled_train = get_reviews('data/unlabeledTrainData.tsv')
        sentences = get_sentences(train['review'])# + get_sentences(unlabeled_train['review'])
        train_word2vec(sentences)

    model = Word2Vec.load(model_name)

    clean_train_reviews = get_clean_reviews(train['review'])
    train_features = get_features(clean_train_reviews, model, num_features, with_idf=False)

    classifier = train_classifier(algorithm, train_features, train)

    # Free memory !
    del train
    del clean_train_reviews
    del train_features

    test = get_reviews('data/imdb/test_data.csv')
    clean_test_reviews = get_clean_reviews(test['review'])
    test_features = get_features(clean_test_reviews, model, num_features)

    evaluate(test_features, test, classifier)
Example No. 19
def getAllFeatures(train, mapper):
    print "this is getAllFeatures"
    # every record has a cluster value calculated by lda
    w2c_f, w2c_w = 10, 14
    lda_dict_1 = util.read_dict(util.features_prefix + 'id_lda_256.pkl')
    lda_dict_2 = util.read_dict(util.features_prefix + 'id_lda_512.pkl')
    k_mean_dict_1 = util.read_dict(util.features_prefix + 'c_k_all_64.pkl')
    k_mean_dict_2 = util.read_dict(util.features_prefix + 'c_k_all_128.pkl')
    sentence_dict_path = util.txt_prefix + 'id_sentences.pkl'
    word2vec_path = util.txt_prefix + str(w2c_f) + 'features_1minwords_' + str(w2c_w) + 'context.pkl'
    sentence_dic = util.read_dict(sentence_dict_path)
    model = Word2Vec.load(word2vec_path)

    train_X = train[features]
    train_X = mapper.transform(train_X)  # .values
    new_train_X = []
    for i in xrange(len(train_X)):
        id = train_X[i][0]
        lda_1 = lda_dict_1[id]
        lda_2 = lda_dict_2[id]
        s = sentence_dic.get(id)
        f = np.concatenate(([train_X[i][1:].astype(np.float32)],
                            [sentence_to_matrix_vec(s, model, w2c_f, k_mean_dict_1, k_mean_dict_2)]), axis=1)[0]
        f = np.concatenate(([f], [[lda_1, lda_2]]), axis=1)[0]
        new_train_X.append(f)
    new_train_X = np.array(new_train_X)
    return new_train_X
Example No. 20
    def __init__(
        self,
        train_data=None,
        word2vec_model_file_path=None,
        word_embedding_length=None,
        full_mode=True,
        remove_stopword=True,
        sentence_padding=7,
        verbose=0,
    ):
        """

        :param train_data: 训练句子
        :type train_data: np.array([])
        """
        self.__full_mode__ = full_mode
        self.__remove_stopword__ = remove_stopword
        self.__verbose__ = verbose
        self.__sentence_padding__ = sentence_padding
        self.__word_embedding_length__ = word_embedding_length
        self.__word2vec_model__ = Word2Vec.load(word2vec_model_file_path % word_embedding_length)

        if train_data is not None:
            self.__train_data__ = train_data
            self.build_encoder(train_data)
Example No. 21
def compare(dataset, model_name, pre_model_name):

    # build model
    if(os.path.isfile(model_name)):
        model = Word2Vec.load(model_name)
        logger.debug("model %s already exist, stop training wordvector", model_name)
    else:
        logger.info("start trainning word vector")
        start_time = timeit.default_timer()
        model = wordvector.build_word_vector(dataset, save=True, save_file=model_name)
        logger.info("model %s trained in %.4lfs", model_name, timeit.default_timer() - start_time)

    # find most similar words:
    for word in keywords:
        print word
        print model.most_similar(word, topn=10);

    # load pre-trained google news model
    logger.info("start loading pre-trained dataset")
    start_time = timeit.default_timer()
    pre_model = Word2Vec.load_word2vec_format(pre_model_name, binary=True)
    logger.info("pre-trained dataset loaded in %.4lfs", timeit.default_timer() - start_time)

    # find most similar words:
    for word in keywords:
        print word
        print pre_model.most_similar(word, topn=10);
Example No. 22
def fit_scaler(data_dir, word2vec_model=WORD2VEC_MODELPATH, batch_size=1024,
               persist_to_path=SCALER_PATH):
    if type(word2vec_model) == str:
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)

    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in xrange(batch_size):
            try:
                batch.append(doc_generator.next())
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print "Fitted to {} vectors".format(matrix.shape[0])

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
Example No. 23
def main():
    """
    main function to make prediction
    use random forest
    :return:
    """
    train = pd.read_csv("/path/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv",
                   header=0, delimiter="\t", quoting=3)

    modelName = "/path/Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)

    print("Processing training data...")
    cleaned_training_data = processData.clean_data(train)
    trainingDataFV = getAvgFeatureVecs(cleaned_training_data,model)
    print("Processing test data...")
    cleaned_test_data = processData.clean_data(test)
    testDataFV = getAvgFeatureVecs(cleaned_test_data,model)

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV, train["sentiment"],testDataFV)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Word2Vec_AvgVecPredict.csv", index=False, quoting=3)
Example No. 24
def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
     This scaler can be used afterwards for normalizing feature matrices. """
    if type(word2vec_model) == str:
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
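When persist_to_path is given, the fitted scaler round-trips through disk so later feature pipelines can skip this pass over the corpus; a sketch with hypothetical paths, reusing the load_from_disk helper seen in the other examples:

fit_scaler('data/train', 'models/w2v.model', persist_to_path='models/scaler.pkl')
scaler = load_from_disk('models/scaler.pkl')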
Example No. 25
def refine_senses(model, words):
	"""
	Determine a more accurate number of senses for each word based on the most_similar
	senses of each sense of each word in the model
	"""
	model = Word2Vec.load(model)
	gold_word_senses = {}
	
	for w in words:
		senses = get_senses(model, w)
		if len(senses) > 1:
			sense_overlaps = find_overlaps(senses)
			if sense_overlaps:
				gold_word_senses[w] = determine_num_senses(sense_overlaps)
				#print gold_word_senses[w], w
			else:
				gold_word_senses[w] = add_senses(senses)
				#print gold_word_senses[w], w
		elif len(senses) == 1:
			gold_word_senses[w] = add_senses(senses)
			#print gold_word_senses[w], w
		else:
			gold_word_senses[w] = 2

	with open('/Users/adamberger/Desktop/CLMasters/Word_Representation_WSD/gold_word_senses.txt', 'w') as f:
		for word in gold_word_senses:
			f.write(word + ' ' + str(gold_word_senses[word]) + '\n')


	return gold_word_senses
Example No. 26
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')

    t1 = time.clock() - start
    if debug:
        printtime(t1)

    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v

    t2 = time.clock() - t1
    if debug:
        printtime(t2)

    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)

    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
Example No. 27
def main():
    mymodel = Word2Vec.load('./model/model1_2_5')
    freq = freqWord(4,0.01,stopwordsFile='../config/stopWords1.txt')
    data = get_array_data('../data/news_lines_splited.txt')[:5000]
    num=0
    for line in data:
        num+=len(line)
    print(num)
Example No. 28
def most_similar(file,word,num):
    '''Get the words most similar to a single word.'''
    file=os.path.join(os.getcwd(),file)
    if not os.path.exists(file):
        raise gg.NameError('the trained model file does not exist')
    models=Word2Vec.load(file)
    list1=models.most_similar(str(word),topn=int(num))
    return str(list1)
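A usage sketch; the model file name is hypothetical:

print(most_similar('news.model', u'苹果', 10))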
Example No. 29
def show_tsne():
    model = Word2Vec.load(model_name)
    embeddings = np.zeros((len(model.index2word), num_features), dtype="float32")
    for i, word in enumerate(model.index2word):
        if (i+1) % 1000 == 0:
            print('Embeddings {}'.format(i+1))
        embeddings[i, :] = model[word]
    plot_tsne(model.index2word, embeddings)
Example No. 30
def test_vector(n=0, mincount=1):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    fname = 'WIKI_'+'.10epochs.singletok.min'+str(mincount)+'.deep'
    model = Word2Vec.load(fname)
    
    for termid, term in texeval_corpus.terms('test', sbc):
        if len(term.split()) == 1 and term in model:
            print termid, term, model[term]
Example No. 31
# import modules & set up logging
import gensim, logging
from fileObject import FileObj
from gensim.models import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':

    file_obj = FileObj(r"testSet/data")
    sentences = file_obj.read_lines_1_words()
    #model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
    #model.save('w2v_model')
    model = Word2Vec.load('w2v_model')
    print(model.most_similar(['怀孕']))
    print(model.similarity('怀孕', '孕妇'))
Example No. 32
from tries_Harshil import Trie
from tries_Harshil import TrieNode
import autocomplete2
from autocomplete2 import helpers
import json
from gensim.models import Word2Vec
from flask import Flask, request

app = Flask(__name__)
huge_file = "/Users/harshitg/github/autocomplete/autocomplete/everything_combined_processed.txt"
huge_list = []
with open(huge_file, "r") as f:
    for line in f:
        huge_list.extend(line.split())
keys = list(helpers.chunks(huge_list, 2))  # keys to form the trie structure.
new_list = []
model = Word2Vec.load(
    "/Users/harshitg/github/autocomplete/autocomplete/bigrams_fasttext_processed_cb.model"
)
model_vectors = model.wv
for i in range(0, len(keys)):
    curr_list = keys[i]
    new_list.append(curr_list[0] + ' ' + curr_list[1])
trie = Trie()
trie.formTrie(new_list)


@app.route('/', methods=['GET', 'POST'])
def print_suggestions():
    if request.method == 'POST':
        auto_suggestions = trie.printAutoSuggestions(
            request.get_json().get('item'), new_list, 10)
        if auto_suggestions == 0 or auto_suggestions == -1:
Example No. 33
# give each word an index
def toindex(words):
    data = []
    for word in words:
        try:
            data.append(word_index[word])
        except:
            continue

    return data


if __name__ == '__main__':

    # import trained words vector
    model = Word2Vec.load("skipgram.model")

    wordvectors = model.wv
    vocab_list = [word for word, Vocab in wordvectors.vocab.items()]
    word_index = {" ": 0}
    word_vector = {}
    embedding_dim = model.vector_size
    embeddings_matrix = np.zeros((len(vocab_list) + 1, embedding_dim))
    for i in range(len(vocab_list)):
        word = vocab_list[i]
        word_index[word] = i + 1
        word_vector[word] = wordvectors[word]
        embeddings_matrix[i + 1] = wordvectors[word]

    cvscores = []
Example No. 34
                                     embeddings_path=embeddings_path,
                                     vocab_path=vocab_path,
                                     min_count=W2V_MINCOUNT,
                                     size=EMBEDDING_SIZE,
                                     sg=1,
                                     batch_words=W2V_BATCHWORDS,
                                     iter=W2V_ITERS,
                                     workers=multiprocessing.cpu_count())

else:

    print("Loading embeddings...\n")
    # load vocab todo: does this work???
    vocab, _ = load_vocab(vocab_path)
    # load embedding model todo: does this work???
    model = Word2Vec.load(embeddings_path)

# load the data.
print("Loading data...\n")

f_sents = codecs.open(sents_file, 'rb', encoding='utf8')
f_classes = codecs.open(label_file, 'rb', encoding='utf8')
sents = [sent.strip() for sent in f_sents.readlines()]
labels = [label.strip() for label in f_classes.readlines()]
# number of labels
num_labels = len(set(labels))

# fit vectorizers
print("Fitting tokenizer...\n")

# get count vectors
Example No. 35
# emotion lists
emotion_dict = {
    6:
    sorted(['love', 'anger', 'surprise', 'joy', 'sadness', 'fear']),
    8:
    sorted([
        'amusement', 'awe', 'contentment', 'excitement', 'anger', 'disgust',
        'fear', 'sadness'
    ])
}

# load word2vec model
print("Adobe Word2Vec loading")
model_folder = "/nfs/bigfovea/add_disk0/eugenia/Emotion/wordembedding_models/"
model_file = "w2v_adobe.model"
model = Word2Vec.load(os.path.join(model_folder, model_file))

for num in [6, 8]:
    for method in ['tag-emo', 'all_tags-emo']:
        print("--> Baselines %s %s-class:" % (method, num))
        data_set = []
        data_counts = {}
        k = 0
        for img_path in glob.glob(img_folder + "*.jpg"):
            k += 1
            if k % 1000 == 0:
                print(k // 1000, "K images processed")
                # save the data_set
                save2pickle(
                    os.path.join(dataset_dir, '%s_%s.pkl' % (method, num)),
                    data_set)
Example No. 36
import keras
from keras.models import Model
from keras import backend as K
from gensim.models import Word2Vec
import W2VProcessing as processing
import pytextvec as pytextvec

model = keras.models.load_model('data/model.h5')

print("Loading Gensim Model...")
word_vectors = Word2Vec.load('data/word2vec/500features_10minwords_10context')
maxlen = 10
X = processing.loadXTrain(
    '/home/quelibrio/Work/Bevrage/BevBox/receiptcomprehension/data/SamplePrintScans/bevmo_sample_receipt_01.txt'
)
x_test = processing.comments2Matrix(X, word_vectors, maxlen)
print(x_test)
prediction = model.predict(x=x_test, batch_size=1000)

print(prediction)

#fpr_keras, tpr_keras, thresholds_keras = roc_curve(ytest.argmax(),prediction.argmax())

#from sklearn.metrics import auc
#auc_keras = auc(fpr_keras, tpr_keras)

#import tensorflow as tf
#prediction1=tf.argmax(logits,1)
#print(prediction[0])
#print(prediction)
Example No. 37
	predicted = model.predict(data)[0]
	predictedY = predicted.argmax(axis=-1)

	return predictedY

if __name__ == "__main__":
	filterSizes = [3, 4, 5]
	numOfFilters = 100    # tested with 10, 20
	dropout = 0.5
	batchSize = 1000
	epochs = 20
	sequenceLength = 20 # Twitter max length is 140 chars
	embeddingDim = 50
	numOfLabels = 5
	drop = 0.5
	wvModel = Word2Vec.load('vectors.bin')
	# sentencesTrain, emojisTrain = obtainData()
	# dataTrain, labelsTrain, wordIdTrain = obtainData()
	# dataTest, labelsTest, wordIdTest = obtainData("test")
	dataTrain, dataTest, labelsTrain, labelsTest, wordIdMap, maxLength, idEmojiMap = buildDataFull()
	packedData = {"len": maxLength, "dic": wordIdMap, "emo": idEmojiMap}
	js = json.dumps(packedData)
	fp = open("datacnn.json", "w")
	fp.write(js)
	fp.close()

	embeddingMatrix = np.zeros((len(wordIdMap)+1, embeddingDim))
	for word, i in wordIdMap.items():
		try:
			vector = wvModel.wv[word]
			embeddingMatrix[i] = vector
Example No. 38
             ['and', 'the', 'final', 'sentence']]

# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access the vector associated with "sentence"
print(model['sentence'])

# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

# fit a 2d PCA model to the vectors
X = []
for i in words:
    X.append(model[i])

pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
Example No. 39
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence, PathLineSentences

model = Word2Vec.load("word2vec.model")
score, predictions = model.wv.evaluate_word_analogies(
    './data/questions-words.txt')
print(score)

model = KeyedVectors.load_word2vec_format(
    "./data/GoogleNews-vectorsnegative300.bin", binary=True, limit=60000)
score, predictions = model.evaluate_word_analogies(
    './data/questions-words.txt')
print(score)
Example No. 40
save_id = args.save_id

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

print('' * 100)
print('Running test ' + test)

n_batch = n_steps * n_cpu
n_updates = n_iters // n_batch

desc = os.path.join(
    "QuickDraw/embeddings/embeddings/",
    "act2vec_dataset:" + dataset + "_wordlen:" + str(stroke_length) + "_dim:" +
    str(embedding_dim) + "_win:" + str(window))

act2vec_model = Word2Vec.load(desc + ".model")

print('Found {} words in model'.format(len(list(act2vec_model.wv.vocab))))

env = SubprocVecEnv([
    lambda: PainterGym(width=256,
                       height=256,
                       embd_dim=embedding_dim,
                       action_translator=action_translator,
                       square_size=args.square_size,
                       act2vec_model=act2vec_model,
                       action_buff_size=1,
                       action_space='discrete',
                       emb_type=test) for i in range(n_cpu)
])
Example No. 41
    sentence = r.sub('', str(item))
    seg_list = tokenizer(sentence)
    to_csv_content.append(seg_list)

target_word = [
    '芯片', '紫光', '行业', '农业', '券商', '电动车', '电池', '服务器', '军工', '白酒', '医药', '健康',
    '医疗', '水泥', '上涨', '北京'
]
target_ner = [
    '紫光国微', '五粮液', '中兵红箭', '宁德时代', '三一重工', '东方航空', '恒瑞医药', '山东药玻', '太极实业',
    '中船防务', '中国平安', '招商轮船', '中兴通讯', '浪潮信息', '东华软件', '东山精密', '旋极信息'
]

############# find the most similar words ######################
text = data['content'][12]
model = Word2Vec.load("D:/nlp_learning/sinavoacb.model")
r = re.compile("[\s+\.\!\/_,$%^*(+\"\']+|[+——!;「」》::“”·‘’《,。?、~@#¥%……&*()()]+")
######### stopwords ##############


def stopwordslist(filepath):
    stopwords = [
        line.strip()
        for line in open(filepath, 'r', encoding='utf-8').readlines()
    ]
    return stopwords


stopwords = stopwordslist('D:/nlp_learning/停用词表/characters-master/stop_words')

score_list = []
Example No. 42
def load_model(iter):
    model = Word2Vec.load(output_modelPath + str(iter) + 'model.bin')
    return model
Example No. 43
File: main.py Project: abcp4/ETM
test_tokens = test['tokens']
test_counts = test['counts']
args.num_docs_test = len(test_tokens)
test_1_tokens = test['tokens_1']
test_1_counts = test['counts_1']
args.num_docs_test_1 = len(test_1_tokens)
test_2_tokens = test['tokens_2']
test_2_counts = test['counts_2']
args.num_docs_test_2 = len(test_2_tokens)

emb_type = 'none'  #bert
embeddings = None

if (emb_type == 'w2v'):
    from gensim.models import Word2Vec
    model = Word2Vec.load("/content/word2vec/w2v_10eps_model.model")
    vectors = model.wv
    print('loaded')
elif (emb_type == 'bert'):
    from sentence_transformers import SentenceTransformer
    from sentence_transformers import models, losses
    import scipy.spatial
    import pickle as pkl
    word_embedding_model = models.BERT("/content/models")
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
Example No. 44
def w2v_pad(df_train, df_test, col, maxlen_, victor_size):

    tokenizer = text.Tokenizer(num_words=args.num_words,
                               lower=False,
                               filters="")
    tokenizer.fit_on_texts(
        list(df_train[col].values) + list(df_test[col].values))

    train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(
        df_train[col].values),
                                    maxlen=maxlen_)
    test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(
        df_test[col].values),
                                   maxlen=maxlen_)

    word_index = tokenizer.word_index

    count = 0
    nb_words = len(word_index)
    print(nb_words)
    all_data = pd.concat([df_train[col], df_test[col]])
    file_name = '../embedding/' + 'Word2Vec_' + col + "_" + str(
        victor_size) + '.model'
    if not os.path.exists(file_name):
        model = Word2Vec([[word for word in document.split(' ')]
                          for document in all_data.values],
                         size=victor_size,
                         window=5,
                         iter=10,
                         workers=11,
                         seed=2018,
                         min_count=2)
        model.save(file_name)
    else:
        model = Word2Vec.load(file_name)
    print("add word2vec finished....")

    glove_model = {}
    with open("../embedding/glove_vectors_word.txt", encoding='utf8') as f:
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            glove_model[word] = coefs
    print("add glove finished....")

    embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
    for word, i in word_index.items():
        embedding_vector = model[word] if word in model else None
        if embedding_vector is not None:
            count += 1
            embedding_word2vec_matrix[i] = embedding_vector
        else:
            unk_vec = np.random.random(victor_size) * 0.5
            unk_vec = unk_vec - unk_vec.mean()
            embedding_word2vec_matrix[i] = unk_vec

    glove_count = 0
    embedding_glove_matrix = np.zeros((nb_words + 1, victor_size))
    for word, i in word_index.items():
        embedding_glove_vector = glove_model[
            word] if word in glove_model else None
        if embedding_glove_vector is not None:
            glove_count += 1
            embedding_glove_matrix[i] = embedding_glove_vector
        else:
            unk_vec = np.random.random(victor_size) * 0.5
            unk_vec = unk_vec - unk_vec.mean()
            embedding_glove_matrix[i] = unk_vec

    embedding_matrix = np.concatenate(
        (embedding_word2vec_matrix, embedding_glove_matrix), axis=1)

    print(embedding_matrix.shape, train_.shape, test_.shape,
          count * 1.0 / embedding_matrix.shape[0],
          glove_count * 1.0 / embedding_matrix.shape[0])
    return train_, test_, word_index, embedding_matrix
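The concatenated word2vec+GloVe matrix is intended to initialize a frozen embedding layer; a hedged Keras sketch, where the dataframes, column name, and sizes are assumptions:

from keras.layers import Embedding

train_pad, test_pad, word_index, embedding_matrix = w2v_pad(df_train, df_test,
                                                            'text', 100, 300)
emb = Embedding(input_dim=embedding_matrix.shape[0],
                output_dim=embedding_matrix.shape[1],  # 2 * victor_size after the concat
                weights=[embedding_matrix],
                trainable=False)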
Example No. 45
from gensim.models import Word2Vec

# test model
print('loading model...')
model = Word2Vec.load("assets/gay_seattle.w2v")
print("seattle", model.wv.most_similar('seattle', topn=50))

print(model.wv.distances('seattle', ('news', 'june', 'times', 'march')))

# seattle [('news', 0.9989323616027832), ('june', 0.998815655708313), ('times', 0.9987982511520386), ('march', 0.9987823963165283), ('apr', 0.9987049102783203), ('july', 0.9985809326171875), ('nov', 0.9984444379806519),


# print("model details: ", model)
# print('similar words to seattle:')
# print("capitol", model.wv.most_similar('capitol'))
#
# print("gay", model.wv.most_similar('gay', topn=50))
# print(model.wv.most_similar('lesbian'))
# print(model.wv.most_similar('considered'))
# print(model.wv.most_similar('number'))
# print("=================")
# print(model.wv.distances('seattle', ('gay', 'renton', 'lesbian', 'rain')))
# print(model.wv.distance('seattle', 'civil'))
# print(model.wv.distance('seattle', 'lesbian'))
# print(model.wv.rank('seattle', 'gay'))
# print(model.wv.rank('seattle', 'lesbian'))
# print(model.wv.distances('seattle'))
Example No. 46
model_num = 0  # which of the models to use

similar_qty = 10  # number of similar products

models_list = [
    'models/word2vec/w2v_mymodel_33_min50_sg0_i220_window5_size300',  #
    'models/word2vec/w2v_mymodel_33_min1000_sg0_i200_window5_size300',  # ------------
    'models/word2vec/w2v_mymodel_33_min5_sg0_i250_window3_size300_transSplit',  #
    'models/word2vec/w2v_mymodel_33_min1000_sg0_i200_window5_size300',  # ------------
    'models/word2vec/w2v_mymodel_33_mincount1_min1_sg0_i230_window5_size300',  #
    # 'models/word2vec/w2v_mymodel_33_mincount1_min1_sg0_i400_window10_size300',     #
    'models/word2vec/test_29 of 81 nodes_250 alpha_ 0.0025 epochs_ 100 windows_ 4 time_0_02_20_155266'
]

# General requirements: do not use sg1
# Learning rate and number of epochs: a rate of 0.005 is the best result so far, and 220 epochs is enough

bdd_rms = pd.read_excel('library/BDD.xlsx',
                        names=[
                            'product', 'department', 'model_adeo',
                            'model_name', 'date_created', 'product_name',
                            'is_stm'
                        ])

bdd_rms['product'] = bdd_rms['product'].astype(str)

model = Word2Vec.load(models_list[model_num])

#%%
Example No. 47
import os
import time
import gc
import random
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors, word2vec, Word2Vec
import numpy as np
import pickle, json

wv_from_bin = pickle.load(open("GloVe_50.pkl", 'rb'))  ###GLOVE
wv_from_scratch = Word2Vec.load('word2vec.model')  ##word2vec from scratch
wordVectors = np.load(
    "/home/wzh/wzh/glove/wordVectors.npy")  ##word2vec delta training
tokens = json.load(
    open("/home/wzh/wzh/glove/tokens.json"))  ##word2vec delta training

cate2id = json.load(open("label.json", "rb"))
NUM_LABLES = len(cate2id)
NUM_MODELS = 1
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 40
BATCH_SIZE = 32
EPOCH = 20
MODE = "delta_word2vec"  ###GLOVE, word2vec, delta_word2vec
DATASET = "embed_eval_data_sample_general.csv"
Example No. 48
    scores = {}
    for word in test_words:

        if word not in positive + negative:

            test_word = unitvec(np.array(model[word]))

            # Cosine Similarity
            scores[word] = np.dot(test_word, mean)

    print(sorted(scores, key=scores.get, reverse=True)[:10])  #1


## js
from gensim.models import Word2Vec
model = Word2Vec.load('wiki.en.word2vec.model')

positive_words = ["", ""]

negative_words = [""]

# Test Word2vec
print("Testing Word2vec")
#model = word2vec.getModel()
test(model, positive_words, negative_words, model.wv.vocab)

# # Test Fasttext
# print("Testing Fasttext")
# model = fasttxt.getModel()
# test(model,positive_words,negative_words,model.words)
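The excerpt scores candidates against a precomputed query vector `mean`; a hedged sketch of the usual positive-minus-negative construction it presumably comes from (the helper name is mine):

from gensim.matutils import unitvec
import numpy as np

def query_vector(model, positive, negative):
    # unit vectors of positives add, negatives subtract (an assumption)
    vecs = [unitvec(np.array(model[w])) for w in positive]
    vecs += [-unitvec(np.array(model[w])) for w in negative]
    return unitvec(np.array(vecs).mean(axis=0))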
Example No. 49
def query_model(word):
    results = list()
    most_similar = list()
    global model
    global model_path
    global num_of_words
    global stopwords

    if list(word)[0].isupper():
        firstupper = True
    else:
        firstupper = False    

    if model is None:               # if model is not initialised
        print("Loading Natural Language Processing Model...")
        try:
            start = time()
            model = w2v.load(model_path)
            print("Model Loaded Successfully! Took " + str(round(time()-start,2)) + " seconds.")
        except FileNotFoundError:
            print("Error: Model does not exist in path. Exiting...")
            exit()
        except ValueError:
            print("Path is not a valid model. Perhaps it is corrupt? Exiting...")
            exit()

    word = word.lower()                 # convert word to lowercase
    s = remove_nums_after_s(word)
    s = remove_nums_before_s(s)

    if s not in stopwords:                                   # only queries the model if word is not a stopword
        try:
            print("Querying model for: " + s)
            most_similar = model.wv.most_similar(s, topn=num_of_words)
        except KeyError:
            word_found = False
            for ret_word in remove_substitution(word,removeall=True):
                try:
                    most_similar = model.wv.most_similar(ret_word,topn=num_of_words)
                    word_found = True                               # becomes true if exception does not occur
                    print("Word stripped to: " + ret_word)
                    break
                except KeyError:
                    try:
                        ret_word = remove_nums_after_s(ret_word)
                        ret_word = remove_nums_before_s(ret_word)
                        most_similar = model.wv.most_similar(ret_word,topn=num_of_words)
                        word_found = True
                        print("Word stripped to: " + ret_word)
                        break
                    except KeyError:
                        continue

            if not word_found:
                print(word + " - Could not find result for this word in NLP model. This word will be skipped")

    for item in most_similar:
        if item[1] > 0.5:                                      # if likeness > 0.5
            if firstupper:
                results.append(capitalise_first_char(item[0]))
            else:
                results.append(item[0])

    return results
Example No. 50
from gensim.models import Word2Vec
import sys

model_name = sys.argv[1]

model = Word2Vec.load(model_name)
# Get wordvectors for all words in vocabulary.
word_vectors = model.wv.syn0

print(model)
print(word_vectors.shape)

words = list(model.wv.vocab)
print(words)

print(model['yellow'])

# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
# model.wv['word']

#sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
#			['this', 'is', 'the', 'second', 'sentence'],
#			['yet', 'another', 'sentence'],
#			['one', 'more', 'sentence'],
#			['and', 'the', 'final', 'sentence']]
# train model
#model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
# print(model)
# summarize vocabulary
# words = list(model.wv.vocab)
Example No. 51
subjects_eng = [
    'location_traffic_convenience', 'location_distance_from_business_district',
    'location_easy_to_find', 'service_wait_time', 'service_waiters_attitude',
    'service_parking_convenience', 'service_serving_speed', 'price_level',
    'price_cost_effective', 'price_discount', 'environment_decoration',
    'environment_noise', 'environment_space', 'environment_cleaness',
    'dish_portion', 'dish_taste', 'dish_look', 'dish_recommendation',
    'others_overall_experience', 'others_willing_to_consume_again'
]
subjects_dict = OrderedDict(zip(subjects_eng, subjects))
config.max_aspect_len = len(subjects[0].split(' '))

print('building word vector...')
print('building the word2id mapping')
config.w2v_path, config.w2v_word2id_txt = '../data/all_content_no_punc_100_8_mc2_fnl.w2v', 'word2id_map_mc2_fnl'
w2v = Word2Vec.load(config.w2v_path)
print(len(w2v.wv.vocab))
word2id, max_context_len, max_aspect_len = get_word2id(
    '../data/',
    subjects,
    'ai_challenger_sentiment_analysis_trainingset_20180816/sentiment_analysis_trainingset',
    'ai_challenger_sentiment_analysis_validationset_20180816/sentiment_analysis_validationset',
    'ai_challenger_sentiment_analysis_testa_20180816/sentiment_analysis_testa',
    w2v,
    pre_processed=False,
    save_fname=config.w2v_word2id_txt,
    suffix='_cut_word_rst.txt')
print(len(word2id), max_context_len, max_aspect_len)
config.max_context_len, config.max_aspect_len = max_context_len, max_aspect_len

print('encoding the reviews')
Example No. 52
            word1, word2 = h.split("-", 1)
            h2 = word1 + "_" + word2
        else:
            h2 = h
        res[h] = retrieve_vector(sem_model, h2)
    return res


######################################################################
############################ Main script #############################
######################################################################

spanish = False

if spanish:
    new_model = Word2Vec.load("spanish_word2vec.model")
    train_hypos = "SemEval2018-Task9/training/data/1C.spanish.training.data.txt"
    train_hypers = "SemEval2018-Task9/training/gold/1C.spanish.training.gold.txt"
    test_hypos = "SemEval2018-Task9/test/data/1C.spanish.test.data.txt"
    test_hypers = "SemEval2018-Task9/test/gold/1C.spanish.test.gold.txt"
    output_file = "SemEval2018-Task9/output_spanish.txt"
else:
    new_model = Word2Vec.load("english_word2vec.model")
    train_hypos = "SemEval2018-Task9/training/data/1A.english.training.data.txt"
    train_hypers = "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
    test_hypos = "SemEval2018-Task9/test/data/1A.english.test.data.txt"
    test_hypers = "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"
    output_file = "SemEval2018-Task9/output.txt"

hypos_hypers_train = get_hypos_hypers(train_hypos, train_hypers)
hypos_train = hypos_hypers_train.keys()
Example No. 53
 def get_w2v_model(self):
     # read in the previously trained word2vec model
     self.embedding = Word2Vec.load(self.w2v_path)
     self.embedding_dim = self.embedding.vector_size
Example No. 54
import re
from gensim.models.phrases import Phrases, Phraser
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#Load model and start predicting.
model = Word2Vec.load("word2vec_TDS_14.model")


def tsnescatterplot(model, word, list_names):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.
    """
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)

    # gets list of most similar words
    close_words = model.wv.most_similar([word])
Example No. 55
            summary = summary + sentences[sent_id[0]] + '\n<br>'
            sent_count += 1
        # intermediate2 = updateScores(sent_id[0])
        else:
            sent_count += 1

        print(len(summary.split()))
        print(sent_count)
        print(len(sentences) - 1)
        print('============================')

    return [summary, len(input_str.split()), len(summary.split())]


app = Flask(__name__)  #create the Flask app
model = Word2Vec.load('resources/w2v_300cleaned_phrases+word2vec.bin')


@app.route('/summarize', methods=['POST'])
def query_example():
    if request.method == 'POST':
        if request.form.get('method') == '1':  # form values arrive as strings
            res = summarize_base(request.form.get('input'),
                                 request.form.get('len'))
        else:
            res = summarize_advanced(request.form.get('input'),
                                     request.form.get('len'))
        return jsonify({
            'summary': res[0],
            'input_count': res[1],
            'summary_count': res[2]
Example No. 56
import jieba_fast as jieba
from gensim.models import Word2Vec
import re, os
import codecs
import editdistance
import warnings
warnings.filterwarnings("ignore") # 忽略keras带来的满屏警告


mode = 0
char_size = 128
maxlen = 256
min_count = 16


word2vec = Word2Vec.load('../word2vec_baike/word2vec_baike')


id2word = {i+1:j for i,j in enumerate(word2vec.wv.index2word)}
word2id = {j:i for i,j in id2word.items()}
word2vec = word2vec.wv.syn0
word_size = word2vec.shape[1]
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])

for w in word2id:
    if w not in jieba.dt.FREQ:
        jieba.add_word(w)


def tokenize(s):
    return jieba.lcut(s, HMM=False)
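Row 0 of the padded word2vec matrix above is all zeros, so unseen words can fall back to it; a minimal sketch mapping a sentence to ids (the helper name is mine):

def sent2id(s):
    # unknown words map to the zero-padding row at index 0
    return [word2id.get(w, 0) for w in tokenize(s)]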
Example No. 57
 def __init__(self, path_to_word2vec_model):
     self.model = Word2Vec.load(path_to_word2vec_model)
Example No. 58
import numpy as np
from gensim.models import Word2Vec
import _pickle as cPickle

np.random.seed(123)
# item_emb contains neural embeddings of all items
with open('./item_embed', 'rb') as f:
    item_list = cPickle.load(f)

new_list = np.load('tot_x_seq1.npy')
model = Word2Vec.load('word2vec_model')
y_tot = np.load('tot_y1.npy')
 
# Appending product vectors with their neural embeddings
def w2v_data_ext(new_list):
    w2v_data = []
    for i in range(0, len(new_list)):
        seq_vec = []
        for j in range(0, len(new_list[i])):
            q = np.concatenate([model.wv[new_list[i][j]], item_list[new_list[i][j]]])
            if len(q) == 82:
                seq_vec.append(q)
        if len(seq_vec) == 5:
            w2v_data.append(seq_vec)
    return np.asarray(w2v_data)


# Train and test split
def train_test_split(w2v_data, y_tot):
    train_x = w2v_data[0:69349]
    test_x = w2v_data[69349:]
Example No. 59
from gensim.models import Word2Vec

model = Word2Vec.load("./model/predictNewsTitle.model")
print(model["苹果"])
print(model.similarity('范冰冰', '李晨'))
Example No. 60
 def load_from(self, file):
     self.model = Word2Vec.load(file)
     self.model_initialized = True